Repository: NVIDIA/apex
Branch: master
Commit: ba32a259b7aa
Files: 419
Total size: 3.6 MB

Directory structure:
gitextract_8yaiblk9/

├── .clang-format
├── .git-blame-ignore-revs
├── .github/
│   └── ISSUE_TEMPLATE/
│       └── bug_report.md
├── .gitignore
├── .gitmodules
├── .nojekyll
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── apex/
│   ├── __init__.py
│   ├── _autocast_utils.py
│   ├── contrib/
│   │   ├── __init__.py
│   │   ├── bottleneck/
│   │   │   ├── __init__.py
│   │   │   ├── bottleneck.py
│   │   │   ├── halo_exchangers.py
│   │   │   └── test.py
│   │   ├── clip_grad/
│   │   │   ├── __init__.py
│   │   │   └── clip_grad.py
│   │   ├── conv_bias_relu/
│   │   │   ├── __init__.py
│   │   │   └── conv_bias_relu.py
│   │   ├── csrc/
│   │   │   ├── bottleneck/
│   │   │   │   └── bottleneck.cpp
│   │   │   ├── conv_bias_relu/
│   │   │   │   └── conv_bias_relu.cpp
│   │   │   ├── cudnn_gbn/
│   │   │   │   ├── cudnn_gbn.cpp
│   │   │   │   ├── norm_sample.cpp
│   │   │   │   └── norm_sample.h
│   │   │   ├── fmha/
│   │   │   │   ├── fmha_api.cpp
│   │   │   │   └── src/
│   │   │   │       ├── fmha/
│   │   │   │       │   ├── gemm.h
│   │   │   │       │   ├── gmem_tile.h
│   │   │   │       │   ├── kernel_traits.h
│   │   │   │       │   ├── mask.h
│   │   │   │       │   ├── smem_tile.h
│   │   │   │       │   ├── softmax.h
│   │   │   │       │   └── utils.h
│   │   │   │       ├── fmha.h
│   │   │   │       ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu
│   │   │   │       ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu
│   │   │   │       ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu
│   │   │   │       ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu
│   │   │   │       ├── fmha_dgrad_kernel_1xN_reload.h
│   │   │   │       ├── fmha_dgrad_kernel_1xN_reload_nl.h
│   │   │   │       ├── fmha_fill.cu
│   │   │   │       ├── fmha_fprop_fp16_128_64_kernel.sm80.cu
│   │   │   │       ├── fmha_fprop_fp16_256_64_kernel.sm80.cu
│   │   │   │       ├── fmha_fprop_fp16_384_64_kernel.sm80.cu
│   │   │   │       ├── fmha_fprop_fp16_512_64_kernel.sm80.cu
│   │   │   │       ├── fmha_fprop_kernel_1xN.h
│   │   │   │       ├── fmha_kernel.h
│   │   │   │       ├── fmha_noloop_reduce.cu
│   │   │   │       └── fmha_utils.h
│   │   │   ├── focal_loss/
│   │   │   │   ├── focal_loss_cuda.cpp
│   │   │   │   └── focal_loss_cuda_kernel.cu
│   │   │   ├── gpu_direct_storage/
│   │   │   │   ├── gds.cpp
│   │   │   │   ├── gds.h
│   │   │   │   └── gds_pybind.cpp
│   │   │   ├── group_norm/
│   │   │   │   ├── group_norm_nhwc.cpp
│   │   │   │   ├── group_norm_nhwc.h
│   │   │   │   ├── group_norm_nhwc_bwd_one_pass.h
│   │   │   │   ├── group_norm_nhwc_bwd_one_pass_kernel.cuh
│   │   │   │   ├── group_norm_nhwc_bwd_two_pass.cu
│   │   │   │   ├── group_norm_nhwc_fwd_one_pass.h
│   │   │   │   ├── group_norm_nhwc_fwd_one_pass_kernel.cuh
│   │   │   │   ├── group_norm_nhwc_fwd_two_pass.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_10.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_112.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_12.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_120.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_128.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_14.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_16.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_160.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_20.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_24.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_26.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_28.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_30.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_32.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_4.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_40.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_42.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_48.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_56.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_60.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_64.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_70.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_8.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_80.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_84.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_96.cu
│   │   │   │   ├── group_norm_nhwc_one_pass_98.cu
│   │   │   │   ├── group_norm_nhwc_op.cpp
│   │   │   │   ├── macros.h
│   │   │   │   └── traits.h
│   │   │   ├── group_norm_v2/
│   │   │   │   ├── generate_gn_cuda_inst.py
│   │   │   │   ├── gn.cpp
│   │   │   │   ├── gn.hpp
│   │   │   │   ├── gn_cuda.cu
│   │   │   │   ├── gn_cuda_host_template.cuh
│   │   │   │   ├── gn_cuda_inst_1024_1280.cu
│   │   │   │   ├── gn_cuda_inst_1024_1920.cu
│   │   │   │   ├── gn_cuda_inst_1024_320.cu
│   │   │   │   ├── gn_cuda_inst_1024_640.cu
│   │   │   │   ├── gn_cuda_inst_1024_960.cu
│   │   │   │   ├── gn_cuda_inst_256_1280.cu
│   │   │   │   ├── gn_cuda_inst_256_1920.cu
│   │   │   │   ├── gn_cuda_inst_256_2560.cu
│   │   │   │   ├── gn_cuda_inst_256_640.cu
│   │   │   │   ├── gn_cuda_inst_4096_320.cu
│   │   │   │   ├── gn_cuda_inst_4096_640.cu
│   │   │   │   ├── gn_cuda_inst_4096_960.cu
│   │   │   │   ├── gn_cuda_inst_64_1280.cu
│   │   │   │   ├── gn_cuda_inst_64_2560.cu
│   │   │   │   ├── gn_cuda_kernel.cuh
│   │   │   │   ├── gn_dispatch_hw_c.hpp
│   │   │   │   ├── gn_utils.cpp
│   │   │   │   └── gn_utils.hpp
│   │   │   ├── groupbn/
│   │   │   │   ├── batch_norm.cu
│   │   │   │   ├── batch_norm.h
│   │   │   │   ├── batch_norm_add_relu.cu
│   │   │   │   ├── batch_norm_add_relu.h
│   │   │   │   ├── cuda_utils.h
│   │   │   │   ├── interface.cpp
│   │   │   │   ├── ipc.cu
│   │   │   │   └── nhwc_batch_norm_kernel.h
│   │   │   ├── index_mul_2d/
│   │   │   │   ├── index_mul_2d_cuda.cpp
│   │   │   │   └── index_mul_2d_cuda_kernel.cu
│   │   │   ├── layer_norm/
│   │   │   │   ├── ln.h
│   │   │   │   ├── ln_api.cpp
│   │   │   │   ├── ln_bwd_kernels.cuh
│   │   │   │   ├── ln_bwd_semi_cuda_kernel.cu
│   │   │   │   ├── ln_fwd_cuda_kernel.cu
│   │   │   │   ├── ln_fwd_kernels.cuh
│   │   │   │   ├── ln_kernel_traits.h
│   │   │   │   └── ln_utils.cuh
│   │   │   ├── multihead_attn/
│   │   │   │   ├── additive_masked_softmax_dropout_cuda.cu
│   │   │   │   ├── dropout.cuh
│   │   │   │   ├── encdec_multihead_attn_cuda.cu
│   │   │   │   ├── encdec_multihead_attn_norm_add_cuda.cu
│   │   │   │   ├── layer_norm.cuh
│   │   │   │   ├── masked_softmax_dropout_cuda.cu
│   │   │   │   ├── multihead_attn_frontend.cpp
│   │   │   │   ├── philox.cuh
│   │   │   │   ├── self_multihead_attn_bias_additive_mask_cuda.cu
│   │   │   │   ├── self_multihead_attn_bias_cuda.cu
│   │   │   │   ├── self_multihead_attn_cuda.cu
│   │   │   │   ├── self_multihead_attn_norm_add_cuda.cu
│   │   │   │   ├── softmax.cuh
│   │   │   │   └── strided_batched_gemm.cuh
│   │   │   ├── nccl_allocator/
│   │   │   │   └── NCCLAllocator.cpp
│   │   │   ├── nccl_p2p/
│   │   │   │   ├── nccl_p2p.cpp
│   │   │   │   ├── nccl_p2p_cuda.cu
│   │   │   │   ├── nccl_p2p_cuda.cuh
│   │   │   │   ├── nccl_version.cpp
│   │   │   │   └── nccl_version_check.cu
│   │   │   ├── optimizers/
│   │   │   │   ├── fused_adam_cuda.cpp
│   │   │   │   ├── fused_adam_cuda_kernel.cu
│   │   │   │   ├── fused_lamb_cuda.cpp
│   │   │   │   ├── fused_lamb_cuda_kernel.cu
│   │   │   │   ├── multi_tensor_distopt_adam.cpp
│   │   │   │   ├── multi_tensor_distopt_adam_kernel.cu
│   │   │   │   ├── multi_tensor_distopt_lamb.cpp
│   │   │   │   └── multi_tensor_distopt_lamb_kernel.cu
│   │   │   ├── peer_memory/
│   │   │   │   ├── peer_memory.cpp
│   │   │   │   ├── peer_memory_cuda.cu
│   │   │   │   └── peer_memory_cuda.cuh
│   │   │   ├── transducer/
│   │   │   │   ├── transducer_joint.cpp
│   │   │   │   ├── transducer_joint_kernel.cu
│   │   │   │   ├── transducer_loss.cpp
│   │   │   │   └── transducer_loss_kernel.cu
│   │   │   └── xentropy/
│   │   │       ├── interface.cpp
│   │   │       └── xentropy_kernel.cu
│   │   ├── cudnn_gbn/
│   │   │   ├── __init__.py
│   │   │   └── batch_norm.py
│   │   ├── examples/
│   │   │   ├── gpu_direct_storage/
│   │   │   │   ├── benchmark_load.py
│   │   │   │   ├── benchmark_save.py
│   │   │   │   ├── example_load.py
│   │   │   │   └── example_save.py
│   │   │   ├── multihead_attn/
│   │   │   │   ├── func_test_multihead_attn.py
│   │   │   │   └── perf_test_multihead_attn.py
│   │   │   └── nccl_allocator/
│   │   │       ├── allreduce.py
│   │   │       ├── cache.py
│   │   │       ├── change_cuda_allocator.py
│   │   │       └── toy_ddp.py
│   │   ├── fmha/
│   │   │   ├── __init__.py
│   │   │   └── fmha.py
│   │   ├── focal_loss/
│   │   │   ├── __init__.py
│   │   │   └── focal_loss.py
│   │   ├── gpu_direct_storage/
│   │   │   ├── README.md
│   │   │   └── __init__.py
│   │   ├── group_norm/
│   │   │   ├── __init__.py
│   │   │   └── group_norm.py
│   │   ├── groupbn/
│   │   │   ├── __init__.py
│   │   │   └── batch_norm.py
│   │   ├── index_mul_2d/
│   │   │   ├── __init__.py
│   │   │   └── index_mul_2d.py
│   │   ├── layer_norm/
│   │   │   ├── __init__.py
│   │   │   └── layer_norm.py
│   │   ├── multihead_attn/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── encdec_multihead_attn.py
│   │   │   ├── encdec_multihead_attn_func.py
│   │   │   ├── fast_encdec_multihead_attn_func.py
│   │   │   ├── fast_encdec_multihead_attn_norm_add_func.py
│   │   │   ├── fast_self_multihead_attn_func.py
│   │   │   ├── fast_self_multihead_attn_norm_add_func.py
│   │   │   ├── mask_softmax_dropout_func.py
│   │   │   ├── self_multihead_attn.py
│   │   │   └── self_multihead_attn_func.py
│   │   ├── nccl_allocator/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── nccl_allocator.py
│   │   ├── openfold_triton/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── _layer_norm_backward_kernels.py
│   │   │   ├── _layer_norm_config_ampere.py
│   │   │   ├── _layer_norm_config_hopper.py
│   │   │   ├── _layer_norm_forward_kernels.py
│   │   │   ├── _mha_kernel.py
│   │   │   ├── fused_adam_swa.py
│   │   │   ├── layer_norm.py
│   │   │   └── mha.py
│   │   ├── optimizers/
│   │   │   ├── __init__.py
│   │   │   ├── distributed_fused_adam.py
│   │   │   ├── distributed_fused_lamb.py
│   │   │   ├── fp16_optimizer.py
│   │   │   ├── fused_adam.py
│   │   │   ├── fused_lamb.py
│   │   │   └── fused_sgd.py
│   │   ├── peer_memory/
│   │   │   ├── __init__.py
│   │   │   ├── peer_halo_exchanger_1d.py
│   │   │   └── peer_memory.py
│   │   ├── sparsity/
│   │   │   ├── COPYRIGHT
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── asp.py
│   │   │   ├── permutation_lib.py
│   │   │   ├── permutation_search_kernels/
│   │   │   │   ├── CUDA_kernels/
│   │   │   │   │   └── permutation_search_kernels.cu
│   │   │   │   ├── __init__.py
│   │   │   │   ├── call_permutation_search_kernels.py
│   │   │   │   ├── channel_swap.py
│   │   │   │   ├── exhaustive_search.py
│   │   │   │   └── permutation_utilities.py
│   │   │   ├── permutation_tests/
│   │   │   │   ├── README.md
│   │   │   │   ├── ablation_studies.sh
│   │   │   │   ├── permutation_test.py
│   │   │   │   ├── runtime_table.sh
│   │   │   │   └── unstructured_study.sh
│   │   │   ├── sparse_masklib.py
│   │   │   └── test/
│   │   │       ├── checkpointing_test_part1.py
│   │   │       ├── checkpointing_test_part2.py
│   │   │       ├── checkpointing_test_reference.py
│   │   │       ├── test_permutation_application.py
│   │   │       └── toy_problem.py
│   │   ├── test/
│   │   │   ├── __init__.py
│   │   │   ├── bottleneck/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_bottleneck_module.py
│   │   │   ├── clip_grad/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_clip_grad.py
│   │   │   ├── conv_bias_relu/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_conv_bias_relu.py
│   │   │   ├── cudnn_gbn/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_cudnn_gbn_with_two_gpus.py
│   │   │   ├── fmha/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_fmha.py
│   │   │   ├── focal_loss/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_focal_loss.py
│   │   │   ├── fused_dense/
│   │   │   │   └── test_fused_dense.py
│   │   │   ├── group_norm/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_group_norm.py
│   │   │   ├── index_mul_2d/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_index_mul_2d.py
│   │   │   ├── layer_norm/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_fast_layer_norm.py
│   │   │   ├── multihead_attn/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── test_encdec_multihead_attn.py
│   │   │   │   ├── test_encdec_multihead_attn_norm_add.py
│   │   │   │   ├── test_fast_self_multihead_attn_bias.py
│   │   │   │   ├── test_mha_fused_softmax.py
│   │   │   │   ├── test_self_multihead_attn.py
│   │   │   │   └── test_self_multihead_attn_norm_add.py
│   │   │   ├── openfold_triton/
│   │   │   │   ├── test_fused_adam_swa.py
│   │   │   │   ├── test_openfold_mha.py
│   │   │   │   └── test_sync_triton_auto_tune_cache_across_gpus.py
│   │   │   ├── optimizers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── test_dist_adam.py
│   │   │   │   └── test_distributed_fused_lamb.py
│   │   │   ├── peer_memory/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_peer_halo_exchange_module.py
│   │   │   ├── transducer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── test_transducer_joint.py
│   │   │   │   └── test_transducer_loss.py
│   │   │   └── xentropy/
│   │   │       ├── __init__.py
│   │   │       └── test_label_smoothing.py
│   │   ├── torchsched/
│   │   │   ├── __init__.py
│   │   │   ├── backend.py
│   │   │   ├── config.py
│   │   │   ├── inductor/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _utils.py
│   │   │   │   ├── event.py
│   │   │   │   ├── graph.py
│   │   │   │   ├── scheduler.py
│   │   │   │   └── wrapper.py
│   │   │   ├── ops/
│   │   │   │   ├── __init__.py
│   │   │   │   └── layer_norm.py
│   │   │   └── passes/
│   │   │       ├── __init__.py
│   │   │       └── pre_grad_passes.py
│   │   ├── transducer/
│   │   │   ├── __init__.py
│   │   │   ├── _transducer_ref.py
│   │   │   └── transducer.py
│   │   └── xentropy/
│   │       ├── __init__.py
│   │       └── softmax_xentropy.py
│   ├── distributed_testing/
│   │   ├── __init__.py
│   │   ├── _ucc_util.py
│   │   └── distributed_test_base.py
│   ├── fused_dense/
│   │   ├── __init__.py
│   │   └── fused_dense.py
│   ├── mlp/
│   │   ├── __init__.py
│   │   └── mlp.py
│   ├── multi_tensor_apply/
│   │   ├── __init__.py
│   │   └── multi_tensor_apply.py
│   ├── normalization/
│   │   ├── __init__.py
│   │   └── fused_layer_norm.py
│   └── optimizers/
│       ├── __init__.py
│       ├── fused_adagrad.py
│       ├── fused_adam.py
│       ├── fused_lamb.py
│       ├── fused_mixed_precision_lamb.py
│       ├── fused_novograd.py
│       └── fused_sgd.py
├── csrc/
│   ├── amp_C_frontend.cpp
│   ├── flatten_unflatten.cpp
│   ├── fused_dense.cpp
│   ├── fused_dense_cuda.cu
│   ├── layer_norm_cuda.cpp
│   ├── layer_norm_cuda_kernel.cu
│   ├── megatron/
│   │   ├── fused_rotary_positional_embedding.cpp
│   │   ├── fused_rotary_positional_embedding.h
│   │   ├── fused_rotary_positional_embedding_cuda.cu
│   │   ├── fused_weight_gradient_dense.cpp
│   │   ├── fused_weight_gradient_dense_16bit_prec_cuda.cu
│   │   ├── fused_weight_gradient_dense_cuda.cu
│   │   ├── generic_scaled_masked_softmax.cpp
│   │   ├── generic_scaled_masked_softmax.h
│   │   ├── generic_scaled_masked_softmax_cuda.cu
│   │   ├── scaled_masked_softmax.cpp
│   │   ├── scaled_masked_softmax.h
│   │   ├── scaled_masked_softmax_cuda.cu
│   │   ├── scaled_softmax.cpp
│   │   ├── scaled_softmax_cuda.cu
│   │   ├── scaled_upper_triang_masked_softmax.cpp
│   │   ├── scaled_upper_triang_masked_softmax.h
│   │   └── scaled_upper_triang_masked_softmax_cuda.cu
│   ├── mlp.cpp
│   ├── mlp_cuda.cu
│   ├── multi_tensor_adagrad.cu
│   ├── multi_tensor_adam.cu
│   ├── multi_tensor_apply.cuh
│   ├── multi_tensor_axpby_kernel.cu
│   ├── multi_tensor_l2norm_kernel.cu
│   ├── multi_tensor_l2norm_kernel_mp.cu
│   ├── multi_tensor_l2norm_scale_kernel.cu
│   ├── multi_tensor_lamb.cu
│   ├── multi_tensor_lamb_mp.cu
│   ├── multi_tensor_lamb_stage_1.cu
│   ├── multi_tensor_lamb_stage_2.cu
│   ├── multi_tensor_novograd.cu
│   ├── multi_tensor_scale_kernel.cu
│   ├── multi_tensor_sgd_kernel.cu
│   ├── static_switch.h
│   ├── syncbn.cpp
│   ├── type_shim.h
│   ├── update_scale_hysteresis.cu
│   └── welford.cu
├── docs/
│   ├── Makefile
│   └── source/
│       ├── _static/
│       │   └── css/
│       │       └── pytorch_theme.css
│       ├── _templates/
│       │   └── layout.html
│       ├── conf.py
│       ├── index.rst
│       ├── layernorm.rst
│       └── optimizers.rst
├── examples/
│   ├── README.md
│   ├── dcgan/
│   │   ├── README.md
│   │   └── main_amp.py
│   ├── docker/
│   │   ├── Dockerfile
│   │   └── README.md
│   ├── imagenet/
│   │   ├── README.md
│   │   └── main_amp.py
│   └── simple/
│       └── distributed/
│           ├── README.md
│           ├── distributed_data_parallel.py
│           └── run.sh
├── pyproject.toml
├── requirements.txt
├── requirements_dev.txt
├── setup.py
└── tests/
    ├── L0/
    │   ├── run_fused_layer_norm/
    │   │   └── test_fused_layer_norm.py
    │   ├── run_mlp/
    │   │   └── test_mlp.py
    │   ├── run_optimizers/
    │   │   ├── __init__.py
    │   │   ├── test_adam.py
    │   │   ├── test_fused_novograd.py
    │   │   ├── test_fused_optimizer.py
    │   │   └── test_lamb.py
    │   └── run_test.py
    ├── L1/
    │   ├── common/
    │   │   ├── compare.py
    │   │   ├── main_amp.py
    │   │   └── run_test.sh
    │   ├── cross_product/
    │   │   └── run.sh
    │   └── cross_product_distributed/
    │       └── run.sh
    ├── distributed/
    │   ├── DDP/
    │   │   ├── ddp_race_condition_test.py
    │   │   └── run_race_test.sh
    │   ├── amp_master_params/
    │   │   ├── amp_master_params.py
    │   │   ├── compare.py
    │   │   └── run.sh
    │   └── synced_batchnorm/
    │       ├── python_single_gpu_unit_test.py
    │       ├── single_gpu_unit_test.py
    │       ├── test_batchnorm1d.py
    │       ├── test_groups.py
    │       ├── two_gpu_test_different_batch_size.py
    │       ├── two_gpu_unit_test.py
    │       └── unit_test.sh
    └── docker_extension_builds/
        └── run.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: .clang-format
================================================
# Start with a built-in style and modify it
BasedOnStyle: Google

# Overrides
ColumnLimit: 120


================================================
FILE: .git-blame-ignore-revs
================================================
# Commits to ignore in git-blame
# These commits are bulk formatting or refactoring changes that should be skipped when viewing blame history

# Add pre-commit and GitHub Actions workflow for it (#1949)
1f20398756f0eeba37d6887a2d3f65e0687ec94f
# Remove github actions config of pre-commit in favor of pre-commit ci (#1958)
27e0e8951352d9d58c88b2895cd8f2c752bda963
# Enable Ruff pre-commit hooks (#1957)
16fadfe71c0d57312351c2d8b056251a0c8ce1ef


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve apex
title: ''
labels: bug
assignees: ''

---

**Describe the Bug**

**Minimal Steps/Code to Reproduce the Bug**
<!--
Please list the *minimal* steps or provide a code snippet for us to be able to reproduce the bug.

A helpful guide on on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
--> 

**Expected Behavior**
<!-- A clear and concise description of what you expected to happen. -->

**Environment**
<!-- OS, version of Python, CUDA, PyTorch; collect these via `python -m torch.utils.collect_env` -->


================================================
FILE: .gitignore
================================================
apex.egg-info
dist
build
docs/build
*~
__pycache__
.vscode

# Copied from https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/


================================================
FILE: .gitmodules
================================================
[submodule "apex/contrib/csrc/multihead_attn/cutlass"]
	path = apex/contrib/csrc/multihead_attn/cutlass
	url = https://github.com/NVIDIA/cutlass.git
	branch = v1.2.0
[submodule "apex/contrib/csrc/cudnn-frontend"]
	path = apex/contrib/csrc/cudnn-frontend
	url = https://github.com/NVIDIA/cudnn-frontend.git


================================================
FILE: .nojekyll
================================================


================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v22.1.1 # Or pin to your preferred clang-format version
  hooks:
  - id: clang-format
    files: \.(c|h|cpp|hpp|proto|cu|cuh)$
    exclude: ^(apex/contrib/csrc/multihead_attn/cutlass|apex/contrib/csrc/cudnn-frontend)/

- repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.15.6
  hooks:
  - id: ruff-check
    args: ["--fix"]
  - id: ruff-format
    types_or: [python]
    exclude: "examples"


================================================
FILE: LICENSE
================================================
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

================================================
FILE: README.md
================================================
# Introduction

This repository holds NVIDIA-maintained utilities to streamline mixed precision and distributed training in Pytorch.
Some of the code here will be included in upstream Pytorch eventually.
The intent of Apex is to make up-to-date utilities available to users as quickly as possible.

# Installation
Each [`apex.contrib`](./apex/contrib) module requires one or more install options other than `--cpp_ext` and `--cuda_ext`.
Note that contrib modules do not necessarily support stable PyTorch releases, some of them might only be compatible with nightlies.

## Containers
NVIDIA PyTorch Containers are available on NGC: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch.
The containers come with all the custom extensions available at the moment. 

See [the NGC documentation](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html) for details such as:
- how to pull a container
- how to run a pulled container
- release notes

## From Source

To install Apex from source, we recommend using the nightly Pytorch obtainable from https://github.com/pytorch/pytorch.

The latest stable release obtainable from https://pytorch.org should also work.

We recommend installing [`Ninja`](https://ninja-build.org/) to make compilation faster.

### Linux

For performance and full functionality, we recommend installing Apex with CUDA and C++ extensions using environment variables:

#### Using Environment Variables (Recommended)

```bash
git clone https://github.com/NVIDIA/apex
cd apex
# Build with core extensions (cpp and cuda)
APEX_CPP_EXT=1 APEX_CUDA_EXT=1 pip install -v --no-build-isolation .

# To build with additional extensions, specify them with environment variables
APEX_CPP_EXT=1 APEX_CUDA_EXT=1 APEX_FAST_MULTIHEAD_ATTN=1 APEX_FUSED_CONV_BIAS_RELU=1 pip install -v --no-build-isolation .

# To build all contrib extensions at once
APEX_CPP_EXT=1 APEX_CUDA_EXT=1 APEX_ALL_CONTRIB_EXT=1 pip install -v --no-build-isolation .
```

To reduce the build time, parallel building can be enabled:

```bash
NVCC_APPEND_FLAGS="--threads 4" APEX_PARALLEL_BUILD=8 APEX_CPP_EXT=1 APEX_CUDA_EXT=1 pip install -v --no-build-isolation .
```

When CPU cores or memory are limited, the `--parallel` option is generally preferred over `--threads`. See [pull#1882](https://github.com/NVIDIA/apex/pull/1882) for more details.

#### Using Command-Line Flags (Legacy Method)

The traditional command-line flags are still supported:

```bash
# Using pip config-settings (pip >= 23.1)
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./

# For older pip versions
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./

# To build with additional extensions
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" ./
```

#### Python-Only Build

APEX also supports a Python-only build via:
```bash
pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./
```
A Python-only build omits:
- Fused kernels required to use `apex.optimizers.FusedAdam`.
- Fused kernels required to use `apex.normalization.FusedLayerNorm` and `apex.normalization.FusedRMSNorm`.
- Fused kernels that improve the performance and numerical stability of `apex.parallel.SyncBatchNorm`.
- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.


### [Experimental] Windows
`pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" .` may work if you were able to build Pytorch from source
on your system. A Python-only build via `pip install -v --no-cache-dir .` is more likely to work.  
If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.


## Custom C++/CUDA Extensions and Install Options

If a requirement of a module is not met, then it will not be built.

|  Module Name  |  Environment Variable  |  Install Option  |  Misc  |
|---------------|------------------------|------------------|--------|
|  `apex_C`     |  `APEX_CPP_EXT=1`      |  `--cpp_ext`     | |
|  `amp_C`      |  `APEX_CUDA_EXT=1`     |  `--cuda_ext`    | |
|  `syncbn`     |  `APEX_CUDA_EXT=1`     |  `--cuda_ext`    | |
|  `fused_layer_norm_cuda`  |  `APEX_CUDA_EXT=1`  |  `--cuda_ext`  | [`apex.normalization`](./apex/normalization) |
|  `mlp_cuda`   |  `APEX_CUDA_EXT=1`     |  `--cuda_ext`    | |
|  `scaled_upper_triang_masked_softmax_cuda`  |  `APEX_CUDA_EXT=1`  |  `--cuda_ext`  | |
|  `generic_scaled_masked_softmax_cuda`  |  `APEX_CUDA_EXT=1`  |  `--cuda_ext`  | |
|  `scaled_masked_softmax_cuda`  |  `APEX_CUDA_EXT=1`  |  `--cuda_ext`  | |
|  `fused_weight_gradient_mlp_cuda`  |  `APEX_CUDA_EXT=1`  |  `--cuda_ext`  | Requires CUDA>=11 |
|  `permutation_search_cuda`  |  `APEX_PERMUTATION_SEARCH=1`  |  `--permutation_search`  | [`apex.contrib.sparsity`](./apex/contrib/sparsity)  |
|  `bnp`        |  `APEX_BNP=1`          |  `--bnp`         |  [`apex.contrib.groupbn`](./apex/contrib/groupbn) |
|  `xentropy`   |  `APEX_XENTROPY=1`     |  `--xentropy`    |  [`apex.contrib.xentropy`](./apex/contrib/xentropy)  |
|  `focal_loss_cuda`  |  `APEX_FOCAL_LOSS=1`  |  `--focal_loss`  |  [`apex.contrib.focal_loss`](./apex/contrib/focal_loss)  |
|  `fused_index_mul_2d`  |  `APEX_INDEX_MUL_2D=1`  |  `--index_mul_2d`  |  [`apex.contrib.index_mul_2d`](./apex/contrib/index_mul_2d)  |
|  `fused_adam_cuda`  |  `APEX_DEPRECATED_FUSED_ADAM=1`  |  `--deprecated_fused_adam`  |  [`apex.contrib.optimizers`](./apex/contrib/optimizers)  |
|  `fused_lamb_cuda`  |  `APEX_DEPRECATED_FUSED_LAMB=1`  |  `--deprecated_fused_lamb`  |  [`apex.contrib.optimizers`](./apex/contrib/optimizers)  |
|  `fast_layer_norm`  |  `APEX_FAST_LAYER_NORM=1`  |  `--fast_layer_norm`  |  [`apex.contrib.layer_norm`](./apex/contrib/layer_norm). different from `fused_layer_norm` |
|  `fmhalib`    |  `APEX_FMHA=1`         |  `--fmha`        |  [`apex.contrib.fmha`](./apex/contrib/fmha)  |
|  `fast_multihead_attn`  |  `APEX_FAST_MULTIHEAD_ATTN=1`  |  `--fast_multihead_attn`  |  [`apex.contrib.multihead_attn`](./apex/contrib/multihead_attn)  |
|  `transducer_joint_cuda`  |  `APEX_TRANSDUCER=1`  |  `--transducer`  |  [`apex.contrib.transducer`](./apex/contrib/transducer)  |
|  `transducer_loss_cuda`   |  `APEX_TRANSDUCER=1`  |  `--transducer`  |  [`apex.contrib.transducer`](./apex/contrib/transducer)  |
|  `cudnn_gbn_lib`  |  `APEX_CUDNN_GBN=1`  |  `--cudnn_gbn`  | Requires cuDNN>=8.5, [`apex.contrib.cudnn_gbn`](./apex/contrib/cudnn_gbn) |
|  `peer_memory_cuda`  |  `APEX_PEER_MEMORY=1`  |  `--peer_memory`  |  [`apex.contrib.peer_memory`](./apex/contrib/peer_memory)  |
|  `nccl_p2p_cuda`  |  `APEX_NCCL_P2P=1`  |  `--nccl_p2p`  | Requires NCCL >= 2.10, [`apex.contrib.nccl_p2p`](./apex/contrib/nccl_p2p)  |
|  `fast_bottleneck`  |  `APEX_FAST_BOTTLENECK=1`  |  `--fast_bottleneck`  |  Requires `peer_memory_cuda` and `nccl_p2p_cuda`, [`apex.contrib.bottleneck`](./apex/contrib/bottleneck) |
|  `fused_conv_bias_relu`  |  `APEX_FUSED_CONV_BIAS_RELU=1`  |  `--fused_conv_bias_relu`  | Requires cuDNN>=8.4, [`apex.contrib.conv_bias_relu`](./apex/contrib/conv_bias_relu) |
|  `distributed_adam_cuda`  |  `APEX_DISTRIBUTED_ADAM=1`  |  `--distributed_adam`  |  [`apex.contrib.optimizers`](./apex/contrib/optimizers)  |
|  `distributed_lamb_cuda`  |  `APEX_DISTRIBUTED_LAMB=1`  |  `--distributed_lamb`  |  [`apex.contrib.optimizers`](./apex/contrib/optimizers)  |
|  `_apex_nccl_allocator`  |  `APEX_NCCL_ALLOCATOR=1`  |  `--nccl_allocator`  | Requires NCCL >= 2.19, [`apex.contrib.nccl_allocator`](./apex/contrib/nccl_allocator)  |
|  `_apex_gpu_direct_storage`  |  `APEX_GPU_DIRECT_STORAGE=1`  |  `--gpu_direct_storage`  |  [`apex.contrib.gpu_direct_storage`](./apex/contrib/gpu_direct_storage)  |

You can also build all contrib extensions at once by setting `APEX_ALL_CONTRIB_EXT=1`.


================================================
FILE: apex/__init__.py
================================================
import logging
import warnings

# May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
import torch

# For optimizers and normalization there is no Python fallback.
# Absence of cuda backend is a hard error.
# I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
# to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
# so they expect those backends to be available, but for some reason they actually aren't
# available (for example because they built improperly in a way that isn't revealed until
# load time) the error message is timely and visible.
from . import optimizers
from . import normalization


__all__ = ["optimizers", "normalization"]


def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int) -> bool:
    cudnn_available = torch.backends.cudnn.is_available()
    cudnn_version = torch.backends.cudnn.version() if cudnn_available else None
    if not (cudnn_available and (cudnn_version >= required_cudnn_version)):
        warnings.warn(
            f"`{global_option}` depends on cuDNN {required_cudnn_version} or later, "
            f"but {'cuDNN is not available' if not cudnn_available else cudnn_version}"
        )
        return False
    return True


class DeprecatedFeatureWarning(FutureWarning):
    pass


def deprecated_warning(msg: str) -> None:
    if (
        not torch.distributed.is_available
        or not torch.distributed.is_initialized()
        or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)
    ):
        warnings.warn(msg, DeprecatedFeatureWarning)


================================================
FILE: apex/_autocast_utils.py
================================================
from typing import Optional, Sequence

import torch


__all__ = ["_cast_if_autocast_enabled"]


def _get_autocast_dtypes() -> Sequence[torch.dtype]:
    if torch.cuda.is_bf16_supported():
        return [torch.half, torch.bfloat16]
    return [torch.half]


def _get_current_dtype(dtype: Optional[torch.dtype] = None) -> torch.dtype:
    if not torch.is_autocast_enabled():
        return torch.float or dtype
    else:
        return torch.get_autocast_gpu_dtype()


def _cast_if_autocast_enabled(*args):
    if not torch.is_autocast_enabled():
        return args
    else:
        return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())


================================================
FILE: apex/contrib/__init__.py
================================================


================================================
FILE: apex/contrib/bottleneck/__init__.py
================================================
from .bottleneck import Bottleneck, SpatialBottleneck
from .halo_exchangers import (
    HaloExchangerNoComm,
    HaloExchangerAllGather,
    HaloExchangerSendRecv,
    HaloExchangerPeer,
)


================================================
FILE: apex/contrib/bottleneck/bottleneck.py
================================================
import functools as func

import torch
from torch import nn

from apex import check_cudnn_version_and_warn
import fast_bottleneck
import nccl_p2p_cuda as inc


assert check_cudnn_version_and_warn(__name__, 8400)


def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu"):
    weight_tensor_nchw = tensor
    nn.init.kaiming_uniform_(weight_tensor_nchw, a=a, mode=mode, nonlinearity=nonlinearity)


def compute_scale_bias_one(nhwc, weight, bias, running_mean, running_var, w_scale, w_bias):
    scale = weight * running_var.rsqrt()
    bias = bias - running_mean * scale
    w_scale.copy_(scale)
    w_bias.copy_(bias)


def compute_scale_bias_method(nhwc, args):
    for arg in args:
        # arg is tuple of (weight, bias, running_mean, running_var, w_scale, w_bias)
        compute_scale_bias_one(nhwc, *arg)


class FrozenBatchNorm2d(torch.jit.ScriptModule):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    @torch.jit.script_method
    def get_scale_bias(self, nhwc):
        # type: (bool) -> List[torch.Tensor]
        scale = self.weight * self.running_var.rsqrt()
        bias = self.bias - self.running_mean * scale
        if nhwc:
            scale = scale.reshape(1, 1, 1, -1)
            bias = bias.reshape(1, 1, 1, -1)
        else:
            scale = scale.reshape(1, -1, 1, 1)
            bias = bias.reshape(1, -1, 1, 1)
        return scale, bias

    @torch.jit.script_method
    def forward(self, x):
        scale, bias = self.get_scale_bias(False)
        return x * scale + bias


@torch.jit.script
def drelu_dscale1(grad_o, output, scale1):
    relu_mask = output > 0
    dx_relu = relu_mask * grad_o
    g1 = dx_relu * scale1
    return g1, dx_relu


@torch.jit.script
def drelu_dscale2(grad_o, output, scale1, scale2):
    relu_mask = output > 0
    dx_relu = relu_mask * grad_o
    g1 = dx_relu * scale1
    g2 = dx_relu * scale2
    return g1, g2


class BottleneckFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, nhwc, stride_1x1, scale, bias, x, *conv):
        # TODO: clean up order of tensors
        args = [x, *conv[0:3], *scale[0:3], *bias[0:3]]
        ctx.downsample = len(conv) > 3
        if ctx.downsample:
            args.append(conv[3])
            args.append(scale[3])
            args.append(bias[3])

        # weight buffers are always in nhwc while shape can be nhwc or channels_last
        # here we pass in flag and let c++ handle it
        # alternatively, we can put all sizes into a fixed format and pass it in
        outputs = fast_bottleneck.forward(nhwc, stride_1x1, args)
        ctx.save_for_backward(*(args + outputs))
        # save relu outputs for drelu
        ctx.nhwc = nhwc
        ctx.stride_1x1 = stride_1x1
        return outputs[2]

    # backward relu is not exposed, MUL with mask used now
    # only support dgrad
    @staticmethod
    def backward(ctx, grad_o):
        outputs = ctx.saved_tensors[-3:]

        if ctx.downsample:
            grad_conv3, grad_conv4 = drelu_dscale2(
                grad_o, outputs[2], ctx.saved_tensors[6], ctx.saved_tensors[11]
            )
        else:
            grad_conv3, grad_conv4 = drelu_dscale1(grad_o, outputs[2], ctx.saved_tensors[6])

        # create input vector for backward
        t_list = [*ctx.saved_tensors[0:10]]
        t_list.append(grad_conv3)
        t_list.append(grad_conv4)

        # outputs used for wgrad and generating drelu mask
        t_list.append(outputs[0])
        t_list.append(outputs[1])

        # in case there is downsample
        if ctx.downsample:
            t_list.append(ctx.saved_tensors[10])

        grads = fast_bottleneck.backward(ctx.nhwc, ctx.stride_1x1, t_list)

        return (None, None, None, None, *grads)


bottleneck_function = BottleneckFunction.apply


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        groups=groups,
        bias=False,
        dilation=dilation,
    )


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class Bottleneck(torch.nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
    # here we put it at 1x1

    def __init__(
        self,
        in_channels,
        bottleneck_channels,
        out_channels,
        stride=1,
        groups=1,
        dilation=1,
        norm_func=None,
        use_cudnn=False,
        explicit_nhwc=False,
    ):
        super(Bottleneck, self).__init__()
        if groups != 1:
            raise RuntimeError("Only support groups == 1")
        if dilation != 1:
            raise RuntimeError("Only support dilation == 1")
        if norm_func == None:
            norm_func = FrozenBatchNorm2d
        else:
            raise RuntimeError("Only support frozen BN now.")

        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                conv1x1(in_channels, out_channels, stride),
                norm_func(out_channels),
            )
        else:
            self.downsample = None

        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
        self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
        self.conv3 = conv1x1(bottleneck_channels, out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

        self.bn1 = norm_func(bottleneck_channels)
        self.bn2 = norm_func(bottleneck_channels)
        self.bn3 = norm_func(out_channels)
        self.w_scale = None

        self.use_cudnn = use_cudnn

        # setup conv weights
        self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
        if self.downsample is not None:
            self.w_conv.append(self.downsample[0].weight)

        # init weight in nchw format before possible transpose
        for w in self.w_conv:
            kaiming_uniform_(w, a=1)

        # TODO: prevent unsupported case usage
        # support cases
        #                 native      cudnn
        # normal             yes         no
        # channel_last       yes        yes
        # explicit_nhwc       no        yes
        self.explicit_nhwc = explicit_nhwc
        if self.explicit_nhwc:
            for p in self.parameters():
                with torch.no_grad():
                    p.data = p.data.permute(0, 2, 3, 1).contiguous()

        return

    # Returns single callable that recomputes scale and bias for all frozen batch-norms.
    # This method must be called before cuda graphing.
    # The callable it returns can be called anytime.
    # Calling this method will prevent these from being computed every forward call.
    def get_scale_bias_callable(self):
        self.w_scale, self.w_bias, args = [], [], []
        batch_norms = [self.bn1, self.bn2, self.bn3]
        if self.downsample is not None:
            batch_norms.append(self.downsample[1])
        for bn in batch_norms:
            s = torch.empty_like(bn.weight)
            b = torch.empty_like(s)
            args.append((bn.weight, bn.bias, bn.running_mean, bn.running_var, s, b))
            if self.explicit_nhwc:
                self.w_scale.append(s.reshape(1, 1, 1, -1))
                self.w_bias.append(b.reshape(1, 1, 1, -1))
            else:
                self.w_scale.append(s.reshape(1, -1, 1, 1))
                self.w_bias.append(b.reshape(1, -1, 1, 1))
        return func.partial(compute_scale_bias_method, self.explicit_nhwc, args)

    def forward(self, x):
        if self.use_cudnn:
            if self.w_scale is None:
                # calculate scale/bias from registered buffers
                # TODO: make this better
                s1, b1 = self.bn1.get_scale_bias(self.explicit_nhwc)
                s2, b2 = self.bn2.get_scale_bias(self.explicit_nhwc)
                s3, b3 = self.bn3.get_scale_bias(self.explicit_nhwc)
                w_scale = [s1, s2, s3]
                w_bias = [b1, b2, b3]
                if self.downsample is not None:
                    s4, b4 = self.downsample[1].get_scale_bias(self.explicit_nhwc)
                    w_scale.append(s4)
                    w_bias.append(b4)
                out = bottleneck_function(
                    self.explicit_nhwc, self.stride, w_scale, w_bias, x, *self.w_conv
                )
            else:
                out = bottleneck_function(
                    self.explicit_nhwc,
                    self.stride,
                    self.w_scale,
                    self.w_bias,
                    x,
                    *self.w_conv,
                )
            return out

        if self.explicit_nhwc:
            raise RuntimeError("explicit nhwc with native ops is not supported.")

        # fallback to native ops
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class SpatialBottleneckFunction(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        spatial_group_size,
        spatial_group_rank,
        spatial_communicator,
        spatial_halo_exchanger,
        spatial_method,
        use_delay_kernel,
        explicit_nhwc,
        stride_1x1,
        scale,
        bias,
        thresholdTop,
        thresholdBottom,
        x,
        *conv,
    ):
        if spatial_group_size > 1:
            stream1 = spatial_halo_exchanger.stream1
            stream2 = spatial_halo_exchanger.stream2
            stream3 = spatial_halo_exchanger.stream3

        # TODO: clean up order of tensors
        args = [x, *conv[0:3], *scale[0:3], *bias[0:3]]
        ctx.downsample = len(conv) > 3
        if ctx.downsample:
            args.append(conv[3])
            args.append(scale[3])
            args.append(bias[3])

        # weight buffers are always in explicit_nhwc while shape can be explicit_nhwc or channels_last
        # here we pass in flag and let c++ handle it
        # alternatively, we can put all sizes into a fixed format and pass it in
        outputs = fast_bottleneck.forward_init(explicit_nhwc, stride_1x1, args)
        fast_bottleneck.forward_out1(explicit_nhwc, stride_1x1, args, outputs)

        if spatial_group_size > 1:
            out1 = outputs[0]
            if explicit_nhwc:
                N, Hs, W, C = list(out1.shape)
                memory_format = torch.contiguous_format
                out1_pad = torch.empty([N, Hs + 2, W, C], dtype=out1.dtype, device="cuda")
            else:
                N, C, Hs, W = list(out1.shape)
                memory_format = (
                    torch.channels_last
                    if out1.is_contiguous(memory_format=torch.channels_last)
                    else torch.contiguous_format
                )
                out1_pad = torch.empty(
                    [N, C, Hs + 2, W],
                    dtype=out1.dtype,
                    device="cuda",
                    memory_format=memory_format,
                )
            stream1.wait_stream(torch.cuda.current_stream())
            if spatial_method != 2:
                stream3.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(stream1):
                if explicit_nhwc:
                    top_out1_halo = out1_pad[:, :1, :, :]
                    btm_out1_halo = out1_pad[:, Hs + 1 : Hs + 2, :, :]
                    spatial_halo_exchanger.left_right_halo_exchange(
                        out1[:, :1, :, :],
                        out1[:, Hs - 1 :, :, :],
                        top_out1_halo,
                        btm_out1_halo,
                    )
                else:
                    top_out1_halo = out1_pad[:, :, :1, :]
                    btm_out1_halo = out1_pad[:, :, Hs + 1 : Hs + 2, :]
                    spatial_halo_exchanger.left_right_halo_exchange(
                        out1[:, :, :1, :],
                        out1[:, :, Hs - 1 :, :],
                        top_out1_halo,
                        btm_out1_halo,
                    )
            if spatial_method == 1:
                # overlap mid convolution with halo transfer
                if spatial_group_rank < spatial_group_size - 1:
                    stream2.wait_stream(stream1)
                    with torch.cuda.stream(stream2):
                        if explicit_nhwc:
                            btm_fat_halo = torch.empty(
                                (N, 3, W, C), dtype=out1.dtype, device=out1.device
                            )
                            btm_fat_halo[:, 0:2, :, :].copy_(out1[:, Hs - 2 :, :, :])
                            btm_fat_halo[:, 2:, :, :].copy_(btm_out1_halo)
                        else:
                            btm_fat_halo = torch.empty(
                                (N, C, 3, W), dtype=out1.dtype, device=out1.device
                            )
                            btm_fat_halo[:, :, 0:2, :].copy_(out1[:, :, Hs - 2 :, :])
                            btm_fat_halo[:, :, 2:, :].copy_(btm_out1_halo)
                        btm_out2 = fast_bottleneck.forward_out2_halo(
                            explicit_nhwc, btm_fat_halo, args
                        )
                if spatial_group_rank > 0:
                    with torch.cuda.stream(stream1):
                        if explicit_nhwc:
                            top_fat_halo = torch.empty(
                                (N, 3, W, C), dtype=out1.dtype, device=out1.device
                            )
                            top_fat_halo[:, :1, :, :].copy_(top_out1_halo)
                            top_fat_halo[:, 1:3, :, :].copy_(out1[:, :2, :, :])
                        else:
                            top_fat_halo = torch.empty(
                                (N, C, 3, W), dtype=out1.dtype, device=out1.device
                            )
                            top_fat_halo[:, :, :1, :].copy_(top_out1_halo)
                            top_fat_halo[:, :, 1:3, :].copy_(out1[:, :, :2, :])
                        top_out2 = fast_bottleneck.forward_out2_halo(
                            explicit_nhwc, top_fat_halo, args
                        )
                if use_delay_kernel:
                    inc.add_delay(10)
            elif spatial_method != 2 and spatial_method != 3:
                assert False, "spatial_method must be 1, 2 or 3"

        if spatial_group_size <= 1:
            fast_bottleneck.forward_out2(explicit_nhwc, stride_1x1, args, outputs)
        elif spatial_method == 1:
            fast_bottleneck.forward_out2(explicit_nhwc, stride_1x1, args, outputs)
            with torch.cuda.stream(stream3):
                if explicit_nhwc:
                    out1_pad[:, 1 : Hs + 1, :, :].copy_(out1)
                else:
                    out1_pad[:, :, 1 : Hs + 1, :].copy_(out1)
        elif spatial_method == 2:
            # wait for halo transfer to finish before doing a full convolution of padded x
            if explicit_nhwc:
                out1_pad[:, 1 : Hs + 1, :, :].copy_(out1)
            else:
                out1_pad[:, :, 1 : Hs + 1, :].copy_(out1)
            torch.cuda.current_stream().wait_stream(stream1)
            fast_bottleneck.forward_out2_pad(explicit_nhwc, stride_1x1, args, outputs, out1_pad)
        elif spatial_method == 3:
            fast_bottleneck.forward_out2_mask(
                explicit_nhwc, stride_1x1, args, outputs, thresholdTop, thresholdBottom
            )
            with torch.cuda.stream(stream3):
                if explicit_nhwc:
                    out1_pad[:, 1 : Hs + 1, :, :].copy_(out1)
                else:
                    out1_pad[:, :, 1 : Hs + 1, :].copy_(out1)

        # compute halo cells for outputs[1] (out2)
        if spatial_group_size > 1:
            out2 = outputs[1]
            if explicit_nhwc:
                top_out2_halo = out2[:, :1, :, :]
                btm_out2_halo = out2[:, Hs - 1 :, :, :]
            else:
                top_out2_halo = out2[:, :, :1, :]
                btm_out2_halo = out2[:, :, Hs - 1 :, :]
            if spatial_method == 1:
                if spatial_group_rank > 0:
                    torch.cuda.current_stream().wait_stream(stream1)
                    top_out2_halo.copy_(top_out2)
                if spatial_group_rank < spatial_group_size - 1:
                    torch.cuda.current_stream().wait_stream(stream2)
                    btm_out2_halo.copy_(btm_out2)
            elif spatial_method == 3:
                # Note
                # out2 halo correction cannot overlap with anything since it has
                # to wait for out2_mask to finish, but itself has to finish before
                # the first kernel of _forward_rest can launch.
                # At least we can overlap the two halo correction kernels.
                if spatial_group_rank < spatial_group_size - 1:
                    stream2.wait_stream(stream1)  # wait for halo transfers to finish
                    stream2.wait_stream(
                        torch.cuda.current_stream()
                    )  # wait for *_out2_mask to finish
                    with torch.cuda.stream(stream2):
                        w1by3 = args[2][:, 2:3, :, :].clone()
                        btm_out1_halo = btm_out1_halo.clone()
                        btm_out2 = fast_bottleneck.forward_out2_halo_corr(
                            explicit_nhwc,
                            btm_out1_halo,
                            args,
                            w1by3,
                            btm_out2_halo.clone(),
                        )
                        btm_out2_halo.copy_(btm_out2)
                if spatial_group_rank > 0:
                    stream1.wait_stream(
                        torch.cuda.current_stream()
                    )  # wait for *_out2_mask to finish
                    with torch.cuda.stream(stream1):
                        w1by3 = args[2][:, :1, :, :].clone()
                        top_out1_halo = top_out1_halo.clone()
                        top_out2 = fast_bottleneck.forward_out2_halo_corr(
                            explicit_nhwc,
                            top_out1_halo,
                            args,
                            w1by3,
                            top_out2_halo.clone(),
                        )
                        top_out2_halo.copy_(top_out2)
                if spatial_group_rank < spatial_group_size - 1:
                    torch.cuda.current_stream().wait_stream(stream2)
                if spatial_group_rank > 0:
                    torch.cuda.current_stream().wait_stream(stream1)

        fast_bottleneck.forward_rest(explicit_nhwc, stride_1x1, args, outputs)
        # save halos for backward pass
        if spatial_group_size > 1:
            if spatial_method != 2:
                # make sure copy of mid-section of out1 into out1_pad is done before exiting
                torch.cuda.current_stream().wait_stream(stream3)
            ctx.save_for_backward(
                *(
                    args
                    + outputs
                    + [
                        out1_pad,
                    ]
                )
            )
        else:
            ctx.save_for_backward(*(args + outputs))
        # save relu outputs for drelu
        ctx.explicit_nhwc = explicit_nhwc
        ctx.stride_1x1 = stride_1x1
        ctx.spatial_group_size = spatial_group_size
        if spatial_group_size > 1:
            ctx.spatial_group_rank = spatial_group_rank
            ctx.spatial_halo_exchanger = spatial_halo_exchanger
            ctx.spatial_method = spatial_method
            ctx.use_delay_kernel = use_delay_kernel
            ctx.thresholdTop = thresholdTop
            ctx.thresholdBottom = thresholdBottom
            ctx.stream1 = stream1
            ctx.stream2 = stream2
            ctx.stream3 = stream3
        return outputs[2]

    # backward relu is not exposed, MUL with mask used now
    # only support dgrad
    @staticmethod
    def backward(ctx, grad_o):
        if ctx.spatial_group_size > 1:
            out1_pad = ctx.saved_tensors[-1]
            outputs = ctx.saved_tensors[-4:-1]
        else:
            outputs = ctx.saved_tensors[-3:]

        if ctx.downsample:
            grad_conv3, grad_conv4 = drelu_dscale2(
                grad_o, outputs[2], ctx.saved_tensors[6], ctx.saved_tensors[11]
            )
        else:
            grad_conv3, grad_conv4 = drelu_dscale1(grad_o, outputs[2], ctx.saved_tensors[6])

        # create input vector for backward
        t_list = [*ctx.saved_tensors[0:10]]
        t_list.append(grad_conv3)
        t_list.append(grad_conv4)

        # outputs used for wgrad and generating drelu mask
        t_list.append(outputs[0])
        t_list.append(outputs[1])

        # in case there is downsample
        if ctx.downsample:
            t_list.append(ctx.saved_tensors[10])

        grads = fast_bottleneck.backward_init(ctx.explicit_nhwc, ctx.stride_1x1, t_list)
        wgrad3_stream = torch.cuda.Stream()
        wgrad3_stream.wait_stream(torch.cuda.current_stream())
        grad_out2 = fast_bottleneck.backward_grad_out2(
            ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads
        )
        wgrad2_stream = torch.cuda.Stream()
        wgrad2_stream.wait_stream(torch.cuda.current_stream())
        # do halo exchange of grad_out2 here
        # compute halo cells for grad_out1
        if ctx.spatial_group_size > 1:
            if ctx.explicit_nhwc:
                N, Hs, W, C = list(grad_out2.shape)
            else:
                N, C, Hs, W = list(grad_out2.shape)
            relu1 = t_list[12]
            ctx.stream1.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(ctx.stream1):
                top_halo, btm_halo = ctx.spatial_halo_exchanger.left_right_halo_exchange(
                    grad_out2[:, :1, :, :], grad_out2[:, Hs - 1 :, :, :]
                )
                # copy halos to send buffer
            if ctx.spatial_method == 1 or ctx.spatial_method == 2:
                # 1 -> halo recompute approach
                # 2 -> wait for concatenated halos, then do single conv on full input (not implemented yet for bprop)
                if ctx.spatial_group_rank < ctx.spatial_group_size - 1:
                    ctx.stream2.wait_stream(ctx.stream1)
                    with torch.cuda.stream(ctx.stream2):
                        if ctx.explicit_nhwc:
                            btm_fat_halo = torch.empty(
                                (N, 3, W, C),
                                dtype=grad_out2.dtype,
                                device=grad_out2.device,
                            )
                            btm_fat_halo[:, :2, :, :].copy_(grad_out2[:, Hs - 2 :, :, :])
                            btm_fat_halo[:, 2:, :, :].copy_(btm_halo)
                            btm_fat_relu_halo = torch.empty(
                                (N, 3, W, C),
                                dtype=grad_out2.dtype,
                                device=grad_out2.device,
                            )
                            btm_fat_relu_halo[:, :2, :, :].copy_(relu1[:, Hs - 2 :, :, :])
                            btm_fat_relu_halo[:, 2:, :, :].zero_()
                        else:
                            btm_fat_halo = torch.empty(
                                (N, C, 3, W),
                                dtype=grad_out2.dtype,
                                device=grad_out2.device,
                            )
                            btm_fat_halo[:, :, :2, :].copy_(grad_out2[:, :, Hs - 2 :, :])
                            btm_fat_halo[:, :, 2:, :].copy_(btm_halo)
                            btm_fat_relu_halo = torch.empty(
                                (N, C, 3, W),
                                dtype=grad_out2.dtype,
                                device=grad_out2.device,
                            )
                            btm_fat_relu_halo[:, :, :2, :].copy_(relu1[:, :, Hs - 2 :, :])
                            btm_fat_relu_halo[:, :, 2:, :].zero_()
                        btm_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo(
                            ctx.explicit_nhwc,
                            ctx.stride_1x1,
                            t_list,
                            grads,
                            btm_fat_halo,
                            btm_fat_relu_halo,
                        )
                        if ctx.explicit_nhwc:
                            btm_grad_out1_halo = btm_grad_out1_halo[:, 1:2, :, :]
                        else:
                            btm_grad_out1_halo = btm_grad_out1_halo[:, :, 1:2, :]
                if ctx.spatial_group_rank > 0:
                    with torch.cuda.stream(ctx.stream1):
                        if ctx.explicit_nhwc:
                            top_fat_halo = torch.empty(
                                (N, 3, W, C),
                                dtype=grad_out2.dtype,
                                device=grad_out2.device,
                            )
                            top_fat_halo[:, :1, :, :].copy_(top_halo)
                            top_fat_halo[:, 1:, :, :].copy_(grad_out2[:, :2, :, :])
                            top_fat_relu_halo = torch.empty(
                                (N, 3, W, C),
                                dtype=grad_out2.dtype,
                                device=grad_out2.device,
                            )
                            top_fat_relu_halo[:, :1, :, :].zero_()
                            top_fat_relu_halo[:, 1:, :, :].copy_(relu1[:, :2, :, :])
                        else:
                            top_fat_halo = torch.empty(
                                (N, C, 3, W),
                                dtype=grad_out2.dtype,
                                device=grad_out2.device,
                            )
                            top_fat_halo[:, :, :1, :].copy_(top_halo)
                            top_fat_halo[:, :, 1:, :].copy_(grad_out2[:, :, :2, :])
                            top_fat_relu_halo = torch.empty(
                                (N, C, 3, W),
                                dtype=grad_out2.dtype,
                                device=grad_out2.device,
                            )
                            top_fat_relu_halo[:, :, :1, :].zero_()
                            top_fat_relu_halo[:, :, 1:, :].copy_(relu1[:, :, :2, :])
                        top_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo(
                            ctx.explicit_nhwc,
                            ctx.stride_1x1,
                            t_list,
                            grads,
                            top_fat_halo,
                            top_fat_relu_halo,
                        )
                        if ctx.explicit_nhwc:
                            top_grad_out1_halo = top_grad_out1_halo[:, 1:2, :, :]
                        else:
                            top_grad_out1_halo = top_grad_out1_halo[:, :, 1:2, :]
                if ctx.use_delay_kernel:
                    inc.add_delay(10)
            elif ctx.spatial_method != 3:
                assert False, "spatial_method must be 1, 2 or 3"

        # compute grad_out1 for internal cells
        if ctx.spatial_group_size <= 1 or ctx.spatial_method == 1 or ctx.spatial_method == 2:
            grad_out1 = fast_bottleneck.backward_grad_out1(
                ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2
            )
        elif ctx.spatial_group_size > 1 and ctx.spatial_method == 3:
            grad_out1 = fast_bottleneck.backward_grad_out1_mask(
                ctx.explicit_nhwc,
                ctx.stride_1x1,
                t_list,
                grads,
                grad_out2,
                ctx.thresholdTop,
                ctx.thresholdBottom,
            )

        # apply halo cells to grad_out1
        if ctx.spatial_group_size > 1:
            w = t_list[2]
            z = t_list[4]
            relu1 = t_list[12]
            # print("w.shape = %s, z.shape = %s, relu1.shape = %s" % (str(list(w.shape)), str(list(z.shape)), str(list(relu1.shape))))
            if ctx.spatial_method == 1 or ctx.spatial_method == 2:
                if ctx.spatial_group_rank < ctx.spatial_group_size - 1:
                    torch.cuda.current_stream().wait_stream(ctx.stream2)
                    if ctx.explicit_nhwc:
                        grad_out1[:, Hs - 1 :, :, :].copy_(btm_grad_out1_halo)
                    else:
                        grad_out1[:, :, Hs - 1 :, :].copy_(btm_grad_out1_halo)
                    # print("ctx.spatial_group_rank = %d, apply grad_out1 btm halo (grad_out1.shape = %s)" % (ctx.spatial_group_rank, str(list(grad_out1.shape))))
                if ctx.spatial_group_rank > 0:
                    torch.cuda.current_stream().wait_stream(ctx.stream1)
                    if ctx.explicit_nhwc:
                        grad_out1[:, :1, :, :].copy_(top_grad_out1_halo)
                    else:
                        grad_out1[:, :, :1, :].copy_(top_grad_out1_halo)
                    # print("ctx.spatial_group_rank = %d, apply grad_out1 top halo (grad_out1.shape = %s)" % (ctx.spatial_group_rank, str(list(grad_out1.shape))))
            elif ctx.spatial_method == 3:
                if ctx.spatial_group_rank < ctx.spatial_group_size - 1:
                    if ctx.explicit_nhwc:
                        btm_relu_halo = relu1[:, Hs - 1 :, :, :].clone()
                        btm_grad_out1 = grad_out1[:, Hs - 1 :, :, :]
                    else:
                        btm_relu_halo = relu1[:, :, Hs - 1 :, :].clone()
                        btm_grad_out1 = grad_out1[:, :, Hs - 1 :, :]
                    w1by3 = w[:, :1, :, :].clone()
                    ctx.stream2.wait_stream(ctx.stream1)  # wait for halo transfers to finish
                    ctx.stream2.wait_stream(
                        torch.cuda.current_stream()
                    )  # wait for backward_grad_out1_mask to finish before launching halo correction kernel
                    with torch.cuda.stream(ctx.stream2):
                        btm_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo_corr(
                            ctx.explicit_nhwc,
                            ctx.stride_1x1,
                            t_list,
                            w1by3,
                            grads,
                            btm_halo,
                            btm_relu_halo,
                            btm_grad_out1.clone(),
                        )
                        btm_grad_out1.copy_(btm_grad_out1_halo)
                if ctx.spatial_group_rank > 0:
                    if ctx.explicit_nhwc:
                        top_relu_halo = relu1[:, :1, :, :].clone()
                        top_grad_out1 = grad_out1[:, :1, :, :]
                    else:
                        top_relu_halo = relu1[:, :, :1, :].clone()
                        top_grad_out1 = grad_out1[:, :, :1, :]
                    w1by3 = w[:, 2:, :, :].clone()
                    ctx.stream1.wait_stream(
                        torch.cuda.current_stream()
                    )  # wait for backward_grad_out1_mask to finish before launching halo correction kernel
                    with torch.cuda.stream(ctx.stream1):
                        top_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo_corr(
                            ctx.explicit_nhwc,
                            ctx.stride_1x1,
                            t_list,
                            w1by3,
                            grads,
                            top_halo,
                            top_relu_halo,
                            top_grad_out1.clone(),
                        )
                        top_grad_out1.copy_(top_grad_out1_halo)
                if ctx.spatial_group_rank < ctx.spatial_group_size - 1:
                    torch.cuda.current_stream().wait_stream(
                        ctx.stream2
                    )  # wait for halo correction to finish
                if ctx.spatial_group_rank > 0:
                    torch.cuda.current_stream().wait_stream(ctx.stream1)

        wgrad1_stream = torch.cuda.Stream()
        wgrad1_stream.wait_stream(torch.cuda.current_stream())
        fast_bottleneck.backward_rest(
            ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2, grad_out1
        )
        with torch.cuda.stream(wgrad3_stream):
            fast_bottleneck.backward_wgrad3(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads)
        with torch.cuda.stream(wgrad2_stream):
            if ctx.spatial_group_size > 1:
                fast_bottleneck.backward_wgrad2_pad(
                    ctx.explicit_nhwc,
                    ctx.stride_1x1,
                    t_list,
                    grads,
                    out1_pad,
                    grad_out2,
                )
            else:
                fast_bottleneck.backward_wgrad2(
                    ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2
                )
        with torch.cuda.stream(wgrad1_stream):
            fast_bottleneck.backward_wgrad1(
                ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out1
            )
        torch.cuda.current_stream().wait_stream(wgrad3_stream)
        torch.cuda.current_stream().wait_stream(wgrad2_stream)
        torch.cuda.current_stream().wait_stream(wgrad1_stream)

        return (
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            *grads,
        )


spatial_bottleneck_function = SpatialBottleneckFunction.apply


class SpatialBottleneck(torch.nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
    # here we put it at 1x1

    def __init__(
        self,
        in_channels,
        bottleneck_channels,
        out_channels,
        stride=1,
        groups=1,
        dilation=1,
        norm_func=None,
        use_cudnn=False,
        explicit_nhwc=False,
        spatial_parallel_args=None,
    ):
        super(SpatialBottleneck, self).__init__()
        if groups != 1:
            raise RuntimeError("Only support groups == 1")
        if dilation != 1:
            raise RuntimeError("Only support dilation == 1")
        if norm_func == None:
            norm_func = FrozenBatchNorm2d
        else:
            raise RuntimeError("Only support frozen BN now.")

        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                conv1x1(in_channels, out_channels, stride),
                norm_func(out_channels),
            )
        else:
            self.downsample = None

        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
        self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
        self.conv3 = conv1x1(bottleneck_channels, out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

        self.bn1 = norm_func(bottleneck_channels)
        self.bn2 = norm_func(bottleneck_channels)
        self.bn3 = norm_func(out_channels)
        self.w_scale = None

        self.use_cudnn = use_cudnn

        # setup conv weights
        self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
        if self.downsample is not None:
            self.w_conv.append(self.downsample[0].weight)

        # init weight in nchw format before possible transpose
        for w in self.w_conv:
            kaiming_uniform_(w, a=1)

        self.thresholdTop, self.thresholdBottom = None, None

        # TODO: prevent unsupported case usage
        # support cases
        #                 native      cudnn
        # normal             yes         no
        # channel_last       yes        yes
        # explicit_nhwc       no        yes
        self.explicit_nhwc = explicit_nhwc
        if self.explicit_nhwc:
            for p in self.parameters():
                with torch.no_grad():
                    p.data = p.data.permute(0, 2, 3, 1).contiguous()

        # spatial communicator
        if spatial_parallel_args is None:
            self.spatial_parallel_args = (1, 0, None, None, 0, False)
        else:
            self.spatial_parallel_args = spatial_parallel_args
        return

    # Returns single callable that recomputes scale and bias for all frozen batch-norms.
    # This method must be called before cuda graphing.
    # The callable it returns can be called anytime.
    # Calling this method will prevent these from being computed every forward call.
    def get_scale_bias_callable(self):
        self.w_scale, self.w_bias, args = [], [], []
        batch_norms = [self.bn1, self.bn2, self.bn3]
        if self.downsample is not None:
            batch_norms.append(self.downsample[1])
        for bn in batch_norms:
            s = torch.empty_like(bn.weight)
            b = torch.empty_like(s)
            args.append((bn.weight, bn.bias, bn.running_mean, bn.running_var, s, b))
            if self.explicit_nhwc:
                self.w_scale.append(s.reshape(1, 1, 1, -1))
                self.w_bias.append(b.reshape(1, 1, 1, -1))
            else:
                self.w_scale.append(s.reshape(1, -1, 1, 1))
                self.w_bias.append(b.reshape(1, -1, 1, 1))
        return func.partial(compute_scale_bias_method, self.explicit_nhwc, args)

    def forward(self, x):
        if self.use_cudnn:
            if self.thresholdTop is None:
                spatial_group_size, spatial_group_rank, _, _, _, _ = self.spatial_parallel_args
                if self.explicit_nhwc:
                    N, H, W, C = list(x.shape)
                else:
                    N, C, H, W = list(x.shape)
                self.thresholdTop = torch.tensor(
                    [1 if spatial_group_rank > 0 else 0],
                    dtype=torch.int32,
                    device="cuda",
                )
                self.thresholdBottom = torch.tensor(
                    [H - 2 if spatial_group_rank < spatial_group_size - 1 else H - 1],
                    dtype=torch.int32,
                    device="cuda",
                )

            if self.w_scale is None:
                # calculate scale/bias from registered buffers
                # TODO: make this better
                s1, b1 = self.bn1.get_scale_bias(self.explicit_nhwc)
                s2, b2 = self.bn2.get_scale_bias(self.explicit_nhwc)
                s3, b3 = self.bn3.get_scale_bias(self.explicit_nhwc)
                w_scale = [s1, s2, s3]
                w_bias = [b1, b2, b3]
                if self.downsample is not None:
                    s4, b4 = self.downsample[1].get_scale_bias(self.explicit_nhwc)
                    w_scale.append(s4)
                    w_bias.append(b4)
                out = spatial_bottleneck_function(
                    *self.spatial_parallel_args,
                    self.explicit_nhwc,
                    self.stride,
                    w_scale,
                    w_bias,
                    self.thresholdTop,
                    self.thresholdBottom,
                    x,
                    *self.w_conv,
                )
            else:
                out = spatial_bottleneck_function(
                    *self.spatial_parallel_args,
                    self.explicit_nhwc,
                    self.stride,
                    self.w_scale,
                    self.w_bias,
                    self.thresholdTop,
                    self.thresholdBottom,
                    x,
                    *self.w_conv,
                )
            return out

        if self.explicit_nhwc:
            raise RuntimeError("explicit nhwc with native ops is not supported.")

        # fallback to native ops
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


================================================
FILE: apex/contrib/bottleneck/halo_exchangers.py
================================================
import torch
import nccl_p2p_cuda as inc
import peer_memory_cuda as pm


# Communication free halo exchanger.
# NB! This halo exchanger does not exchange halos with neighbors as it should, it merely swaps the inputs
# NB! This is only useful for performance testing.
# NB! Do not use for actual production runs
class HaloExchanger(object):
    def __init__(self, ranks, rank_in_group):
        self.stream1 = torch.cuda.Stream()
        self.stream2 = torch.cuda.Stream()
        self.stream3 = torch.cuda.Stream()
        self.group_size = len(ranks)
        self.ranks = ranks
        self.rank_in_group = rank_in_group
        self.wrap_around_left_rank_in_group = (
            rank_in_group + self.group_size - 1
        ) % self.group_size
        self.wrap_around_right_rank_in_group = (rank_in_group + 1) % self.group_size
        self.left_rank = ranks[rank_in_group - 1] if rank_in_group > 0 else -1
        self.left_zero = True if rank_in_group == 0 else False
        self.right_rank = ranks[rank_in_group + 1] if rank_in_group < self.group_size - 1 else -1
        self.right_zero = True if rank_in_group == self.group_size - 1 else False


class HaloExchangerNoComm(HaloExchanger):
    def __init__(self, ranks, rank_in_group):
        super(HaloExchangerNoComm, self).__init__(ranks, rank_in_group)

    def left_right_halo_exchange(
        self,
        left_output_halo,
        right_output_halo,
        left_input_halo=None,
        right_input_halo=None,
    ):
        if left_input_halo is None:
            return right_output_halo, left_output_halo
        else:
            left_input_halo.copy_(right_output_halo)
            right_input_halo.copy_(left_output_halo)


class HaloExchangerAllGather(HaloExchanger):
    def __init__(self, ranks, rank_in_group, comm):
        super(HaloExchangerAllGather, self).__init__(ranks, rank_in_group)
        # self.comm must be NCCL process_group created with torch.distributed.new_group(ranks=ranks)
        self.comm = comm

    def left_right_halo_exchange(
        self,
        left_output_halo,
        right_output_halo,
        left_input_halo=None,
        right_input_halo=None,
    ):
        N, Hh, W, C = list(left_output_halo.shape)
        send_halos = torch.empty(
            (N, 2 * Hh, W, C),
            dtype=left_output_halo.dtype,
            device=left_output_halo.device,
        )
        send_halos[:, :Hh, :, :].copy_(left_output_halo)
        send_halos[:, Hh:, :, :].copy_(right_output_halo)
        all_halos = torch.empty(
            (N, 2 * Hh * self.group_size, W, C),
            dtype=left_output_halo.dtype,
            device=left_output_halo.device,
        )
        all_halos = [
            all_halos[:, i * 2 * Hh : (i + 1) * 2 * Hh, :, :] for i in range(self.group_size)
        ]
        torch.distributed.all_gather(all_halos, send_halos, group=self.comm, no_copy=True)
        ag_left_input_halo = all_halos[self.wrap_around_left_rank_in_group][:, Hh:, :, :]
        ag_right_input_halo = all_halos[self.wrap_around_right_rank_in_group][:, :Hh, :, :]
        if left_input_halo is None:
            if self.left_zero:
                ag_left_input_halo.zero_()
            if self.right_zero:
                ag_right_input_halo.zero_()
            return ag_left_input_halo, ag_right_input_halo
        else:
            if self.left_zero:
                left_input_halo.zero_()
            else:
                left_input_halo.copy_(ag_left_input_halo)
            if self.right_zero:
                right_input_halo.zero_()
            else:
                right_input_halo.copy_(ag_right_input_halo)


class HaloExchangerSendRecv(HaloExchanger):
    def __init__(self, ranks, rank_in_group):
        super(HaloExchangerSendRecv, self).__init__(ranks, rank_in_group)
        nccl_id = inc.get_unique_nccl_id(1).cuda()
        torch.distributed.broadcast(nccl_id, 0)
        nccl_id = nccl_id.cpu()
        print("%d :: nccl_id = %s" % (torch.distributed.get_rank(), str(nccl_id)))
        # Create another global nccl communicator in addition to the one created by torch.distributed.init_process_group("nccl")
        # This is unavoidable because the underlying NCCL communicator torch.distributed creates is a protected variable, hence
        # it cannot be accessed from another class.
        # TODO: Figure out a way to avoid creating a second global communicator
        assert torch.distributed.get_rank() == self.ranks[self.rank_in_group], (
            "ranks[%d](%d) != torch.distributed.get_rank()(%d)"
            % (
                self.rank_in_group,
                self.ranks[self.rank_in_group],
                torch.distributed.get_rank(),
            )
        )
        self.handle = inc.init_nccl_comm(
            nccl_id, torch.distributed.get_rank(), torch.distributed.get_world_size()
        )

    def left_right_halo_exchange(
        self,
        left_output_halo,
        right_output_halo,
        left_input_halo=None,
        right_input_halo=None,
    ):
        if left_input_halo is None:
            left_input_halo, right_input_halo = inc.left_right_halo_exchange(
                self.handle,
                self.left_rank,
                self.right_rank,
                left_output_halo,
                right_output_halo,
            )
            return left_input_halo, right_input_halo
        else:
            inc.left_right_halo_exchange_inplace(
                self.handle,
                self.left_rank,
                self.right_rank,
                left_output_halo,
                right_output_halo,
                left_input_halo,
                right_input_halo,
            )


class HaloExchangerPeer(HaloExchanger):
    def __init__(self, ranks, rank_in_group, peer_pool, explicit_nhwc, numSM=0):
        super(HaloExchangerPeer, self).__init__(ranks, rank_in_group)
        self.diagnostics = False
        self.explicit_nhwc = explicit_nhwc
        self.numSM = numSM
        self.peer_pool = peer_pool

    def _allocate_peer_tensor(self, halo):
        # Compute size in bytes
        # Note: Pad buffer so each CUDA block gets required buffer size
        size = 4 * halo.numel() * halo.element_size()
        size_per_block = 128 * 2 * 16  # 128 threads each require two 128b buffers
        size = (size + size_per_block - 1) // size_per_block * size_per_block

        # Construct dtype peer buffer with desired size
        shape = [1, 1, 1, size // halo.element_size()]
        return self.peer_pool.allocate_peer_tensors(shape, halo.dtype, False, True)

    def left_right_halo_exchange(
        self,
        left_output_halo,
        right_output_halo,
        left_input_halo=None,
        right_input_halo=None,
    ):
        inplace = False if left_input_halo is None and right_input_halo is None else True
        if not inplace:
            left_input_halo = torch.empty_like(right_output_halo)
            right_input_halo = torch.empty_like(left_output_halo)
        channels_last = (
            left_output_halo.is_contiguous(memory_format=torch.channels_last)
            and not self.explicit_nhwc
        )
        left_tx = self._allocate_peer_tensor(left_input_halo)
        right_tx = self._allocate_peer_tensor(right_input_halo)
        pm.push_pull_halos_1d(
            self.diagnostics,
            self.explicit_nhwc,
            self.numSM,
            self.rank_in_group,
            self.left_zero,
            left_output_halo,
            left_tx[self.rank_in_group],
            right_tx[self.wrap_around_left_rank_in_group],
            left_input_halo,
            self.right_zero,
            right_output_halo,
            right_tx[self.rank_in_group],
            left_tx[self.wrap_around_right_rank_in_group],
            right_input_halo,
        )
        if not inplace:
            return left_input_halo, right_input_halo


# Class that combines input volume with halos from neighbors (1d).
class HaloPadder:
    def __init__(self, halo_ex):
        self.halo_ex = halo_ex
        self.stream1 = torch.cuda.Stream()
        self.stream2 = torch.cuda.Stream()

    def __call__(self, y, half_halo, explicit_nhwc, H_split):
        channels_last = not explicit_nhwc and y.is_contiguous(memory_format=torch.channels_last)
        if explicit_nhwc:
            N, H, W, C = list(y.shape)
            if H_split:
                padded_shape = [N, H + 2 * half_halo, W, C]
                ypad = torch.empty(
                    shape=padded_shape,
                    dtype=y.dtype,
                    device=y.device,
                    memory_format=torch.contiguous_format,
                )
                yleft = ypad[:, :half_halo, :, :]
                ymid = ypad[:, half_halo : H + half_halo, :, :]
                yright = ypad[:, H + half_halo : H + 2 * half_halo, :, :]
                oleft = y[:, :half_halo, :, :]
                oright = y[:, H - half_halo :, :, :]
            else:
                padded_shape = [N, H, W + 2 * half_halo, C]
                ypad = torch.empty(
                    shape=padded_shape,
                    dtype=y.dtype,
                    device=y.device,
                    memory_format=torch.contiguous_format,
                )
                yleft = ypad[:, :, :half_halo, :]
                ymid = ypad[:, :, half_halo : W + half_halo, :]
                yright = ypad[:, :, W + half_halo : W + 2 * half_halo, :]
                oleft = y[:, :, :half_halo, :]
                oright = y[:, :, W - half_halo :, :]
        else:
            N, C, H, W = list(y.shape)
            if H_split:
                padded_shape = [N, C, H + 2 * half_halo, W]
                ypad = torch.empty(
                    shape=padded_shape,
                    dtype=y.dtype,
                    device=y.device,
                    memory_format=torch.channels_last,
                )
                yleft = ypad[:, :, :half_halo, :]
                ymid = ypad[:, :, half_halo : H + half_halo, :]
                yright = ypad[:, :, H + half_halo : H + 2 * half_halo, :]
                oleft = y[:, :, :half_halo, :]
                oright = y[:, :, H - half_halo :, :]
            else:
                padded_shape = [N, C, H, W + 2 * half_halo]
                ypad = torch.empty(
                    shape=padded_shape,
                    dtype=y.dtype,
                    device=y.device,
                    memory_format=torch.channels_last,
                )
                yleft = ypad[:, :, :, :half_halo]
                ymid = ypad[:, :, :, half_halo : W + half_halo]
                yright = ypad[:, :, :, W + half_halo : W + 2 * half_halo]
                oleft = y[:, :, :, :half_halo]
                oright = y[:, :, :, W - half_halo :]
        with torch.cuda.stream(self.stream1):
            self.halo_ex(oleft, oright, yleft, yright)
        with torch.cuda.stream(self.stream2):
            ymid.copy_(y)
        return ypad

    def wait(self):
        current_stream = torch.cuda.current_stream()
        current_stream.wait_stream(self.stream1)
        current_stream.wait_stream(self.stream2)


================================================
FILE: apex/contrib/bottleneck/test.py
================================================
import torch
from bottleneck import Bottleneck

torch.manual_seed(23337)

# use True to print layerwise sum for all outputs in reference code path
DEBUG = False  # True

for stride, o_channel in [(1, 32), (1, 128), (2, 32)]:
    print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel)
    a_ = torch.randn(17, 32, 28, 28)

    a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_()
    model = (
        Bottleneck(32, 8, o_channel, stride=stride)
        .cuda()
        .half()
        .to(memory_format=torch.channels_last)
    )

    # test model
    b = model(a)
    b.mean().backward()
    d_grad = a.grad.float()
    a.grad = None
    torch.cuda.synchronize()

    if DEBUG:
        print("[DEBUG] ref dx :", d_grad.sum().item())
        # print wgrad. we don't need to reset since later cpp print before accumulation
        for i, w in enumerate(model.w_conv):
            print("[DEBUG] ref wgrad{} :".format(i + 1), w.grad.sum().item())

    wgrads = []
    for w in model.w_conv:
        wgrads.append(w.grad.float())

    model.use_cudnn = True
    model.zero_grad()
    c = model(a)
    c.mean().backward()

    torch.cuda.synchronize()
    print("comparing native and channels_last:")
    print(
        "max error fprop:",
        (b - c).abs().max().item(),
        "max elem:",
        b.abs().max().item(),
    )
    print(
        "max error dgrad:",
        (d_grad - a.grad.float()).abs().max().item(),
        "max elem:",
        d_grad.abs().max().item(),
    )
    for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)):
        print(
            "max error wgrad{}:".format(i + 1),
            (wgrad - w.grad.float()).abs().max().item(),
            "max elem:",
            wgrad.abs().max().item(),
        )

    nhwc_a = a_.permute(0, 2, 3, 1).contiguous().cuda().half().requires_grad_()
    nhwc_model = (
        Bottleneck(32, 8, o_channel, stride=stride, explicit_nhwc=True, use_cudnn=True)
        .cuda()
        .half()
    )
    for p, q in zip(model.parameters(), nhwc_model.parameters()):
        # model's storage is already in nhwc, we clone and assign to explicit nhwc model
        q.data.copy_(p.data.permute(0, 2, 3, 1).contiguous())
    for p, q in zip(model.buffers(), nhwc_model.buffers()):
        q.data.copy_(p.data)

    d = nhwc_model(nhwc_a)
    d.mean().backward()
    torch.cuda.synchronize()

    # reset reference to cudnn channels_last permute
    # c_s = c.storage().tolist()
    # d_s = d.storage().tolist()
    # print(max([x-y for x,y in zip(c_s,d_s)]))
    c = c.contiguous(memory_format=torch.contiguous_format).permute(0, 2, 3, 1).contiguous()
    d_grad = a.grad.float().permute(0, 2, 3, 1).contiguous()
    wgrads = []
    for w in model.w_conv:
        wgrads.append(w.grad.float().permute(0, 2, 3, 1).contiguous())

    torch.cuda.synchronize()
    print("comparing nhwc and channels_last:")
    print(
        "max error fprop:",
        (d - c).abs().max().item(),
        "max elem:",
        c.abs().max().item(),
    )
    print(
        "max error dgrad:",
        (d_grad - nhwc_a.grad.float()).abs().max().item(),
        "max elem:",
        d_grad.abs().max().item(),
    )
    for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)):
        print(
            "max error wgrad{}:".format(i + 1),
            (wgrad - w.grad.float()).abs().max().item(),
            "max elem:",
            wgrad.abs().max().item(),
        )


================================================
FILE: apex/contrib/clip_grad/__init__.py
================================================
from .clip_grad import clip_grad_norm_


================================================
FILE: apex/contrib/clip_grad/clip_grad.py
================================================
from typing import Union, Iterable

import torch

_kernel_import_succeeded = False
try:
    import amp_C
    from apex.multi_tensor_apply import multi_tensor_applier

    _kernel_import_succeeded = True
except ImportError:
    _kernel_import_succeeded = False

_tensor_or_tensors = Union[torch.Tensor, Iterable[torch.Tensor]]


def clip_grad_norm_(
    parameters: _tensor_or_tensors,
    max_norm: float,
    norm_type: float = 2.0,
    error_if_nonfinite: bool = False,
) -> torch.Tensor:
    r"""Clips gradient norm of an iterable of parameters.

    The norm is computed over all gradients together, as if they were
    concatenated into a single vector. Gradients are modified in-place.

    This is identical to torch.nn.utils.clip_grad_norm_, except it
    uses a fused CUDA kernel when computing the 2-norm of GPU tensors
    in float32 and float16.

    Args:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.
        error_if_nonfinite (bool): if True, an error is thrown if the total
            norm of the gradients from :attr:`parameters` is ``nan``,
            ``inf``, or ``-inf``. Default: False (will switch to True in the future)

    Returns:
        Total norm of the parameters (viewed as a single vector).

    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    max_norm = float(max_norm)
    norm_type = float(norm_type)

    # Trivial case
    if len(parameters) == 0:
        return torch.tensor(0.0)

    # Fallback implementation
    if not (_kernel_import_succeeded and norm_type == 2.0 and any(p.is_cuda for p in parameters)):
        return torch.nn.utils.clip_grad_norm_(
            parameters,
            max_norm,
            norm_type=norm_type,
            error_if_nonfinite=error_if_nonfinite,
        )

    # Find fp32 and fp16 gradients on GPU
    device = next(p.device for p in parameters if p.is_cuda)
    grads_fp32, grads_fp16, grads_misc = [], [], []
    for p in parameters:
        grad = p.grad.detach()
        if p.dtype == torch.float32 and p.device == device:
            grads_fp32.append(grad)
        elif p.dtype == torch.float16 and p.device == device:
            grads_fp16.append(grad)
        else:
            grads_misc.append(grad)

    # Compute gradient L2 norms
    norms = []
    dummy_overflow_buf = torch.zeros([1], dtype=torch.int32, device=device)
    if grads_fp32:
        norms.append(
            multi_tensor_applier(
                amp_C.multi_tensor_l2norm,
                dummy_overflow_buf,
                [grads_fp32],
                False,
            )[0]
        )
    if grads_fp16:
        norms.append(
            multi_tensor_applier(
                amp_C.multi_tensor_l2norm,
                dummy_overflow_buf,
                [grads_fp16],
                False,
            )[0],
        )
    for g in grads_misc:
        norms.append(torch.linalg.norm(g).unsqueeze(0).to(device))
    total_norm = torch.linalg.norm(torch.cat(norms))

    # Check for non-finite values
    if error_if_nonfinite and torch.logical_or(total_norm.isnan(), total_norm.isinf()):
        raise RuntimeError(
            f"The total norm of order {norm_type} for gradients from "
            "`parameters` is non-finite, so it cannot be clipped. To disable "
            "this error and scale the gradients by the non-finite norm anyway, "
            "set `error_if_nonfinite=False`"
        )

    # Scale gradients
    clip_coef = max_norm / (total_norm + 1e-6)
    clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
    if grads_fp32:
        multi_tensor_applier(
            amp_C.multi_tensor_scale,
            dummy_overflow_buf,
            [grads_fp32, grads_fp32],
            clip_coef_clamped,
        )
    if grads_fp16:
        multi_tensor_applier(
            amp_C.multi_tensor_scale,
            dummy_overflow_buf,
            [grads_fp16, grads_fp16],
            clip_coef_clamped,
        )
    for g in grads_misc:
        g.mul_(clip_coef_clamped.to(g.device))

    return total_norm


================================================
FILE: apex/contrib/conv_bias_relu/__init__.py
================================================
from .conv_bias_relu import (
    ConvBiasReLU,
    ConvBias,
    ConvBiasMaskReLU,
    ConvFrozenScaleBiasReLU,
)


================================================
FILE: apex/contrib/conv_bias_relu/conv_bias_relu.py
================================================
import torch

from apex import check_cudnn_version_and_warn
import fused_conv_bias_relu

check_cudnn_version_and_warn(__name__, 8400)


class ConvBiasReLU_(torch.autograd.Function):
    @staticmethod
    @torch.amp.custom_fwd(cast_inputs=torch.half, device_type="cuda")
    def forward(ctx, x, weight, bias, padding, stride):
        outputs = fused_conv_bias_relu.forward([x, weight, bias], padding, stride)
        ctx.save_for_backward(x, weight, outputs[0])
        ctx.padding = padding
        ctx.stride = stride

        return outputs[0]

    @staticmethod
    @torch.amp.custom_bwd(device_type="cuda")
    def backward(ctx, grad_output):
        bwd_args = [*ctx.saved_tensors, grad_output]
        padding = ctx.padding
        stride = ctx.stride
        grads = fused_conv_bias_relu.backward(bwd_args, padding, stride)

        return grads[0], grads[1], grads[2], None, None


class ConvBiasMaskReLU_(torch.autograd.Function):
    @staticmethod
    @torch.amp.custom_fwd(cast_inputs=torch.half, device_type="cuda")
    def forward(ctx, x, weight, bias, mask, padding, stride):
        outputs = fused_conv_bias_relu.forward_mask([x, weight, bias, mask], padding, stride)
        ctx.save_for_backward(x, weight, outputs[0])
        ctx.padding = padding
        ctx.stride = stride

        return outputs[0]

    @staticmethod
    @torch.amp.custom_bwd(device_type="cuda")
    def backward(ctx, grad_output):
        bwd_args = [*ctx.saved_tensors, grad_output]
        padding = ctx.padding
        stride = ctx.stride
        grads = fused_conv_bias_relu.backward(bwd_args, padding, stride)

        return grads[0], grads[1], grads[2], None, None, None


class ConvBias_(torch.autograd.Function):
    @staticmethod
    @torch.amp.custom_fwd(cast_inputs=torch.half, device_type="cuda")
    def forward(ctx, x, weight, bias, padding, stride):
        outputs = fused_conv_bias_relu.forward_no_relu([x, weight, bias], padding, stride)
        ctx.save_for_backward(x, weight)
        ctx.padding = padding
        ctx.stride = stride

        return outputs[0]

    @staticmethod
    @torch.amp.custom_bwd(device_type="cuda")
    def backward(ctx, grad_output):
        bwd_args = [*ctx.saved_tensors, grad_output]
        padding = ctx.padding
        stride = ctx.stride
        grads = fused_conv_bias_relu.backward_no_relu(bwd_args, padding, stride)

        return grads[0], grads[1], grads[2], None, None


class ConvFrozenScaleBiasReLU_(torch.autograd.Function):
    @staticmethod
    @torch.amp.custom_fwd(cast_inputs=torch.half, device_type="cuda")
    def forward(ctx, x, weight, scale, bias, padding, stride):
        output = fused_conv_bias_relu.forward_cscale_cbias_relu(
            [x, weight, scale, bias], padding, stride
        )
        ctx.save_for_backward(x, weight, scale, output)
        ctx.padding = padding
        ctx.stride = stride

        return output

    @staticmethod
    @torch.amp.custom_bwd(device_type="cuda")
    def backward(ctx, grad_output):
        bwd_args = [*ctx.saved_tensors, grad_output]
        padding = ctx.padding
        stride = ctx.stride
        grads = fused_conv_bias_relu.backward_cscale_cbias_relu(bwd_args, padding, stride)

        return grads[0], grads[1], None, None, None, None


ConvBiasReLU = ConvBiasReLU_.apply
ConvBiasMaskReLU = ConvBiasMaskReLU_.apply
ConvBias = ConvBias_.apply
ConvFrozenScaleBiasReLU = ConvFrozenScaleBiasReLU_.apply


================================================
FILE: apex/contrib/csrc/bottleneck/bottleneck.cpp
================================================
#include <ATen/ATen.h>
#include <ATen/cudnn/Handle.h>  // for getcudnnhandle
#include <cudnn_frontend.h>
#include <torch/extension.h>
#include <torch/torch.h>

#include <iostream>
#include <vector>

#ifdef DEBUG
#define DEBUG_MSG(str)             \
  do {                             \
    std::cout << str << std::endl; \
  } while (false)
#else
#define DEBUG_MSG(str) \
  do {                 \
  } while (false)
#endif

#ifdef DEBUG_CUDNN
#define DEBUG_CUDNN_MSG(buf, str) \
  do {                            \
    buf << str << std::endl;      \
  } while (false)
#else
#define DEBUG_CUDNN_MSG(buf, str) \
  do {                            \
  } while (false)
#endif

#define checkCudnnErr(...)                                                    \
  do {                                                                        \
    int err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
    if (err) {                                                                \
      return;                                                                 \
    }                                                                         \
  } while (0)

int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
  if (code) {
    printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
    return 1;
  }
  return 0;
}

void checkError(cudaError_t code, char const* func, const char* file, const int line, bool abort = true);
#define checkCUDAError(val)                      \
  {                                              \
    checkError((val), #val, __FILE__, __LINE__); \
  }  // in-line regular function

void checkError(cudaError_t code, char const* func, const char* file, const int line, bool abort) {
  if (code != cudaSuccess) {
    const char* errorMessage = cudaGetErrorString(code);
    fprintf(stderr, "CUDA error returned from \"%s\" at %s:%d, Error code: %d (%s)\n", func, file, line, code,
            errorMessage);
    if (abort) {
      cudaDeviceReset();
      exit(code);
    }
  }
}

void generateStrides(const int64_t* dimA, int64_t* strideA, int nbDims, cudnnTensorFormat_t filterFormat) {
  // For INT8x4 and INT8x32 we still compute standard strides here to input
  // into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
  if (filterFormat == CUDNN_TENSOR_NCHW) {
    strideA[nbDims - 1] = 1;
    for (int64_t d = nbDims - 2; d >= 0; d--) {
      strideA[d] = strideA[d + 1] * dimA[d + 1];
    }
  } else {
    // Here we assume that the format is CUDNN_TENSOR_NHWC
    strideA[1] = 1;
    strideA[nbDims - 1] = strideA[1] * dimA[1];
    for (int64_t d = nbDims - 2; d >= 2; d--) {
      strideA[d] = strideA[d + 1] * dimA[d + 1];
    }
    strideA[0] = strideA[2] * dimA[2];
  }
}

int getFwdConvDilatedFilterDim(int filterDim, int dilation) { return ((filterDim - 1) * dilation) + 1; }

int getFwdConvPaddedImageDim(int tensorDim, int pad) { return tensorDim + (2 * pad); }

int getFwdConvOutputDim(int tensorDim, int pad, int filterDim, int stride, int dilation) {
  int p = (getFwdConvPaddedImageDim(tensorDim, pad) - getFwdConvDilatedFilterDim(filterDim, dilation)) / stride + 1;
  return (p);
}

enum {
  X_TENSOR,
  Y_TENSOR,
  W_TENSOR,
  Z_TENSOR,
  B_TENSOR,
  AFTERADD_TENSOR,
  AFTERBIAS_TENSOR,
  AFTERCONV_TENSOR,
  OPTIONAL,
  AFTEROPT_TENSOR,
};

using common_conv_descriptors =
    std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::ConvDesc>;

common_conv_descriptors create_common_descriptors(int64_t* x_dim_padded, int64_t* padA, int64_t* convstrideA,
                                                  int64_t* dilationA, int64_t* w_dim_padded, int64_t* y_dim_padded,
                                                  cudnnDataType_t dataType, cudnnConvolutionMode_t mode) {
  const int convDim = 2;

  int64_t strideA_padded[4];
  int64_t outstrideA_padded[4];
  int64_t filterstrideA_padded[4];

  generateStrides(w_dim_padded, filterstrideA_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, strideA_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, outstrideA_padded, 4, CUDNN_TENSOR_NHWC);

  return common_conv_descriptors(cudnn_frontend::TensorBuilder()
                                     .setDim(4, x_dim_padded)
                                     .setStrides(4, strideA_padded)
                                     .setId('x')
                                     .setAlignment(16)
                                     .setDataType(dataType)
                                     .build(),
                                 cudnn_frontend::TensorBuilder()
                                     .setDim(4, y_dim_padded)
                                     .setStrides(4, outstrideA_padded)
                                     .setId('y')
                                     .setAlignment(16)
                                     .setDataType(dataType)
                                     .build(),
                                 cudnn_frontend::TensorBuilder()
                                     .setDim(4, w_dim_padded)
                                     .setStrides(4, filterstrideA_padded)
                                     .setId('w')
                                     .setAlignment(16)
                                     .setDataType(dataType)
                                     .build(),
                                 cudnn_frontend::ConvDescBuilder()
                                     .setDataType(CUDNN_DATA_FLOAT)
                                     .setMathMode(mode)
                                     .setNDims(convDim)
                                     .setStrides(convDim, convstrideA)
                                     .setPrePadding(convDim, padA)
                                     .setPostPadding(convDim, padA)
                                     .setDilation(convDim, dilationA)
                                     .build());
}

using common_convbias_descriptors =
    std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor>;

common_convbias_descriptors create_conv_bias_add_act_descriptors(int64_t* x_dim_padded, int64_t* padA,
                                                                 int64_t* convstrideA, int64_t* dilationA,
                                                                 int64_t* w_dim_padded, int64_t* y_dim_padded,
                                                                 cudnnDataType_t dataType) {
  const int convDim = 2;

  int64_t b_dim_padded[4];
  b_dim_padded[0] = 1;
  b_dim_padded[1] = y_dim_padded[1];
  b_dim_padded[2] = 1;
  b_dim_padded[3] = 1;

  int64_t x_stride_padded[4];
  int64_t y_stride_padded[4];
  int64_t w_stride_padded[4];
  int64_t b_stride_padded[4];

  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);

  return common_convbias_descriptors(cudnn_frontend::TensorBuilder()
                                         .setDim(4, x_dim_padded)
                                         .setStrides(4, x_stride_padded)
                                         .setId('x')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('y')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, w_dim_padded)
                                         .setStrides(4, w_stride_padded)
                                         .setId('w')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, b_dim_padded)
                                         .setStrides(4, b_stride_padded)
                                         .setId('z')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, b_dim_padded)
                                         .setStrides(4, b_stride_padded)
                                         .setId('b')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setVirtual()
                                         .setId('A')  // after add
                                         .setAlignment(16)
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setVirtual()
                                         .setId('B')  // after bias
                                         .setAlignment(16)
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('C')  // after conv
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('i')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('D')  // after optional add
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build());
}

// tensor descriptors used for dgrad
enum {
  X_OR_DX_TENSOR,
  DY_TENSOR,
  W_OR_DW_TENSOR,
  SCALE_TENSOR,
  RELU_TENSOR,
  AFTER_DCONV_TENSOR,
  AFTER_DRELU_TENSOR,
};

using dconv_descriptors =
    std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor>;

dconv_descriptors create_dconv_descriptors(int64_t* x_dim_padded, int64_t* padA, int64_t* convstrideA,
                                           int64_t* dilationA, int64_t* w_dim_padded, int64_t* y_dim_padded,
                                           cudnnDataType_t dataType) {
  const int convDim = 2;

  int64_t b_dim_padded[4];
  b_dim_padded[0] = 1;
  b_dim_padded[1] = x_dim_padded[1];
  b_dim_padded[2] = 1;
  b_dim_padded[3] = 1;

  int64_t x_stride_padded[4];
  int64_t y_stride_padded[4];
  int64_t w_stride_padded[4];
  int64_t b_stride_padded[4];

  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);

  return dconv_descriptors(cudnn_frontend::TensorBuilder()
                               .setDim(4, x_dim_padded)
                               .setStrides(4, x_stride_padded)
                               .setId('x')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build(),
                           cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim_padded)
                               .setStrides(4, y_stride_padded)
                               .setId('y')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build(),
                           cudnn_frontend::TensorBuilder()
                               .setDim(4, w_dim_padded)
                               .setStrides(4, w_stride_padded)
                               .setId('w')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build(),
                           cudnn_frontend::TensorBuilder()
                               .setDim(4, b_dim_padded)
                               .setStrides(4, b_stride_padded)
                               .setId('s')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build(),
                           cudnn_frontend::TensorBuilder()
                               .setDim(4, x_dim_padded)
                               .setStrides(4, x_stride_padded)
                               .setId('r')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build(),
                           cudnn_frontend::TensorBuilder()
                               .setDim(4, x_dim_padded)
                               .setStrides(4, x_stride_padded)
                               .setVirtual()
                               .setId('A')  // after dconv
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .build(),
                           cudnn_frontend::TensorBuilder()
                               .setDim(4, x_dim_padded)
                               .setStrides(4, x_stride_padded)
                               .setVirtual()
                               .setId('B')  // after drelu
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .build());
}

// create a cache for plan
std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache;

// TODO: better name
std::string getConvFusionString(int64_t* x_dim_padded, int64_t* padA, int64_t* convstrideA, int64_t* dilationA,
                                int64_t* w_dim_padded, cudnnDataType_t dataType, std::string fusion_string) {
  for (int i = 0; i < 4; i++) {
    fusion_string += 'X';
    fusion_string += std::to_string(x_dim_padded[i]);
  }
  for (int i = 0; i < 4; i++) {
    fusion_string += 'W';
    fusion_string += std::to_string(w_dim_padded[i]);
  }
  for (int i = 0; i < 2; i++) {
    fusion_string += 'P';
    fusion_string += std::to_string(padA[i]);
  }
  for (int i = 0; i < 2; i++) {
    fusion_string += 'S';
    fusion_string += std::to_string(convstrideA[i]);
  }
  for (int i = 0; i < 2; i++) {
    fusion_string += 'D';
    fusion_string += std::to_string(dilationA[i]);
  }
  fusion_string += 'T';
  fusion_string += std::to_string(dataType);
  return fusion_string;
}

cudnn_frontend::ExecutionPlan& getOrCreatePlan(cudnnHandle_t handle_, std::stringstream& log_buf,
                                               cudnn_frontend::OperationGraph& opGraph, std::string cache_string,
                                               bool use_heuristic = true) {
  auto it = plan_cache.find(cache_string);
  if (it != plan_cache.end()) {
    DEBUG_CUDNN_MSG(log_buf, "Found plan in cache");
    return it->second;
  } else {
    if (use_heuristic) {
      // TODO: confirm which mode to use
      auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
                            .setOperationGraph(opGraph)
                            .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
                            .build();
      // try 3 times for now as WAR for no heuristic training
      int max_tries = 3, count = 0;
      auto& engine_configs = heuristics.getEngineConfig(max_tries);
      while (true) {
        try {
          plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder()
                                                         .setHandle(handle_)
                                                         .setEngineConfig(engine_configs[count], opGraph.getTag())
                                                         .build()));
          break;
        } catch (cudnn_frontend::cudnnException e) {
          if (++count == max_tries) throw e;
        }
      }
    } else {
      DEBUG_CUDNN_MSG(log_buf, "No plan in cache");
      // How many engines support this operation graph ?
      auto total_engines = opGraph.getEngineCount();
      DEBUG_CUDNN_MSG(log_buf, opGraph.describe() << " has " << total_engines << " engines.");
      // We have to randomly pick one engine from [0, total_engines)
      // Selecting "0" by default
      auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build();
      DEBUG_CUDNN_MSG(log_buf, engine.describe());
      auto& knobs = engine.getSupportedKnobs();
      for (auto it = std::begin(knobs); it != std::end(knobs); ++it) {
        DEBUG_CUDNN_MSG(log_buf, it->describe());
      }
      if (knobs.begin() != knobs.end()) {
        DEBUG_CUDNN_MSG(log_buf, "Updated knob choice");
        knobs.begin()->setChoice(knobs.begin()->getMinValue() + 1);
        DEBUG_CUDNN_MSG(log_buf, knobs.begin()->describe());
      }

      // Createmplacee the requisite engine config
      auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build();
      DEBUG_CUDNN_MSG(log_buf, engine_config.describe());
      plan_cache.emplace(
          cache_string,
          std::move(cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build()));
    }

    return plan_cache.find(cache_string)->second;
  }
}

void run_conv_scale_bias_add_activation(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride, int64_t* dilation,
                                        int64_t* w_dim_padded, int64_t* y_dim_padded, cudnnDataType_t dataType,
                                        at::Half* devPtrX, at::Half* devPtrW, at::Half* devPtrY, at::Half* devPtrZ,
                                        at::Half* devPtrB, at::Half* devPtrI) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(x_dim_padded, pad, convstride, dilation,
                                                                               w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());

    // Define the add operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the bias operation
    auto biasDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // optional add
    auto addDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    // Define the activation operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_FWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(std::get<X_TENSOR>(tensors))
                       .setwDesc(std::get<W_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create a Add Node with scaling parameters.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(conv_op.getOutputTensor())
                        .setbDesc(std::get<Z_TENSOR>(tensors))
                        .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Bias Node.
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(scale_op.getOutputTensor())
                       .setbDesc(std::get<B_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create a optional add Node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(bias_op.getOutputTensor())
                      .setbDesc(std::get<OPTIONAL>(tensors))
                      .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // Create an Activation Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
                      .setyDesc(std::get<Y_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 5> ops = {&conv_op, &scale_op, &bias_op, devPtrI ? &add_op : &act_op,
                                                           &act_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(devPtrI ? ops.size() : 4, ops.data())
                       .build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
    int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(devPtrI ? 6 : 5, data_ptrs)
                           .setUids(devPtrI ? 6 : 5, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_conv_scale_bias(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride, int64_t* dilation,
                         int64_t* w_dim_padded, int64_t* y_dim_padded, cudnnDataType_t dataType, at::Half* devPtrX,
                         at::Half* devPtrW, at::Half* devPtrY, at::Half* devPtrZ, at::Half* devPtrB) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(x_dim_padded, pad, convstride, dilation,
                                                                               w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());

    // Define the add operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the bias operation
    auto addDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(std::get<X_TENSOR>(tensors))
                       .setwDesc(std::get<W_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create a Add Node with scaling parameters.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(conv_op.getOutputTensor())
                        .setbDesc(std::get<Z_TENSOR>(tensors))
                        .setyDesc(std::get<AFTERADD_TENSOR>(tensors))  // TODO: change enum to aftermul
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Bias Node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(scale_op.getOutputTensor())
                      .setbDesc(std::get<B_TENSOR>(tensors))
                      .setyDesc(std::get<Y_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &scale_op, &add_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB};
    int64_t uids[] = {'x', 'y', 'w', 'z', 'b'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(5, data_ptrs)
                           .setUids(5, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_dconv_drelu_dscale(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride, int64_t* dilation,
                            int64_t* w_dim_padded, int64_t* y_dim_padded, cudnnDataType_t dataType, at::Half* devPtrX,
                            at::Half* devPtrW, at::Half* devPtrY, at::Half* devPtrZ, at::Half* devPtrR) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_descriptors tensors =
        create_dconv_descriptors(x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the scale backward operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create an relu backward Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setxDesc(std::get<RELU_TENSOR>(tensors))
                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create a Scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
                        .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &act_op, &scale_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR};
    int64_t uids[] = {'x', 'y', 'w', 's', 'r'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(5, data_ptrs)
                           .setUids(5, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_dconv(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride, int64_t* dilation, int64_t* w_dim_padded,
               int64_t* y_dim_padded, cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW, at::Half* devPtrY,
               cudnnBackendDescriptorType_t mode) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_descriptors tensors =
        create_dconv_descriptors(x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    // mode should be one of following
    // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
    // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
    auto conv_op_builder = cudnn_frontend::OperationBuilder(mode);
    if (mode == CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
      conv_op_builder.setdxDesc(std::get<X_OR_DX_TENSOR>(tensors))
          .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
          .setdyDesc(std::get<DY_TENSOR>(tensors))
          .setcDesc(convDesc)
          .setAlpha(alpha)
          .setBeta(beta);
    } else {
      conv_op_builder.setxDesc(std::get<X_OR_DX_TENSOR>(tensors))
          .setdwDesc(std::get<W_OR_DW_TENSOR>(tensors))
          .setdyDesc(std::get<DY_TENSOR>(tensors))
          .setcDesc(convDesc)
          .setAlpha(alpha)
          .setBeta(beta);
    }
    auto conv_op = conv_op_builder.build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 1> ops = {&conv_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW};
    int64_t uids[] = {'x', 'y', 'w'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(3, data_ptrs)
                           .setUids(3, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_dconv_add(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride, int64_t* dilation, int64_t* w_dim_padded,
                   int64_t* y_dim_padded, cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW,
                   at::Half* devPtrY, at::Half* devPtrR) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_descriptors tensors =
        create_dconv_descriptors(x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the add backward operation
    auto addDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create add Node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setbDesc(std::get<RELU_TENSOR>(tensors))
                      .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 2> ops = {&conv_op, &add_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrR};
    int64_t uids[] = {'x', 'y', 'w', 'r'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(4, data_ptrs)
                           .setUids(4, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

// inputs contains x,w,z,b,(i)
std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
  std::cout << std::fixed;
  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t dimA[] = {0, 0, 0, 0};
  int64_t filterdimA1[] = {0, 0, 0, 0};
  int64_t filterdimA2[] = {0, 0, 0, 0};
  int64_t filterdimA3[] = {0, 0, 0, 0};
  int64_t filterdimA4[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[]{0, 1, 2, 3};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 3;
    axis[2] = 1;
    axis[3] = 2;
  }
  for (int dim = 0; dim < 4; dim++) {
    dimA[dim] = inputs[0].size(axis[dim]);
    filterdimA1[dim] = inputs[1].size(axis[dim]);
    filterdimA2[dim] = inputs[2].size(axis[dim]);
    filterdimA3[dim] = inputs[3].size(axis[dim]);
  }
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    for (int dim = 0; dim < 4; dim++) {
      filterdimA4[dim] = inputs[10].size(axis[dim]);
    }
  }

  // output dim in n,c,h,w used by backend
  int64_t outdimA1[] = {0, 0, 0, 0};  // Computed Below
  int64_t outdimA2[] = {0, 0, 0, 0};  // Computed Below
  int64_t outdimA3[] = {0, 0, 0, 0};  // Computed Below

  // use these fixed value for test run
  int64_t padA[] = {0, 0};
  int64_t padA1[] = {1, 1};
  int64_t dilationA[] = {1, 1};
  int64_t convstrideA[] = {1, 1};
  int64_t convstride1X1[] = {stride_1X1, stride_1X1};

  // compute output from pad/stride/dilation
  outdimA1[0] = dimA[0];
  outdimA1[1] = filterdimA1[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA1[dim + 2] =
        getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
  }

  outdimA2[0] = outdimA1[0];
  outdimA2[1] = filterdimA2[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA2[dim + 2] =
        getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  outdimA3[0] = outdimA2[0];
  outdimA3[1] = filterdimA3[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA3[dim + 2] =
        getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  // Create output tensor in the correct shape in pytorch's view
  int64_t outdim1[] = {0, 0, 0, 0};
  int64_t outdim2[] = {0, 0, 0, 0};
  int64_t outdim3[] = {0, 0, 0, 0};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 2;
    axis[2] = 3;
    axis[3] = 1;
  }
  for (int dim = 0; dim < 4; dim++) {
    outdim1[dim] = outdimA1[axis[dim]];
    outdim2[dim] = outdimA2[axis[dim]];
    outdim3[dim] = outdimA3[axis[dim]];
  }

  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();
  at::Half* b = inputs[7].data_ptr<at::Half>();
  auto out1 = at::empty(outdim1, inputs[0].type(), output_format);
  at::Half* y1 = out1.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, x, w,
                                     y1, z, b, nullptr);

  DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item<float>());

  w = inputs[2].data_ptr<at::Half>();
  z = inputs[5].data_ptr<at::Half>();
  b = inputs[8].data_ptr<at::Half>();
  auto out2 = at::empty(outdim2, inputs[0].type(), output_format);
  at::Half* y2 = out2.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF,
                                     y1, w, y2, z, b, nullptr);
  DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item<float>());

  // create output of conv3
  auto out3 = at::empty(outdim3, inputs[0].type(), output_format);
  at::Half* y3 = out3.data_ptr<at::Half>();

  // create output of conv4 that may exist
  auto identity = at::empty_like(out3);
  at::Half* yi = identity.data_ptr<at::Half>();

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    w = inputs[10].data_ptr<at::Half>();
    z = inputs[11].data_ptr<at::Half>();
    b = inputs[12].data_ptr<at::Half>();
    run_conv_scale_bias(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, x, w, yi, z, b);
    DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item<float>());
  } else {
    yi = x;
  }

  w = inputs[3].data_ptr<at::Half>();
  z = inputs[6].data_ptr<at::Half>();
  b = inputs[9].data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, y2,
                                     w, y3, z, b, yi);
  DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item<float>());

  outputs.push_back(out1);
  outputs.push_back(out2);
  outputs.push_back(out3);

  return outputs;
}

std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t dimA[] = {0, 0, 0, 0};
  int64_t filterdimA1[] = {0, 0, 0, 0};
  int64_t filterdimA2[] = {0, 0, 0, 0};
  int64_t filterdimA3[] = {0, 0, 0, 0};
  int64_t filterdimA4[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[]{0, 1, 2, 3};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 3;
    axis[2] = 1;
    axis[3] = 2;
  }
  for (int dim = 0; dim < 4; dim++) {
    dimA[dim] = inputs[0].size(axis[dim]);
    filterdimA1[dim] = inputs[1].size(axis[dim]);
    filterdimA2[dim] = inputs[2].size(axis[dim]);
    filterdimA3[dim] = inputs[3].size(axis[dim]);
  }
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    for (int dim = 0; dim < 4; dim++) {
      filterdimA4[dim] = inputs[14].size(axis[dim]);
    }
  }

  // output dim in n,c,h,w used by backend
  int64_t outdimA1[] = {0, 0, 0, 0};  // Computed Below
  int64_t outdimA2[] = {0, 0, 0, 0};  // Computed Below
  int64_t outdimA3[] = {0, 0, 0, 0};  // Computed Below

  // use these fixed value for test run
  int64_t padA[] = {0, 0};
  int64_t padA1[] = {1, 1};
  int64_t dilationA[] = {1, 1};
  int64_t convstrideA[] = {1, 1};
  int64_t convstride1X1[] = {stride_1X1, stride_1X1};

  // compute output from pad/stride/dilation
  outdimA1[0] = dimA[0];
  outdimA1[1] = filterdimA1[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA1[dim + 2] =
        getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
  }

  outdimA2[0] = outdimA1[0];
  outdimA2[1] = filterdimA2[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA2[dim + 2] =
        getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  outdimA3[0] = outdimA2[0];
  outdimA3[1] = filterdimA3[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA3[dim + 2] =
        getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  // Create output tensor in the correct shape in pytorch's view
  int64_t outdim1[] = {0, 0, 0, 0};
  int64_t outdim2[] = {0, 0, 0, 0};
  int64_t outdim3[] = {0, 0, 0, 0};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 2;
    axis[2] = 3;
    axis[3] = 1;
  }
  for (int dim = 0; dim < 4; dim++) {
    outdim1[dim] = outdimA1[axis[dim]];
    outdim2[dim] = outdimA2[axis[dim]];
    outdim3[dim] = outdimA3[axis[dim]];
  }

  // dconv3+drelu2+dscale2
  at::Half* conv_in = inputs[13].data_ptr<at::Half>();
  at::Half* dy3 = inputs[10].data_ptr<at::Half>();

  DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());

  // wgrad
  auto wgrad3 = at::empty_like(inputs[3]);
  at::Half* dw3 = wgrad3.data_ptr<at::Half>();
  run_dconv(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, conv_in, dw3, dy3,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  auto grad_out2 = at::empty(outdim2, inputs[0].type(), output_format);
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();
  at::Half* w = inputs[3].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();

  at::Half* relu2 = inputs[13].data_ptr<at::Half>();

  run_dconv_drelu_dscale(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, dy2, w, dy3, z,
                         relu2);

  DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item<float>());

  // dconv2+drelu1+dscale1
  conv_in = inputs[12].data_ptr<at::Half>();

  // wgrad
  auto wgrad2 = at::empty_like(inputs[2]);
  at::Half* dw2 = wgrad2.data_ptr<at::Half>();
  run_dconv(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, conv_in, dw2, dy2,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  auto grad_out1 = at::empty(outdim1, inputs[0].type(), output_format);
  at::Half* dy1 = grad_out1.data_ptr<at::Half>();
  w = inputs[2].data_ptr<at::Half>();
  z = inputs[4].data_ptr<at::Half>();

  at::Half* relu1 = inputs[12].data_ptr<at::Half>();
  // fused dgrad
  run_dconv_drelu_dscale(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, dy1, w, dy2,
                         z, relu1);

  /*
    // backward strided conv cannot be fused
    // if stride == 1 but channel changes, we can fuse here
    if (stride_1X1 != 1){
      // dgrad
      run_dconv(outdimA1,
                padA1,
                convstride1X1,
                dilationA,
                filterdimA2,
                outdimA2,
                CUDNN_DATA_HALF,
                dy1,
                w,
                dy2,
                CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);

      // mul fused mask
      grad_out1.mul_(inputs[15]);
    }
    else {
      at::Half* relu1 = inputs[12].data_ptr<at::Half>();
      // fused dgrad
      run_dconv_drelu_dscale(outdimA1,
                             padA1,
                             convstride1X1,
                             dilationA,
                             filterdimA2,
                             outdimA2,
                             CUDNN_DATA_HALF,
                             dy1,
                             w,
                             dy2,
                             z,
                             relu1);
    }
  */
  DEBUG_MSG("[DEBUG] new dconv1 : " << grad_out1.to(at::kFloat).sum().item<float>());

  // create grads of conv4 that may exist
  auto grad_x_conv4 = at::empty_like(inputs[0]);
  at::Half* dx_conv4 = grad_x_conv4.data_ptr<at::Half>();
  at::Tensor wgrad4;

  // x used for dconv1 and dconv4 wgrad
  at::Half* x = inputs[0].data_ptr<at::Half>();

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    w = inputs[14].data_ptr<at::Half>();
    at::Half* dy_conv4 = inputs[11].data_ptr<at::Half>();
    if (requires_grad) {
      run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, dx_conv4, w, dy_conv4,
                CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // we don't print here since we can't hook out this grad in pytorch alone to compare, due to addition with dx
      // DEBUG_MSG("[DEBUG] new dx_identity : " << grad_x_conv4.to(at::kFloat).sum().item<float>());
    }
    // wgrad
    wgrad4 = at::empty_like(inputs[14]);
    at::Half* dw4 = wgrad4.data_ptr<at::Half>();
    run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, x, dw4, dy_conv4,
              CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
  } else {
    // if there is no downsample, dx_conv4 is fork of drelu3
    dx_conv4 = inputs[11].data_ptr<at::Half>();
  }

  // dconv1+add
  // wgrad
  auto wgrad1 = at::empty_like(inputs[1]);
  at::Half* dw1 = wgrad1.data_ptr<at::Half>();
  run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, x, dw1, dy1,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  w = inputs[1].data_ptr<at::Half>();
  auto grad_x = at::empty_like(inputs[0]);
  at::Half* dx = grad_x.data_ptr<at::Half>();

  // backward strided conv cannot be fused
  // if stride == 1 but channel changes, we can fuse here
  if (requires_grad) {
    if (stride_1X1 != 1) {
      run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, dx, w, dy1,
                CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // add 2 together
      grad_x.add_(grad_x_conv4);
    } else {
      run_dconv_add(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, dx, w, dy1, dx_conv4);
    }
  }

  DEBUG_MSG("[DEBUG] new dx : " << grad_x.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad1 : " << wgrad1.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item<float>());
  outputs.push_back(grad_x);
  outputs.push_back(wgrad1);
  outputs.push_back(wgrad2);
  outputs.push_back(wgrad3);

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    DEBUG_MSG("[DEBUG] new wgrad4 : " << wgrad4.to(at::kFloat).sum().item<float>());
    outputs.push_back(wgrad4);
  }

  return outputs;
}

namespace {

enum {
  X_TENSOR,
  Y_TENSOR,
  W_TENSOR,
  Z_TENSOR,
  B_TENSOR,
  AFTERADD_TENSOR,
  AFTERBIAS_TENSOR,
  AFTERCONV_TENSOR,
  OPTIONAL,
  AFTEROPT_TENSOR,
  AFTERACT_TENSOR,
  GEN_INDEX_TENSOR,
  MASK_TOP_TENSOR,
  MASK_BOTTOM_TENSOR,
  MASK_TENSOR,
  THRESHOLD_TOP_TENSOR,
  THRESHOLD_BOTTOM_TENSOR,
};

using masked_convbias_descriptors =
    std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor>;

masked_convbias_descriptors create_conv_bias_add_act_mask_descriptors(int64_t* x_dim_padded, int64_t* padA,
                                                                      int64_t* convstrideA, int64_t* dilationA,
                                                                      int64_t* w_dim_padded, int64_t* y_dim_padded,
                                                                      int64_t* threshold_dim,
                                                                      cudnnDataType_t dataType) {
  const int convDim = 2;

  int64_t b_dim_padded[4];
  b_dim_padded[0] = 1;
  b_dim_padded[1] = y_dim_padded[1];
  b_dim_padded[2] = 1;
  b_dim_padded[3] = 1;

  int64_t x_stride_padded[4];
  int64_t y_stride_padded[4];
  int64_t w_stride_padded[4];
  int64_t b_stride_padded[4];
  int64_t threshold_stride[4];

  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);

  return masked_convbias_descriptors(cudnn_frontend::TensorBuilder()
                                         .setDim(4, x_dim_padded)
                                         .setStrides(4, x_stride_padded)
                                         .setId('x')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('y')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, w_dim_padded)
                                         .setStrides(4, w_stride_padded)
                                         .setId('w')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, b_dim_padded)
                                         .setStrides(4, b_stride_padded)
                                         .setId('z')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, b_dim_padded)
                                         .setStrides(4, b_stride_padded)
                                         .setId('b')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setVirtual()
                                         .setId('A')  // after add
                                         .setAlignment(16)
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setVirtual()
                                         .setId('B')  // after bias
                                         .setAlignment(16)
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('C')  // after conv
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('i')
                                         .setAlignment(16)
                                         .setDataType(dataType)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('D')  // after optional add
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('E')  // after act for masked
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_FLOAT)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('I')  // output of the gen index operation
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_INT32)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('m')  // top half of the mask created after the less than
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_BOOLEAN)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('n')  // bottom half of the mask
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_BOOLEAN)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, y_dim_padded)
                                         .setStrides(4, y_stride_padded)
                                         .setId('M')  // OR of the top and bottom masks
                                         .setAlignment(16)
                                         .setVirtual()
                                         .setDataType(CUDNN_DATA_BOOLEAN)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, threshold_dim)
                                         .setStrides(4, threshold_stride)
                                         .setId('t')  // threshold for creating the top mask
                                         .setAlignment(16)
                                         .setDataType(CUDNN_DATA_INT32)
                                         .build(),
                                     cudnn_frontend::TensorBuilder()
                                         .setDim(4, threshold_dim)
                                         .setStrides(4, threshold_stride)
                                         .setId('u')  // threshold for creating the bottom mask
                                         .setAlignment(16)
                                         .setDataType(CUDNN_DATA_INT32)
                                         .build());
}

// tensor descriptors used for dgrad
enum {
  X_OR_DX_TENSOR,
  DY_TENSOR,
  W_OR_DW_TENSOR,
  SCALE_TENSOR,
  RELU_TENSOR,
  AFTER_DCONV_TENSOR,
  AFTER_DRELU_TENSOR,
  DGRAD_INPUT_TENSOR,
  DGRAD_OPTIONAL_TENSOR,
  DGRAD_GEN_INDEX_TENSOR,
  DGRAD_MASK_TOP_TENSOR,
  DGRAD_MASK_BOTTOM_TENSOR,
  DGRAD_MASK_TENSOR,
  DGRAD_THRESHOLD_TOP_TENSOR,
  DGRAD_THRESHOLD_BOTTOM_TENSOR,
};

using dconv_add_descriptors = std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                         cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                         cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor>;

dconv_add_descriptors create_dconv_add_descriptors(int64_t* x_dim_padded, int64_t* padA, int64_t* convstrideA,
                                                   int64_t* dilationA, int64_t* w_dim_padded, int64_t* y_dim_padded,
                                                   cudnnDataType_t dataType) {
  const int convDim = 2;

  int64_t b_dim_padded[4];
  b_dim_padded[0] = 1;
  b_dim_padded[1] = x_dim_padded[1];
  b_dim_padded[2] = 1;
  b_dim_padded[3] = 1;

  int64_t x_stride_padded[4];
  int64_t y_stride_padded[4];
  int64_t w_stride_padded[4];
  int64_t b_stride_padded[4];

  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);

  return dconv_add_descriptors(cudnn_frontend::TensorBuilder()
                                   .setDim(4, x_dim_padded)
                                   .setStrides(4, x_stride_padded)
                                   .setId('x')
                                   .setAlignment(16)
                                   .setDataType(dataType)
                                   .build(),
                               cudnn_frontend::TensorBuilder()
                                   .setDim(4, y_dim_padded)
                                   .setStrides(4, y_stride_padded)
                                   .setId('y')
                                   .setAlignment(16)
                                   .setDataType(dataType)
                                   .build(),
                               cudnn_frontend::TensorBuilder()
                                   .setDim(4, w_dim_padded)
                                   .setStrides(4, w_stride_padded)
                                   .setId('w')
                                   .setAlignment(16)
                                   .setDataType(dataType)
                                   .build(),
                               cudnn_frontend::TensorBuilder()
                                   .setDim(4, b_dim_padded)
                                   .setStrides(4, b_stride_padded)
                                   .setId('s')
                                   .setAlignment(16)
                                   .setDataType(dataType)
                                   .build(),
                               cudnn_frontend::TensorBuilder()
                                   .setDim(4, x_dim_padded)
                                   .setStrides(4, x_stride_padded)
                                   .setId('r')
                                   .setAlignment(16)
                                   .setDataType(dataType)
                                   .build(),
                               cudnn_frontend::TensorBuilder()
                                   .setDim(4, x_dim_padded)
                                   .setStrides(4, x_stride_padded)
                                   .setVirtual()
                                   .setId('A')  // after dconv
                                   .setAlignment(16)
                                   .setDataType(CUDNN_DATA_FLOAT)
                                   .build(),
                               cudnn_frontend::TensorBuilder()
                                   .setDim(4, x_dim_padded)
                                   .setStrides(4, x_stride_padded)
                                   .setVirtual()
                                   .setId('B')  // after drelu
                                   .setAlignment(16)
                                   .setDataType(CUDNN_DATA_FLOAT)
                                   .build(),
                               cudnn_frontend::TensorBuilder()
                                   .setDim(4, y_dim_padded)
                                   .setStrides(4, y_stride_padded)
                                   .setId('i')
                                   .setAlignment(16)
                                   .setDataType(dataType)
                                   .build(),
                               cudnn_frontend::TensorBuilder()
                                   .setDim(4, y_dim_padded)
                                   .setStrides(4, y_stride_padded)
                                   .setId('D')  // after optional add
                                   .setAlignment(16)
                                   .setVirtual()
                                   .setDataType(CUDNN_DATA_FLOAT)
                                   .build());
}

using dconv_mask_descriptors =
    std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
               cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor>;

dconv_mask_descriptors create_dconv_mask_descriptors(int64_t* x_dim_padded, int64_t* padA, int64_t* convstrideA,
                                                     int64_t* dilationA, int64_t* w_dim_padded, int64_t* y_dim_padded,
                                                     int64_t* threshold_dim, cudnnDataType_t dataType) {
  const int convDim = 2;

  int64_t b_dim_padded[4];
  b_dim_padded[0] = 1;
  b_dim_padded[1] = x_dim_padded[1];
  b_dim_padded[2] = 1;
  b_dim_padded[3] = 1;

  int64_t x_stride_padded[4];
  int64_t y_stride_padded[4];
  int64_t w_stride_padded[4];
  int64_t b_stride_padded[4];
  int64_t threshold_stride[4];

  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);

  return dconv_mask_descriptors(cudnn_frontend::TensorBuilder()
                                    .setDim(4, x_dim_padded)
                                    .setStrides(4, x_stride_padded)
                                    .setId('x')
                                    .setAlignment(16)
                                    .setDataType(dataType)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, y_dim_padded)
                                    .setStrides(4, y_stride_padded)
                                    .setId('y')
                                    .setAlignment(16)
                                    .setDataType(dataType)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, w_dim_padded)
                                    .setStrides(4, w_stride_padded)
                                    .setId('w')
                                    .setAlignment(16)
                                    .setDataType(dataType)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, b_dim_padded)
                                    .setStrides(4, b_stride_padded)
                                    .setId('s')
                                    .setAlignment(16)
                                    .setDataType(dataType)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, x_dim_padded)
                                    .setStrides(4, x_stride_padded)
                                    .setId('r')
                                    .setAlignment(16)
                                    .setDataType(dataType)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, x_dim_padded)
                                    .setStrides(4, x_stride_padded)
                                    .setVirtual()
                                    .setId('A')  // after dconv
                                    .setAlignment(16)
                                    .setDataType(CUDNN_DATA_FLOAT)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, x_dim_padded)
                                    .setStrides(4, x_stride_padded)
                                    .setVirtual()
                                    .setId('B')  // after drelu
                                    .setAlignment(16)
                                    .setDataType(CUDNN_DATA_FLOAT)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, y_dim_padded)
                                    .setStrides(4, y_stride_padded)
                                    .setId('i')
                                    .setAlignment(16)
                                    .setDataType(dataType)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, y_dim_padded)
                                    .setStrides(4, y_stride_padded)
                                    .setId('D')  // after optional add
                                    .setAlignment(16)
                                    .setVirtual()
                                    .setDataType(CUDNN_DATA_FLOAT)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, y_dim_padded)
                                    .setStrides(4, y_stride_padded)
                                    .setId('I')  // output of the gen index operation
                                    .setAlignment(16)
                                    .setVirtual()
                                    .setDataType(CUDNN_DATA_INT32)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, y_dim_padded)
                                    .setStrides(4, y_stride_padded)
                                    .setId('m')  // top half of the mask created after the less than
                                    .setAlignment(16)
                                    .setVirtual()
                                    .setDataType(CUDNN_DATA_BOOLEAN)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, y_dim_padded)
                                    .setStrides(4, y_stride_padded)
                                    .setId('n')  // bottom half of the mask
                                    .setAlignment(16)
                                    .setVirtual()
                                    .setDataType(CUDNN_DATA_BOOLEAN)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, y_dim_padded)
                                    .setStrides(4, y_stride_padded)
                                    .setId('M')  // OR of the top and bottom masks
                                    .setAlignment(16)
                                    .setVirtual()
                                    .setDataType(CUDNN_DATA_BOOLEAN)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, threshold_dim)
                                    .setStrides(4, threshold_stride)
                                    .setId('t')  // threshold for creating the top mask
                                    .setAlignment(16)
                                    .setDataType(CUDNN_DATA_INT32)
                                    .build(),
                                cudnn_frontend::TensorBuilder()
                                    .setDim(4, threshold_dim)
                                    .setStrides(4, threshold_stride)
                                    .setId('u')  // threshold for creating the bottom mask
                                    .setAlignment(16)
                                    .setDataType(CUDNN_DATA_INT32)
                                    .build());
}

void run_conv_add_scale_bias_activation(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride, int64_t* dilation,
                                        int64_t* w_dim_padded, int64_t* y_dim_padded, cudnnDataType_t dataType,
                                        at::Half* devPtrX, at::Half* devPtrW, at::Half* devPtrY, at::Half* devPtrZ,
                                        at::Half* devPtrB, at::Half* devPtrI) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(x_dim_padded, pad, convstride, dilation,
                                                                               w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTEROPT_TENSOR>(tensors).describe());

    // Define the add operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the bias operation
    auto biasDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // optional add
    auto addDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    // Define the activation operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_FWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(std::get<X_TENSOR>(tensors))
                       .setwDesc(std::get<W_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // create an add node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(conv_op.getOutputTensor())
                      .setbDesc(std::get<OPTIONAL>(tensors))
                      .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // Create a Add Node with scaling parameters.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(add_op.getOutputTensor())
                        .setbDesc(std::get<Z_TENSOR>(tensors))
                        .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Bias Node.
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(scale_op.getOutputTensor())
                       .setbDesc(std::get<B_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an Activation Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(bias_op.getOutputTensor())
                      .setyDesc(std::get<Y_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 5> ops = {&conv_op, &add_op, &scale_op, &bias_op, &act_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
    int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(6, data_ptrs)
                           .setUids(6, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_conv_scale_bias_add_activation_mask(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride,
                                             int64_t* dilation, int64_t* w_dim_padded, int64_t* y_dim_padded,
                                             int64_t* threshold_dim, cudnnDataType_t dataType, at::Half* devPtrX,
                                             at::Half* devPtrW, at::Half* devPtrY, at::Half* devPtrZ, at::Half* devPtrB,
                                             at::Half* devPtrI, int* devPtrT, int* devPtrU, int axis) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    masked_convbias_descriptors tensors = create_conv_bias_add_act_mask_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, threshold_dim, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERACT_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<GEN_INDEX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<MASK_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<MASK_BOTTOM_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<MASK_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<THRESHOLD_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<THRESHOLD_BOTTOM_TENSOR>(tensors).describe());

    // Define the add operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the bias operation
    auto biasDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // optional add
    auto addDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    // Define the activation operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_FWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the genIndex descriptor
    auto genIndexDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_GEN_INDEX)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .setAxis(axis)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, genIndexDesc.describe());

    // Define the lessThan descriptor
    auto lessThanDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_CMP_LT)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, lessThanDesc.describe());

    // Define the greaterThan descriptor
    auto greaterThanDesc = cudnn_frontend::PointWiseDescBuilder()
                               .setMode(CUDNN_POINTWISE_CMP_GT)
                               .setMathPrecision(CUDNN_DATA_FLOAT)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThanDesc.describe());

    // Define the logical_or descriptor
    auto logicalOrDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_LOGICAL_OR)
                             .setMathPrecision(CUDNN_DATA_BOOLEAN)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOrDesc.describe());

    // Define the binary_selection descriptor
    auto selectionDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_BINARY_SELECT)
                             .setMathPrecision(CUDNN_DATA_FLOAT)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, selectionDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(std::get<X_TENSOR>(tensors))
                       .setwDesc(std::get<W_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create a Add Node with scaling parameters.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(conv_op.getOutputTensor())
                        .setbDesc(std::get<Z_TENSOR>(tensors))
                        .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Bias Node.
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(scale_op.getOutputTensor())
                       .setbDesc(std::get<B_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create a optional add Node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(bias_op.getOutputTensor())
                      .setbDesc(std::get<OPTIONAL>(tensors))
                      .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // Create an Activation Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
                      .setyDesc(std::get<AFTERACT_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create a Gen_Index Node.
    auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<AFTERACT_TENSOR>(tensors))
                           .setyDesc(std::get<GEN_INDEX_TENSOR>(tensors))
                           .setpwDesc(genIndexDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());

    // Create a LessThan Node.
    auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
                           .setbDesc(std::get<THRESHOLD_TOP_TENSOR>(tensors))
                           .setyDesc(std::get<MASK_TOP_TENSOR>(tensors))
                           .setpwDesc(lessThanDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());

    // Create a GreaterThan Node.
    auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                              .setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
                              .setbDesc(std::get<THRESHOLD_BOTTOM_TENSOR>(tensors))
                              .setyDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
                              .setpwDesc(greaterThanDesc)
                              .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());

    // Create a LogicalOr Node.
    auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<MASK_TOP_TENSOR>(tensors))
                            .setbDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
                            .setyDesc(std::get<MASK_TENSOR>(tensors))
                            .setpwDesc(logicalOrDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());

    // Create a Binary_Selection Node.
    auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<AFTERCONV_TENSOR>(tensors))
                            .setbDesc(std::get<AFTERACT_TENSOR>(tensors))
                            .settDesc(std::get<MASK_TENSOR>(tensors))
                            .setyDesc(std::get<Y_TENSOR>(tensors))
                            .setpwDesc(selectionDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, selection_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    if (devPtrI) {
      std::array<cudnn_frontend::Operation const*, 10> ops = {
          &conv_op,     &scale_op,    &bias_op,        &add_op,       &act_op,
          &genIndex_op, &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};

      auto opGraph =
          cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

      // Create string encoding for plan caching
      auto cache_string =
          getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
      DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

      auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
      DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

      auto workspace_size = plan.getWorkspaceSize();
      DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

      void* workspace_ptr = nullptr;
      auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
      if (workspace_size > 0) {
        workspace_ptr = workspace_tensor.data_ptr<float>();
      }
      void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI, devPtrT, devPtrU};
      int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i', 't', 'u'};
      auto variantPack = cudnn_frontend::VariantPackBuilder()
                             .setWorkspacePointer(workspace_ptr)
                             .setDataPointers(8, data_ptrs)
                             .setUids(8, uids)
                             .build();
      DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
      cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
      checkCudnnErr(status);
      cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
    } else {
      std::array<cudnn_frontend::Operation const*, 9> ops = {&conv_op,        &scale_op,     &bias_op,
                                                             &act_op,         &genIndex_op,  &lessThan_op,
                                                             &greaterThan_op, &logicalOr_op, &selection_op};

      auto opGraph =
          cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

      // Create string encoding for plan caching
      auto cache_string =
          getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
      DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

      auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
      DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

      auto workspace_size = plan.getWorkspaceSize();
      DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

      void* workspace_ptr = nullptr;
      auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
      if (workspace_size > 0) {
        workspace_ptr = workspace_tensor.data_ptr<float>();
      }
      void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrT, devPtrU};
      int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 't', 'u'};
      auto variantPack = cudnn_frontend::VariantPackBuilder()
                             .setWorkspacePointer(workspace_ptr)
                             .setDataPointers(7, data_ptrs)
                             .setUids(7, uids)
                             .build();
      DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
      cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
      checkCudnnErr(status);
      cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
    }
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_dconv_add_drelu_dscale(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride, int64_t* dilation,
                                int64_t* w_dim_padded, int64_t* y_dim_padded, cudnnDataType_t dataType,
                                at::Half* devPtrX, at::Half* devPtrW, at::Half* devPtrY, at::Half* devPtrZ,
                                at::Half* devPtrR, at::Half* devPtrI) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_add_descriptors tensors =
        create_dconv_add_descriptors(x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_INPUT_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // optional add
    auto addDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the scale backward operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create add Node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setbDesc(std::get<DGRAD_INPUT_TENSOR>(tensors))
                      .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create an relu backward Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                      .setxDesc(std::get<RELU_TENSOR>(tensors))
                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create a Scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
                        .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &add_op, &act_op, &scale_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR, devPtrI};
    int64_t uids[] = {'x', 'y', 'w', 's', 'r', 'i'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(6, data_ptrs)
                           .setUids(6, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_dconv_drelu_dscale_mask(int64_t* x_dim_padded, int64_t* pad, int64_t* convstride, int64_t* dilation,
                                 int64_t* w_dim_padded, int64_t* y_dim_padded, int64_t* threshold_dim,
                                 cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW, at::Half* devPtrY,
                                 at::Half* devPtrZ, at::Half* devPtrR, int* devPtrT, int* devPtrU, int axis) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_mask_descriptors tensors = create_dconv_mask_descriptors(x_dim_padded, pad, convstride, dilation,
                                                                   w_dim_padded, y_dim_padded, threshold_dim, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_GEN_INDEX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_THRESHOLD_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_THRESHOLD_BOTTOM_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the scale backward operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the genIndex descriptor
    auto genIndexDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_GEN_INDEX)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .setAxis(axis)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, genIndexDesc.describe());

    // Define the lessThan descriptor
    auto lessThanDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_CMP_LT)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, lessThanDesc.describe());

    // Define the greaterThan descriptor
    auto greaterThanDesc = cudnn_frontend::PointWiseDescBuilder()
                               .setMode(CUDNN_POINTWISE_CMP_GT)
                               .setMathPrecision(CUDNN_DATA_FLOAT)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThanDesc.describe());

    // Define the logical_or descriptor
    auto logicalOrDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_LOGICAL_OR)
                             .setMathPrecision(CUDNN_DATA_BOOLEAN)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOrDesc.describe());

    // Define the binary_selection descriptor
    auto selectionDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_BINARY_SELECT)
                             .setMathPrecision(CUDNN_DATA_FLOAT)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, selectionDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create an relu backward Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setxDesc(std::get<RELU_TENSOR>(tensors))
                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create a Scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
                        .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Gen_Index Node.
    auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                           .setyDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                           .setpwDesc(genIndexDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());

    // Create a LessThan Node.
    auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                           .setbDesc(std::get<DGRAD_THRESHOLD_TOP_TENSOR>(tensors))
                           .setyDesc(std::get<DGRAD_MASK_TOP_TENSOR>(tensors))
                           .setpwDesc(lessThanDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());

    // Create a GreaterThan Node.
    auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                              .setxDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                              .setbDesc(std::get<DGRAD_THRESHOLD_BOTTOM_TENSOR>(tensors))
                              .setyDesc(std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors))
                              .setpwDesc(greaterThanDesc)
                              .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());

    // Create a LogicalOr Node.
    auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<DGRAD_MASK_TOP_TENSOR>(tensors))
                            .setbDesc(std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors))
                            .setyDesc(std::get<DGRAD_MASK_TENSOR>(tensors))
                            .setpwDesc(logicalOrDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());

    // Create a Binary_Selection Node.
    auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                            .setbDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                            .settDesc(std::get<DGRAD_MASK_TENSOR>(tensors))
                            .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                            .setpwDesc(selectionDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, selection_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 8> ops = {&conv_op,     &act_op,         &scale_op,     &genIndex_op,
                                                           &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR, devPtrT, devPtrU};
    int64_t uids[] = {'x', 'y', 'w', 's', 'r', 't', 'u'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(7, data_ptrs)
                           .setUids(7, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

struct bottleneck_forward_status {
  int64_t dimA[4];
  int64_t filterdimA1[4];
  int64_t filterdimA2[4];
  int64_t filterdimA2hh[4];
  int64_t filterdimA3[4];
  int64_t filterdimA4[4];

  int64_t threshdim[4];

  int axis[4];

  int64_t outdimA0[4];
  int64_t outdimA1[4];
  int64_t outdimA1b[4];  // out1_pad
  int64_t outdimA2[4];
  int64_t outdimA3[4];
  int64_t outdimA4[4];

  int64_t padA[2];
  int64_t padA1[2];
  int64_t padA2[2];  // halo padding
  int64_t dilationA[2];
  int64_t convstrideA[2];
  int64_t convstride1X1[2];

  int64_t outdim0[4];  // halo input shape
  int64_t outdim1[4];
  int64_t outdim1b[4];
  int64_t outdim2[4];
  int64_t outdim3[4];
  int64_t outdim4[4];  // halo output shape

  void init(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
    dimA[0] = dimA[1] = dimA[2] = dimA[3] = 0;
    filterdimA1[0] = filterdimA1[1] = filterdimA1[2] = filterdimA1[3] = 0;
    filterdimA2[0] = filterdimA2[1] = filterdimA2[2] = filterdimA2[3] = 0;
    filterdimA2hh[0] = filterdimA2hh[1] = filterdimA2hh[2] = filterdimA2hh[3] = 0;
    filterdimA3[0] = filterdimA3[1] = filterdimA3[2] = filterdimA3[3] = 0;
    filterdimA4[0] = filterdimA4[1] = filterdimA4[2] = filterdimA4[3] = 0;
    threshdim[0] = threshdim[1] = threshdim[2] = threshdim[3] = 1;

    // All dim calculation after this order of n,c,h,w
    if (explicit_nhwc) {
      axis[0] = 0;
      axis[1] = 3;
      axis[2] = 1;
      axis[3] = 2;
    } else {
      axis[0] = 0;
      axis[1] = 1;
      axis[2] = 2;
      axis[3] = 3;
    }

    for (int dim = 0; dim < 4; dim++) {
      dimA[dim] = inputs[0].size(axis[dim]);
      filterdimA1[dim] = inputs[1].size(axis[dim]);
      filterdimA2[dim] = inputs[2].size(axis[dim]);
      filterdimA3[dim] = inputs[3].size(axis[dim]);
    }
    if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
      for (int dim = 0; dim < 4; dim++) {
        filterdimA4[dim] = inputs[10].size(axis[dim]);
      }
    }
    for (int dim = 0; dim < 4; dim++) {
      if (dim == 2) {
        filterdimA2hh[dim] = 1;
      } else {
        filterdimA2hh[dim] = filterdimA2[dim];
      }
    }

    // output dim in n,c,h,w used by backend
    outdimA0[0] = outdimA0[1] = outdimA0[2] = outdimA0[3] = 0;
    outdimA1[0] = outdimA1[1] = outdimA1[2] = outdimA1[3] = 0;
    outdimA1b[0] = outdimA1b[1] = outdimA1b[2] = outdimA1b[3] = 0;
    outdimA2[0] = outdimA2[1] = outdimA2[2] = outdimA2[3] = 0;
    outdimA3[0] = outdimA3[1] = outdimA3[2] = outdimA3[3] = 0;
    outdimA4[0] = outdimA4[1] = outdimA4[2] = outdimA4[3] = 0;

    // use these fixed value for test run
    padA[0] = 0;
    padA[1] = 0;
    padA1[0] = 1;
    padA1[1] = 1;
    padA2[0] = 0;
    padA2[1] = 1;
    dilationA[0] = 1;
    dilationA[1] = 1;
    convstrideA[0] = 1;
    convstrideA[1] = 1;
    convstride1X1[0] = stride_1X1;
    convstride1X1[1] = stride_1X1;

    // compute output from pad/stride/dilation
    outdimA1[0] = dimA[0];
    outdimA1[1] = filterdimA1[0];
    for (int dim = 0; dim < 2; dim++) {
      outdimA1[dim + 2] =
          getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
    }
    for (int dim = 0; dim < 4; dim++) {
      if (dim == 2) {
        outdimA1b[dim] = outdimA1[dim] + 2;
      } else {
        outdimA1b[dim] = outdimA1[dim];
      }
    }

    outdimA2[0] = outdimA1[0];
    outdimA2[1] = filterdimA2[0];
    for (int dim = 0; dim < 2; dim++) {
      outdimA2[dim + 2] =
          getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
    }

    for (int dim = 0; dim < 4; dim++) {
      if (dim == 2) {
        outdimA0[dim] = 3;
        outdimA4[dim] = 1;
      } else {
        outdimA0[dim] = outdimA1[dim];
        outdimA4[dim] = outdimA2[dim];
      }
    }

    outdimA3[0] = outdimA2[0];
    outdimA3[1] = filterdimA3[0];
    for (int dim = 0; dim < 2; dim++) {
      outdimA3[dim + 2] =
          getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
    }

    // Create output tensor in the correct shape in pytorch's view
    outdim1[0] = outdim1[1] = outdim1[2] = outdim1[3] = 0;
    outdim1b[0] = outdim1b[1] = outdim1b[2] = outdim1b[3] = 0;
    outdim2[0] = outdim2[1] = outdim2[2] = outdim2[3] = 0;
    outdim3[0] = outdim3[1] = outdim3[2] = outdim3[3] = 0;
    if (explicit_nhwc) {
      axis[0] = 0;
      axis[1] = 2;
      axis[2] = 3;
      axis[3] = 1;
    }
    for (int dim = 0; dim < 4; dim++) {
      outdim0[dim] = outdimA0[axis[dim]];
      outdim1[dim] = outdimA1[axis[dim]];
      outdim1b[dim] = outdimA1b[axis[dim]];
      outdim2[dim] = outdimA2[axis[dim]];
      outdim3[dim] = outdimA3[axis[dim]];
      outdim4[dim] = outdimA4[axis[dim]];
    }
  }
};

bottleneck_forward_status forward_state;

}  // end of anonymous namespace

std::vector<at::Tensor> bottleneck_forward_init(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
  // NB! Bottleneck_forward and bottleneck_backward are NOT thread safe method.
  // NB! We use a global object to store state.
  forward_state.init(explicit_nhwc, stride_1X1, inputs);

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // printf("outdim1 =
  // (%d,%d,%d,%d)\n",forward_state.outdim1[0],forward_state.outdim1[1],forward_state.outdim1[2],forward_state.outdim1[3]);
  auto out1 = at::empty(forward_state.outdim1, inputs[0].type(), output_format);
  auto out2 = at::empty(forward_state.outdim2, inputs[0].type(), output_format);
  auto out3 = at::empty(forward_state.outdim3, inputs[0].type(), output_format);

  outputs.push_back(out1);
  outputs.push_back(out2);
  outputs.push_back(out3);

  return outputs;
}

// inputs contains x,w,z,b,(i)
void bottleneck_forward_out1(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                             std::vector<at::Tensor> outputs) {
  std::cout << std::fixed;

  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();
  at::Half* b = inputs[7].data_ptr<at::Half>();
  auto out1 = outputs[0];
  at::Half* y1 = out1.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(forward_state.dimA, forward_state.padA, forward_state.convstride1X1,
                                     forward_state.dilationA, forward_state.filterdimA1, forward_state.outdimA1,
                                     CUDNN_DATA_HALF, x, w, y1, z, b, nullptr);

  DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item<float>());
}

// computes halo (top or bottom) from fat halo input.
// fat halo input is 3 pixels wide in H.
at::Tensor bottleneck_forward_out2_halo(bool explicit_nhwc, at::Tensor fat_halo_y1, std::vector<at::Tensor> inputs) {
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // run
  at::Half* w = inputs[2].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();
  at::Half* b = inputs[8].data_ptr<at::Half>();

  at::Half* y1 = fat_halo_y1.data_ptr<at::Half>();

  auto halo_y2 = at::empty(forward_state.outdim4, inputs[0].type(), output_format);
  at::Half* y2 = halo_y2.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(forward_state.outdimA0, forward_state.padA2, forward_state.convstrideA,
                                     forward_state.dilationA, forward_state.filterdimA2, forward_state.outdimA4,
                                     CUDNN_DATA_HALF, y1, w, y2, z, b, nullptr);

  return halo_y2;
}

// compute halo correction term (top or bottom) from slim halo input (N,C,1,W).
// slim halo input is 1 pixel wide in H.
at::Tensor bottleneck_forward_out2_halo_corr(bool explicit_nhwc, at::Tensor slim_halo_y1,
                                             std::vector<at::Tensor> inputs, at::Tensor w1by3,
                                             at::Tensor out2_part_halo) {
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // run
  at::Half* w = w1by3.data_ptr<at::Half>();  // C,C,1,3
  at::Half* z = inputs[5].data_ptr<at::Half>();
  at::Half* b = inputs[8].data_ptr<at::Half>();

  at::Half* y1 = slim_halo_y1.data_ptr<at::Half>();

  at::Half* prev_out2 = out2_part_halo.data_ptr<at::Half>();

  auto halo_y2 = at::empty(forward_state.outdim4, inputs[0].type(), output_format);
  at::Half* y2 = halo_y2.data_ptr<at::Half>();

  run_conv_add_scale_bias_activation(forward_state.outdimA4, forward_state.padA2, forward_state.convstrideA,
                                     forward_state.dilationA, forward_state.filterdimA2hh, forward_state.outdimA4,
                                     CUDNN_DATA_HALF, y1, w, y2, z, b, prev_out2);

  return halo_y2;
}

void bottleneck_forward_out2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                             std::vector<at::Tensor> outputs) {
  std::cout << std::fixed;

  // from _out1 method
  at::Half* x = inputs[0].data_ptr<at::Half>();
  auto out1 = outputs[0];
  at::Half* y1 = out1.data_ptr<at::Half>();

  // run
  at::Half* w = inputs[2].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();
  at::Half* b = inputs[8].data_ptr<at::Half>();
  auto out2 = outputs[1];
  at::Half* y2 = out2.data_ptr<at::Half>();

  // printf("forward_state.outdimA1 =
  // {%d,%d,%d,%d}\n",forward_state.outdimA1[0],forward_state.outdimA1[1],forward_state.outdimA1[2],forward_state.outdimA1[3]);
  // printf("forward_state.padA1 = {%d,%d}\n",forward_state.padA1[0],forward_state.padA1[1]);
  // printf("forward_state.convstrideA = {%d,%d}\n",forward_state.convstrideA[0],forward_state.convstrideA[1]);
  // printf("forward_state.dilationA = {%d,%d}\n",forward_state.dilationA[0],forward_state.dilationA[1]);
  // printf("forward_state.filterdimA2 =
  // {%d,%d,%d,%d}\n",forward_state.filterdimA2[0],forward_state.filterdimA2[1],forward_state.filterdimA2[2],forward_state.filterdimA2[3]);
  // printf("forward_state.outdimA2 =
  // {%d,%d,%d,%d}\n",forward_state.outdimA2[0],forward_state.outdimA2[1],forward_state.outdimA2[2],forward_state.outdimA2[3]);
  run_conv_scale_bias_add_activation(forward_state.outdimA1, forward_state.padA1, forward_state.convstrideA,
                                     forward_state.dilationA, forward_state.filterdimA2, forward_state.outdimA2,
                                     CUDNN_DATA_HALF, y1, w, y2, z, b, nullptr);
  DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item<float>());
}

void bottleneck_forward_out2_mask(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                  std::vector<at::Tensor> outputs, at::Tensor thresholdTop,
                                  at::Tensor thresholdBottom) {
  std::cout << std::fixed;

  // from _out1 method
  at::Half* x = inputs[0].data_ptr<at::Half>();
  auto out1 = outputs[0];
  at::Half* y1 = out1.data_ptr<at::Half>();

  // run
  at::Half* w = inputs[2].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();
  at::Half* b = inputs[8].data_ptr<at::Half>();
  auto out2 = outputs[1];
  at::Half* y2 = out2.data_ptr<at::Half>();

  // printf("forward_state.outdimA1 =
  // {%d,%d,%d,%d}\n",forward_state.outdimA1[0],forward_state.outdimA1[1],forward_state.outdimA1[2],forward_state.outdimA1[3]);
  // printf("forward_state.padA1 = {%d,%d}\n",forward_state.padA1[0],forward_state.padA1[1]);
  // printf("forward_state.convstrideA = {%d,%d}\n",forward_state.convstrideA[0],forward_state.convstrideA[1]);
  // printf("forward_state.dilationA = {%d,%d}\n",forward_state.dilationA[0],forward_state.dilationA[1]);
  // printf("forward_state.filterdimA2 =
  // {%d,%d,%d,%d}\n",forward_state.filterdimA2[0],forward_state.filterdimA2[1],forward_state.filterdimA2[2],forward_state.filterdimA2[3]);
  // printf("forward_state.outdimA2 =
  // {%d,%d,%d,%d}\n",forward_state.outdimA2[0],forward_state.outdimA2[1],forward_state.outdimA2[2],forward_state.outdimA2[3]);
  run_conv_scale_bias_add_activation_mask(forward_state.outdimA1, forward_state.padA1, forward_state.convstrideA,
                                          forward_state.dilationA, forward_state.filterdimA2, forward_state.outdimA2,
                                          forward_state.threshdim, CUDNN_DATA_HALF, y1, w, y2, z, b, nullptr,
                                          thresholdTop.data_ptr<int>(), thresholdBottom.data_ptr<int>(),
                                          2);  // axis == 1 -> Does this assume explicit NHWC?
  DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item<float>());
}

void bottleneck_forward_out2_pad(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                 std::vector<at::Tensor> outputs, at::Tensor out1_pad) {
  std::cout << std::fixed;

  // from _out1 method
  at::Half* x = inputs[0].data_ptr<at::Half>();
  auto out1 = outputs[0];
  at::Half* y1 = out1_pad.data_ptr<at::Half>();

  // run
  at::Half* w = inputs[2].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();
  at::Half* b = inputs[8].data_ptr<at::Half>();
  auto out2 = outputs[1];
  at::Half* y2 = out2.data_ptr<at::Half>();

  // printf("forward_state.outdimA1 =
  // {%d,%d,%d,%d}\n",forward_state.outdimA1[0],forward_state.outdimA1[1],forward_state.outdimA1[2],forward_state.outdimA1[3]);
  // printf("forward_state.padA1 = {%d,%d}\n",forward_state.padA1[0],forward_state.padA1[1]);
  // printf("forward_state.convstrideA = {%d,%d}\n",forward_state.convstrideA[0],forward_state.convstrideA[1]);
  // printf("forward_state.dilationA = {%d,%d}\n",forward_state.dilationA[0],forward_state.dilationA[1]);
  // printf("forward_state.filterdimA2 =
  // {%d,%d,%d,%d}\n",forward_state.filterdimA2[0],forward_state.filterdimA2[1],forward_state.filterdimA2[2],forward_state.filterdimA2[3]);
  // printf("forward_state.outdimA2 =
  // {%d,%d,%d,%d}\n",forward_state.outdimA2[0],forward_state.outdimA2[1],forward_state.outdimA2[2],forward_state.outdimA2[3]);
  run_conv_scale_bias_add_activation(forward_state.outdimA1b, forward_state.padA2, forward_state.convstrideA,
                                     forward_state.dilationA, forward_state.filterdimA2, forward_state.outdimA2,
                                     CUDNN_DATA_HALF, y1, w, y2, z, b, nullptr);
  DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item<float>());
}

void bottleneck_forward_rest(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                             std::vector<at::Tensor> outputs) {
  std::cout << std::fixed;

  // from _out1 method
  at::Half* x = inputs[0].data_ptr<at::Half>();

  // create output of conv3
  auto out3 = outputs[2];
  at::Half* y3 = out3.data_ptr<at::Half>();

  // create output of conv4 that may exist
  auto identity = at::empty_like(out3);
  at::Half* yi = identity.data_ptr<at::Half>();

  at::Half *w, *z, *b;

  if (stride_1X1 != 1 || forward_state.filterdimA3[0] != forward_state.dimA[1]) {
    w = inputs[10].data_ptr<at::Half>();
    z = inputs[11].data_ptr<at::Half>();
    b = inputs[12].data_ptr<at::Half>();
    run_conv_scale_bias(forward_state.dimA, forward_state.padA, forward_state.convstride1X1, forward_state.dilationA,
                        forward_state.filterdimA4, forward_state.outdimA3, CUDNN_DATA_HALF, x, w, yi, z, b);
    DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item<float>());
  } else {
    yi = x;
  }

  auto out2 = outputs[1];
  at::Half* y2 = out2.data_ptr<at::Half>();

  w = inputs[3].data_ptr<at::Half>();
  z = inputs[6].data_ptr<at::Half>();
  b = inputs[9].data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(forward_state.outdimA2, forward_state.padA, forward_state.convstrideA,
                                     forward_state.dilationA, forward_state.filterdimA3, forward_state.outdimA3,
                                     CUDNN_DATA_HALF, y2, w, y3, z, b, yi);
  DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item<float>());
}

namespace {

struct bottleneck_backward_state {
  int64_t dimA[4];
  int64_t filterdimA1[4];
  int64_t filterdimA2[4];
  int64_t filterdimA3[4];
  int64_t filterdimA4[4];
  int64_t filterdimA2hh[4];  // Cin,Cout,1,3
  int64_t threshdim[4];

  int axis[4];

  int64_t outdimA1[4];   // grad_out1
  int64_t outdimA1b[4];  // out1_pad
  int64_t outdimA2[4];   // grad_out2
  int64_t outdimA3[4];
  int64_t outdimA1h[4];   // output: grad_out1 halo (H=3)
  int64_t outdimA2h[4];   // input : grad_out2 halo cells (H=3)
  int64_t outdimA1hh[4];  // input: grad_out2 halo (H=1)
  int64_t outdimA2hh[4];  // input: out1 halo (H=1)

  int64_t padA[2];
  int64_t padA1[2];
  int64_t padA2[2];
  int64_t dilationA[2];
  int64_t convstrideA[2];
  int64_t convstride1X1[2];

  int64_t filterdim2hh[4];  // Cin,1,3,Cout

  int64_t outdim1[4];
  int64_t outdim1b[4];
  int64_t outdim2[4];
  int64_t outdim3[4];
  int64_t outdim1h[4];
  int64_t outdim1hh[4];

  void init(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
    // setup dimensions
    dimA[0] = dimA[1] = dimA[2] = dimA[3] = 0;
    filterdimA1[0] = filterdimA1[1] = filterdimA1[2] = filterdimA1[3] = 0;
    filterdimA2[0] = filterdimA2[1] = filterdimA2[2] = filterdimA2[3] = 0;
    filterdimA3[0] = filterdimA3[1] = filterdimA3[2] = filterdimA3[3] = 0;
    filterdimA4[0] = filterdimA4[1] = filterdimA4[2] = filterdimA4[3] = 0;
    filterdimA2hh[0] = filterdimA2hh[1] = filterdimA2hh[2] = filterdimA2hh[3] = 0;
    threshdim[0] = threshdim[1] = threshdim[2] = threshdim[3] = 1;

    // All dim calculation after this order of n,c,h,w
    if (explicit_nhwc) {
      axis[0] = 0;
      axis[1] = 3;
      axis[2] = 1;
      axis[3] = 2;
    } else {
      axis[0] = 0;
      axis[1] = 1;
      axis[2] = 2;
      axis[3] = 3;
    }

    for (int dim = 0; dim < 4; dim++) {
      dimA[dim] = inputs[0].size(axis[dim]);
      filterdimA1[dim] = inputs[1].size(axis[dim]);
      filterdimA2[dim] = inputs[2].size(axis[dim]);
      filterdimA3[dim] = inputs[3].size(axis[dim]);
    }
    if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
      for (int dim = 0; dim < 4; dim++) {
        filterdimA4[dim] = inputs[14].size(axis[dim]);
      }
    }

    for (int dim = 0; dim < 4; dim++) {
      if (dim == 2) {
        filterdimA2hh[dim] = 1;
      } else {
        filterdimA2hh[dim] = filterdimA2[dim];
      }
    }

    // output dim in n,c,h,w used by backend
    outdimA1[0] = outdimA1[1] = outdimA1[2] = outdimA1[3] = 0;
    outdimA1b[0] = outdimA1b[1] = outdimA1b[2] = outdimA1b[3] = 0;
    outdimA2[0] = outdimA2[1] = outdimA2[2] = outdimA2[3] = 0;
    outdimA3[0] = outdimA3[1] = outdimA3[2] = outdimA3[3] = 0;
    outdimA1h[0] = outdimA1h[1] = outdimA1h[2] = outdimA1h[3] = 0;
    outdimA2h[0] = outdimA2h[1] = outdimA2h[2] = outdimA2h[3] = 0;
    outdimA1hh[0] = outdimA1hh[1] = outdimA1hh[2] = outdimA1hh[3] = 0;
    outdimA2hh[0] = outdimA2hh[1] = outdimA2hh[2] = outdimA2hh[3] = 0;

    // use these fixed value for test run
    padA[0] = 0;
    padA[1] = 0;
    padA1[0] = 1;
    padA1[1] = 1;
    padA2[0] = 0;
    padA2[1] = 1;
    dilationA[0] = 1;
    dilationA[1] = 1;
    convstrideA[0] = 1;
    convstrideA[1] = 1;
    convstride1X1[0] = stride_1X1;
    convstride1X1[1] = stride_1X1;

    // compute output from pad/stride/dilation
    outdimA1[0] = dimA[0];
    outdimA1[1] = filterdimA1[0];
    for (int dim = 0; dim < 2; dim++) {
      outdimA1[dim + 2] =
          getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
    }
    for (int dim = 0; dim < 4; dim++) {
      if (dim == 2) {
        outdimA1b[dim] = outdimA1[dim] + 2;
      } else {
        outdimA1b[dim] = outdimA1[dim];
      }
    }

    outdimA2[0] = outdimA1[0];
    outdimA2[1] = filterdimA2[0];
    for (int dim = 0; dim < 2; dim++) {
      outdimA2[dim + 2] =
          getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
    }

    outdimA3[0] = outdimA2[0];
    outdimA3[1] = filterdimA3[0];
    for (int dim = 0; dim < 2; dim++) {
      outdimA3[dim + 2] =
          getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
    }

    for (int dim = 0; dim < 4; dim++) {
      if (dim == 2) {
        outdimA1h[dim] = 3;
        outdimA2h[dim] = 3;
        outdimA1hh[dim] = 1;
        outdimA2hh[dim] = 1;
      } else {
        outdimA1h[dim] = outdimA1[dim];
        outdimA2h[dim] = outdimA2[dim];
        outdimA1hh[dim] = outdimA1[dim];
        outdimA2hh[dim] = outdimA2[dim];
      }
    }

    // Create output tensor in the correct shape in pytorch's view
    outdim1[0] = outdim1[1] = outdim1[2] = outdim1[3] = 0;
    outdim1b[0] = outdim1b[1] = outdim1b[2] = outdim1b[3] = 0;
    outdim2[0] = outdim2[1] = outdim2[2] = outdim2[3] = 0;
    outdim3[0] = outdim3[1] = outdim3[2] = outdim3[3] = 0;
    outdim1h[0] = outdim1h[1] = outdim1h[2] = outdim1h[3] = 0;
    outdim1hh[0] = outdim1hh[1] = outdim1hh[2] = outdim1hh[3] = 0;
    filterdim2hh[0] = filterdim2hh[1] = filterdim2hh[2] = filterdim2hh[3] = 0;
    if (explicit_nhwc) {
      axis[0] = 0;
      axis[1] = 2;
      axis[2] = 3;
      axis[3] = 1;
    }
    for (int dim = 0; dim < 4; dim++) {
      outdim1[dim] = outdimA1[axis[dim]];
      outdim1b[dim] = outdimA1b[axis[dim]];
      outdim2[dim] = outdimA2[axis[dim]];
      outdim3[dim] = outdimA3[axis[dim]];
      outdim1h[dim] = outdimA1h[axis[dim]];
      outdim1hh[dim] = outdimA1hh[axis[dim]];
      filterdim2hh[dim] = filterdimA2hh[axis[dim]];
    }
  }
};

bottleneck_backward_state backward_state;

}  // namespace

std::vector<at::Tensor> bottleneck_backward_init(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
  std::cout << std::fixed;

  backward_state.init(explicit_nhwc, stride_1X1, inputs);

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  auto grad_x = at::empty_like(inputs[0]);
  auto wgrad1 = at::empty_like(inputs[1]);
  auto wgrad2 = at::empty_like(inputs[2]);
  auto wgrad3 = at::empty_like(inputs[3]);

  outputs.push_back(grad_x);
  outputs.push_back(wgrad1);
  outputs.push_back(wgrad2);
  outputs.push_back(wgrad3);
  if (stride_1X1 != 1 || backward_state.filterdimA3[0] != backward_state.dimA[1]) {
    auto wgrad4 = at::empty_like(inputs[14]);
    outputs.push_back(wgrad4);
  }

  return outputs;
}

void bottleneck_backward_wgrad3(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                std::vector<at::Tensor> outputs) {
  // dconv3+drelu2+dscale2
  at::Half* conv_in = inputs[13].data_ptr<at::Half>();
  at::Half* dy3 = inputs[10].data_ptr<at::Half>();

  // wgrad
  auto wgrad3 = outputs[3];
  at::Half* dw3 = wgrad3.data_ptr<at::Half>();
  run_dconv(backward_state.outdimA2, backward_state.padA, backward_state.convstrideA, backward_state.dilationA,
            backward_state.filterdimA3, backward_state.outdimA3, CUDNN_DATA_HALF, conv_in, dw3, dy3,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
  DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item<float>());
}

at::Tensor bottleneck_backward_grad_out2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                         std::vector<at::Tensor> outputs) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dconv3+drelu2+dscale2
  at::Half* conv_in = inputs[13].data_ptr<at::Half>();
  at::Half* dy3 = inputs[10].data_ptr<at::Half>();

  DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());

  // dgrad
  auto grad_out2 = at::empty(backward_state.outdim2, inputs[0].type(), output_format);
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();
  at::Half* w = inputs[3].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();

  at::Half* relu2 = inputs[13].data_ptr<at::Half>();

  run_dconv_drelu_dscale(backward_state.outdimA2, backward_state.padA, backward_state.convstrideA,
                         backward_state.dilationA, backward_state.filterdimA3, backward_state.outdimA3, CUDNN_DATA_HALF,
                         dy2, w, dy3, z, relu2);

  // do halo exchange of dy2 here

  DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item<float>());

  return grad_out2;
}

at::Tensor bottleneck_backward_grad_out1(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                         std::vector<at::Tensor> outputs, at::Tensor grad_out2) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dgrad
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();

  // dgrad
  auto grad_out1 = at::empty(backward_state.outdim1, inputs[0].type(), output_format);
  at::Half* dy1 = grad_out1.data_ptr<at::Half>();
  at::Half* w = inputs[2].data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();

  at::Half* relu1 = inputs[12].data_ptr<at::Half>();
  // printf("relu.shape = [%d,%d,%d,%d]\n",inputs[12].size(0),inputs[12].size(1),inputs[12].size(2),inputs[12].size(3));

  // fused dgrad
  // printf("backward_state.outdim1 =
  // {%d,%d,%d,%d}\n",backward_state.outdim1[0],backward_state.outdim1[1],backward_state.outdim1[2],backward_state.outdim1[3]);
  run_dconv_drelu_dscale(backward_state.outdimA1, backward_state.padA1, backward_state.convstrideA,
                         backward_state.dilationA, backward_state.filterdimA2, backward_state.outdimA2, CUDNN_DATA_HALF,
                         dy1, w, dy2, z, relu1);

  return grad_out1;
}

at::Tensor bottleneck_backward_grad_out1_mask(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                              std::vector<at::Tensor> outputs, at::Tensor grad_out2,
                                              at::Tensor thresholdTop, at::Tensor thresholdBottom) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dgrad
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();

  // dgrad
  auto grad_out1 = at::empty(backward_state.outdim1, inputs[0].type(), output_format);
  at::Half* dy1 = grad_out1.data_ptr<at::Half>();
  at::Half* w = inputs[2].data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();

  at::Half* relu1 = inputs[12].data_ptr<at::Half>();
  // printf("relu.shape = [%d,%d,%d,%d]\n",inputs[12].size(0),inputs[12].size(1),inputs[12].size(2),inputs[12].size(3));

  // fused dgrad
  run_dconv_drelu_dscale_mask(backward_state.outdimA1, backward_state.padA1, backward_state.convstrideA,
                              backward_state.dilationA, backward_state.filterdimA2, backward_state.outdimA2,
                              backward_state.threshdim, CUDNN_DATA_HALF, dy1, w, dy2, z, relu1,
                              thresholdTop.data_ptr<int>(), thresholdBottom.data_ptr<int>(), 2);

  return grad_out1;
}

// perform backward data 1x3 convolution (grad_out * w_rot180) on grad_out2 input of shape [N,1,W,C] with padding=(0,1)
// to produce output of shape [N,1,W,C]
at::Tensor bottleneck_backward_grad_out1_halo_corr(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                                   at::Tensor w1by3, std::vector<at::Tensor> outputs,
                                                   at::Tensor grad_out2_halo, at::Tensor relu1_halo,
                                                   at::Tensor part_grad_out1) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dgrad
  at::Half* dy2h = grad_out2_halo.data_ptr<at::Half>();

  // dgrad
  auto grad_out1_halo = at::empty(backward_state.outdim1hh, inputs[0].type(), output_format);
  at::Half* dy1h = grad_out1_halo.data_ptr<at::Half>();
  // at::Half* w = inputs[2].data_ptr<at::Half>();  // use w1by3 instead, which is a sliced version of inputs[2]
  at::Half* w = w1by3.data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();
  at::Half* relu1h = relu1_halo.data_ptr<at::Half>();
  at::Half* pdy1h = part_grad_out1.data_ptr<at::Half>();

  // printf("relu.shape = [%d,%d,%d,%d]\n",relu1_halo.size(0),relu1_halo.size(1),relu1_halo.size(2),relu1_halo.size(3));
  //  fused dgrad
  // printf("backward_state.outdimA1h =
  // {%d,%d,%d,%d}\n",backward_state.outdimA1h[0],backward_state.outdimA1h[1],backward_state.outdimA1h[2],backward_state.outdimA1h[3]);
  // printf("backward_state.outdimA2h =
  // {%d,%d,%d,%d}\n",backward_state.outdimA2h[0],backward_state.outdimA2h[1],backward_state.outdimA2h[2],backward_state.outdimA2h[3]);
  // printf("backward_state.filterdimA2 =
  // {%d,%d,%d,%d}\n",backward_state.filterdimA2[0],backward_state.filterdimA2[1],backward_state.filterdimA2[2],backward_state.filterdimA2[3]);
  run_dconv_add_drelu_dscale(backward_state.outdimA1hh,
                             backward_state.padA2,  // 0,1
                             backward_state.convstrideA, backward_state.dilationA,
                             backward_state.filterdimA2hh,  // C,1,3,C
                             backward_state.outdimA2hh, CUDNN_DATA_HALF, dy1h, w, dy2h, z, relu1h, pdy1h);

  return grad_out1_halo;
}

// perform backward data 3x3 convolution (grad_out * w_rot180) on grad_out2 input of shape [N,3,W,C] with padding=(1,1)
// to produce output of shape [N,3,W,C]
at::Tensor bottleneck_backward_grad_out1_halo(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                              std::vector<at::Tensor> outputs, at::Tensor grad_out2_halo,
                                              at::Tensor relu1_halo) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dgrad
  at::Half* dy2h = grad_out2_halo.data_ptr<at::Half>();

  // dgrad
  auto grad_out1_halo = at::empty(backward_state.outdim1h, inputs[0].type(), output_format);
  at::Half* dy1h = grad_out1_halo.data_ptr<at::Half>();
  at::Half* w = inputs[2].data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();

  at::Half* relu1h = relu1_halo.data_ptr<at::Half>();
  // printf("relu.shape = [%d,%d,%d,%d]\n",relu1_halo.size(0),relu1_halo.size(1),relu1_halo.size(2),relu1_halo.size(3));
  //  fused dgrad
  // printf("backward_state.outdimA1h =
  // {%d,%d,%d,%d}\n",backward_state.outdimA1h[0],backward_state.outdimA1h[1],backward_state.outdimA1h[2],backward_state.outdimA1h[3]);
  // printf("backward_state.outdimA2h =
  // {%d,%d,%d,%d}\n",backward_state.outdimA2h[0],backward_state.outdimA2h[1],backward_state.outdimA2h[2],backward_state.outdimA2h[3]);
  // printf("backward_state.filterdimA2 =
  // {%d,%d,%d,%d}\n",backward_state.filterdimA2[0],backward_state.filterdimA2[1],backward_state.filterdimA2[2],backward_state.filterdimA2[3]);
  run_dconv_drelu_dscale(backward_state.outdimA1h, backward_state.padA1, backward_state.convstrideA,
                         backward_state.dilationA, backward_state.filterdimA2, backward_state.outdimA2h,
                         CUDNN_DATA_HALF, dy1h, w, dy2h, z, relu1h);

  return grad_out1_halo;
}

void bottleneck_backward_wgrad2_pad(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                    std::vector<at::Tensor> outputs, at::Tensor input, at::Tensor grad_out2) {
  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dgrad
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();

  // dconv2+drelu1+dscale1
  at::Half* conv_in = input.data_ptr<at::Half>();

  // wgrad
  auto wgrad2 = outputs[2];
  at::Half* dw2 = wgrad2.data_ptr<at::Half>();

  // printf("outdimA1b =
  // (%d,%d,%d,%d)\n",backward_state.outdimA1b[0],backward_state.outdimA1b[1],backward_state.outdimA1b[2],backward_state.outdimA1b[3]);
  // printf("backward_state.padA2 = {%d,%d}\n",backward_state.padA2[0],backward_state.padA2[1]);
  run_dconv(backward_state.outdimA1b,  // conv_in.shape (including H halos)
            backward_state.padA2,      // 0, 1
            backward_state.convstrideA, backward_state.dilationA,
            backward_state.filterdimA2,  // dw2.shape
            backward_state.outdimA2,     // dy2.shape
            CUDNN_DATA_HALF, conv_in, dw2, dy2, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
  DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
}

void bottleneck_backward_wgrad2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                std::vector<at::Tensor> outputs, at::Tensor grad_out2) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dgrad
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();

  // dconv2+drelu1+dscale1
  at::Half* conv_in = inputs[12].data_ptr<at::Half>();

  // wgrad
  auto wgrad2 = outputs[2];
  at::Half* dw2 = wgrad2.data_ptr<at::Half>();

  // printf("outdimA1 =
  // (%d,%d,%d,%d)\n",backward_state.outdimA1[0],backward_state.outdimA1[1],backward_state.outdimA1[2],backward_state.outdimA1[3]);
  run_dconv(backward_state.outdimA1, backward_state.padA1, backward_state.convstrideA, backward_state.dilationA,
            backward_state.filterdimA2, backward_state.outdimA2, CUDNN_DATA_HALF, conv_in, dw2, dy2,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
  DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
}

// compute halo cells for input volume of dimension [N,1,W,C] with padding=(0,1) to produce output volume of dimension
// [N,1,W,C] input and grad_out2_halo tensors are all of same shape output tensor is of shape [Cin,1,3,Cout] (regular
// filter dims are [Cin,3,3,Cout]
at::Tensor bottleneck_backward_wgrad2_halo(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                           std::vector<at::Tensor> outputs, at::Tensor input,
                                           at::Tensor grad_out2_halo) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dgrad
  at::Half* dy2 = grad_out2_halo.data_ptr<at::Half>();

  // dconv2+drelu1+dscale1
  at::Half* conv_in = input.data_ptr<at::Half>();

  // wgrad
  auto wgrad2_halo = at::empty(backward_state.filterdim2hh, input.type(), output_format);
  at::Half* dw2 = wgrad2_halo.data_ptr<at::Half>();

  // printf("backward_state.outdimA1hh =
  // {%d,%d,%d,%d}\n",backward_state.outdimA1hh[0],backward_state.outdimA1hh[1],backward_state.outdimA1hh[2],backward_state.outdimA1hh[3]);
  // printf("backward_state.outdimA2hh =
  // {%d,%d,%d,%d}\n",backward_state.outdimA2hh[0],backward_state.outdimA2hh[1],backward_state.outdimA2hh[2],backward_state.outdimA2hh[3]);
  // printf("backward_state.filterdim2hh =
  // {%d,%d,%d,%d}\n",backward_state.filterdim2hh[0],backward_state.filterdim2hh[1],backward_state.filterdim2hh[2],backward_state.filterdim2hh[3]);
  // printf("backward_state.filterdimA2hh =
  // {%d,%d,%d,%d}\n",backward_state.filterdimA2hh[0],backward_state.filterdimA2hh[1],backward_state.filterdimA2hh[2],backward_state.filterdimA2hh[3]);
  // printf("backward_state.padA2 = {%d,%d}\n",backward_state.padA2[0],backward_state.padA2[1]);
  run_dconv(backward_state.outdimA1hh,  // N,C,1,W
            backward_state.padA2,       // 0, 1
            backward_state.convstrideA, backward_state.dilationA,
            backward_state.filterdimA2hh,  // Cin,Cout,1,3
            backward_state.outdimA2hh,     // N,C,1,W
            CUDNN_DATA_HALF, conv_in, dw2, dy2, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  return wgrad2_halo;
}

void bottleneck_backward_wgrad1(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                                std::vector<at::Tensor> outputs, at::Tensor grad_out1) {
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* dy1 = grad_out1.data_ptr<at::Half>();

  // dconv1+add
  // wgrad
  auto wgrad1 = outputs[1];
  at::Half* dw1 = wgrad1.data_ptr<at::Half>();
  run_dconv(backward_state.dimA, backward_state.padA, backward_state.convstride1X1, backward_state.dilationA,
            backward_state.filterdimA1, backward_state.outdimA1, CUDNN_DATA_HALF, x, dw1, dy1,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
}

void bottleneck_backward_rest(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs,
                              std::vector<at::Tensor> outputs, at::Tensor grad_out2, at::Tensor grad_out1) {
  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // dgrad
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();
  at::Half* dy1 = grad_out1.data_ptr<at::Half>();

  /*
    // backward strided conv cannot be fused
    // if stride == 1 but channel changes, we can fuse here
    if (stride_1X1 != 1){
      // dgrad
      run_dconv(outdimA1,
                padA1,
                convstride1X1,
                dilationA,
                filterdimA2,
                outdimA2,
                CUDNN_DATA_HALF,
                dy1,
                w,
                dy2,
                CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);

      // mul fused mask
      grad_out1.mul_(inputs[15]);
    }
    else {
      at::Half* relu1 = inputs[12].data_ptr<at::Half>();
      // fused dgrad
      run_dconv_drelu_dscale(outdimA1,
                             padA1,
                             convstride1X1,
                             dilationA,
                             filterdimA2,
                             outdimA2,
                             CUDNN_DATA_HALF,
                             dy1,
                             w,
                             dy2,
                             z,
                             relu1);
    }
  */
  DEBUG_MSG("[DEBUG] new dconv1 : " << grad_out1.to(at::kFloat).sum().item<float>());

  // create grads of conv4 that may exist
  auto grad_x_conv4 = at::empty_like(inputs[0]);
  at::Half* dx_conv4 = grad_x_conv4.data_ptr<at::Half>();
  at::Tensor wgrad4;

  // x used for dconv1 and dconv4 wgrad
  at::Half* x = inputs[0].data_ptr<at::Half>();

  at::Half* w = NULL;

  if (stride_1X1 != 1 || backward_state.filterdimA3[0] != backward_state.dimA[1]) {
    w = inputs[14].data_ptr<at::Half>();
    at::Half* dy_conv4 = inputs[11].data_ptr<at::Half>();
    if (requires_grad) {
      run_dconv(backward_state.dimA, backward_state.padA, backward_state.convstride1X1, backward_state.dilationA,
                backward_state.filterdimA4, backward_state.outdimA3, CUDNN_DATA_HALF, dx_conv4, w, dy_conv4,
                CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // we don't print here since we can't hook out this grad in pytorch alone to compare, due to addition with dx
      // DEBUG_MSG("[DEBUG] new dx_identity : " << grad_x_conv4.to(at::kFloat).sum().item<float>());
    }
    // wgrad
    wgrad4 = outputs[4];
    at::Half* dw4 = wgrad4.data_ptr<at::Half>();
    run_dconv(backward_state.dimA, backward_state.padA, backward_state.convstride1X1, backward_state.dilationA,
              backward_state.filterdimA4, backward_state.outdimA3, CUDNN_DATA_HALF, x, dw4, dy_conv4,
              CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
  } else {
    // if there is no downsample, dx_conv4 is fork of drelu3
    dx_conv4 = inputs[11].data_ptr<at::Half>();
  }

  // dgrad
  w = inputs[1].data_ptr<at::Half>();
  auto grad_x = outputs[0];
  at::Half* dx = grad_x.data_ptr<at::Half>();

  // backward strided conv cannot be fused
  // if stride == 1 but channel changes, we can fuse here
  if (requires_grad) {
    if (stride_1X1 != 1) {
      run_dconv(backward_state.dimA, backward_state.padA, backward_state.convstride1X1, backward_state.dilationA,
                backward_state.filterdimA1, backward_state.outdimA1, CUDNN_DATA_HALF, dx, w, dy1,
                CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // add 2 together
      grad_x.add_(grad_x_conv4);
    } else {
      run_dconv_add(backward_state.dimA, backward_state.padA, backward_state.convstride1X1, backward_state.dilationA,
                    backward_state.filterdimA1, backward_state.outdimA1, CUDNN_DATA_HALF, dx, w, dy1, dx_conv4);
    }
  }

  DEBUG_MSG("[DEBUG] new dx : " << grad_x.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad1 : " << wgrad1.to(at::kFloat).sum().item<float>());

  if (stride_1X1 != 1 || backward_state.filterdimA3[0] != backward_state.dimA[1]) {
    DEBUG_MSG("[DEBUG] new wgrad4 : " << wgrad4.to(at::kFloat).sum().item<float>());
  }
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &bottleneck_forward, "Bottleneck block forward", py::call_guard<py::gil_scoped_release>());
  m.def("backward", &bottleneck_backward, "Bottleneck block backward", py::call_guard<py::gil_scoped_release>());
  m.def("forward_init", &bottleneck_forward_init, "Bottleneck block init", py::call_guard<py::gil_scoped_release>());
  m.def("forward_out1", &bottleneck_forward_out1, "Bottleneck block forward", py::call_guard<py::gil_scoped_release>());
  m.def("forward_out2", &bottleneck_forward_out2, "Bottleneck block forward", py::call_guard<py::gil_scoped_release>());
  m.def("forward_out2_mask", &bottleneck_forward_out2_mask, "Bottleneck block forward",
        py::call_guard<py::gil_scoped_release>());
  m.def("forward_out2_halo", &bottleneck_forward_out2_halo, "Bottleneck block forward",
        py::call_guard<py::gil_scoped_release>());
  m.def("forward_out2_halo_corr", &bottleneck_forward_out2_halo_corr, "Bottleneck block forward",
        py::call_guard<py::gil_scoped_release>());
  m.def("forward_out2_pad", &bottleneck_forward_out2_pad, "Bottleneck block forward",
        py::call_guard<py::gil_scoped_release>());
  m.def("forward_rest", &bottleneck_forward_rest, "Bottleneck block forward", py::call_guard<py::gil_scoped_release>());
  m.def("backward_init", &bottleneck_backward_init, "Bottleneck block backward init",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_grad_out2", &bottleneck_backward_grad_out2, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_grad_out1", &bottleneck_backward_grad_out1, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_grad_out1_mask", &bottleneck_backward_grad_out1_mask, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_grad_out1_halo", &bottleneck_backward_grad_out1_halo, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_grad_out1_halo_corr", &bottleneck_backward_grad_out1_halo_corr, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_wgrad2_pad", &bottleneck_backward_wgrad2_pad, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_wgrad2", &bottleneck_backward_wgrad2, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_wgrad2_halo", &bottleneck_backward_wgrad2_halo, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_wgrad3", &bottleneck_backward_wgrad3, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_wgrad1", &bottleneck_backward_wgrad1, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_rest", &bottleneck_backward_rest, "Bottleneck block backward",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp
================================================
#include <ATen/ATen.h>
#include <ATen/cudnn/Handle.h>  // for getcudnnhandle
#include <cudnn_frontend.h>
#include <torch/extension.h>
#include <torch/torch.h>

#include <iostream>
#include <vector>

#ifdef DEBUG
#define DEBUG_MSG(str)             \
  do {                             \
    std::cout << str << std::endl; \
  } while (false)
#else
#define DEBUG_MSG(str) \
  do {                 \
  } while (false)
#endif

#ifdef DEBUG_CUDNN
#define DEBUG_CUDNN_MSG(buf, str) \
  do {                            \
    buf << str << std::endl;      \
  } while (false)
#else
#define DEBUG_CUDNN_MSG(buf, str) \
  do {                            \
  } while (false)
#endif

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(at::MemoryFormat::ChannelsLast), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

#define checkCudnnErr(...)                                                    \
  do {                                                                        \
    int err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
    if (err) {                                                                \
      return;                                                                 \
    }                                                                         \
  } while (0)

int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
  if (code) {
    printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
    return 1;
  }
  return 0;
}

void checkError(cudaError_t code, char const* func, const char* file, const int line, bool abort = true);
#define checkCUDAError(val)                      \
  {                                              \
    checkError((val), #val, __FILE__, __LINE__); \
  }  // in-line regular function

void checkError(cudaError_t code, char const* func, const char* file, const int line, bool abort) {
  if (code != cudaSuccess) {
    const char* errorMessage = cudaGetErrorString(code);
    fprintf(stderr, "CUDA error returned from \"%s\" at %s:%d, Error code: %d (%s)\n", func, file, line, code,
            errorMessage);
    if (abort) {
      cudaDeviceReset();
      exit(code);
    }
  }
}

void generateStrides(const int64_t* dimA, int64_t* strideA, int nbDims, cudnnTensorFormat_t filterFormat) {
  // For INT8x4 and INT8x32 we still compute standard strides here to input
  // into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
  if (filterFormat == CUDNN_TENSOR_NCHW) {
    strideA[nbDims - 1] = 1;
    for (int64_t d = nbDims - 2; d >= 0; d--) {
      strideA[d] = strideA[d + 1] * dimA[d + 1];
    }
  } else {
    // Here we assume that the format is CUDNN_TENSOR_NHWC
    strideA[1] = 1;
    strideA[nbDims - 1] = strideA[1] * dimA[1];
    for (int64_t d = nbDims - 2; d >= 2; d--) {
      strideA[d] = strideA[d + 1] * dimA[d + 1];
    }
    strideA[0] = strideA[2] * dimA[2];
  }
}

int getFwdConvDilatedFilterDim(int filterDim, int dilation) { return ((filterDim - 1) * dilation) + 1; }

int getFwdConvPaddedImageDim(int tensorDim, int pad) { return tensorDim + (2 * pad); }

int getFwdConvOutputDim(int tensorDim, int pad, int filterDim, int stride, int dilation) {
  int p = (getFwdConvPaddedImageDim(tensorDim, pad) - getFwdConvDilatedFilterDim(filterDim, dilation)) / stride + 1;
  return (p);
}

// create a cache for plan
std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache;

std::string getConvFusionString(int64_t* x_dim_padded, int64_t* padA, int64_t* convstrideA, int64_t* dilationA,
                                int64_t* w_dim_padded, cudnnDataType_t dataType, std::string fusion_string) {
  for (int i = 0; i < 4; i++) {
    fusion_string += 'X';
    fusion_string += std::to_string(x_dim_padded[i]);
  }
  for (int i = 0; i < 4; i++) {
    fusion_string += 'W';
    fusion_string += std::to_string(w_dim_padded[i]);
  }
  for (int i = 0; i < 2; i++) {
    fusion_string += 'P';
    fusion_string += std::to_string(padA[i]);
  }
  for (int i = 0; i < 2; i++) {
    fusion_string += 'S';
    fusion_string += std::to_string(convstrideA[i]);
  }
  for (int i = 0; i < 2; i++) {
    fusion_string += 'D';
    fusion_string += std::to_string(dilationA[i]);
  }
  fusion_string += 'T';
  fusion_string += std::to_string(dataType);
  return fusion_string;
}

cudnn_frontend::ExecutionPlan& getOrCreatePlan(cudnnHandle_t handle_, std::stringstream& log_buf,
                                               cudnn_frontend::OperationGraph& opGraph, std::string cache_string,
                                               bool use_heuristic = true) {
  auto it = plan_cache.find(cache_string);
  if (it != plan_cache.end()) {
    DEBUG_CUDNN_MSG(log_buf, "Found plan in cache");
    return it->second;
  } else {
    DEBUG_CUDNN_MSG(log_buf, "No plan in cache");
    if (use_heuristic) {
      // TODO: confirm which mode to use
      auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
                            .setOperationGraph(opGraph)
                            .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
                            .build();
      auto engine_config_count = heuristics.getEngineConfigCount();
      auto& engine_configs = heuristics.getEngineConfig(engine_config_count);
      for (int64_t count = 0; count < engine_config_count; count++) {
        try {
          plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder()
                                                         .setHandle(handle_)
                                                         .setEngineConfig(engine_configs[count], opGraph.getTag())
                                                         .build()));
          break;
        } catch (cudnn_frontend::cudnnException e) {
          // Throw exception if all engines failed
          if (count == (engine_config_count - 1)) {
            throw e;
          } else {
            continue;
          }
        }
      }
    } else {
      // How many engines support this operation graph ?
      auto total_engines = opGraph.getEngineCount();
      DEBUG_CUDNN_MSG(log_buf, opGraph.describe() << " has " << total_engines << " engines.");
      // We have to randomly pick one engine from [0, total_engines)
      // Selecting "0" by default
      auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build();
      DEBUG_CUDNN_MSG(log_buf, engine.describe());
      auto& knobs = engine.getSupportedKnobs();
      for (auto it = std::begin(knobs); it != std::end(knobs); ++it) {
        DEBUG_CUDNN_MSG(log_buf, it->describe());
      }
      if (knobs.begin() != knobs.end()) {
        DEBUG_CUDNN_MSG(log_buf, "Updated knob choice");
        knobs.begin()->setChoice(knobs.begin()->getMinValue() + 1);
        DEBUG_CUDNN_MSG(log_buf, knobs.begin()->describe());
      }

      // Createmplacee the requisite engine config
      auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build();
      DEBUG_CUDNN_MSG(log_buf, engine_config.describe());
      plan_cache.emplace(
          cache_string,
          std::move(cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build()));
    }

    return plan_cache.find(cache_string)->second;
  }
}

void run_conv_bias(int64_t* x_dim, int64_t* w_dim, int64_t* y_dim, int64_t* conv_pad, int64_t* convstride,
                   int64_t* dilation, cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW, at::Half* devPtrB,
                   at::Half* devPtrY) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;

  try {
    int convDim = 2;
    float alpha = 1.0f;
    float beta = 0.0f;
    int64_t b_dim[] = {1, y_dim[1], 1, 1};

    // Creates the necessary tensor descriptors
    int64_t stride[4];
    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto xTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, x_dim)
                       .setStrides(4, stride)
                       .setId('x')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, xTensor.describe());

    generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto wTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, w_dim)
                       .setStrides(4, stride)
                       .setId('w')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, wTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterConvTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('c')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterConvTensor.describe());

    generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto bTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, b_dim)
                       .setStrides(4, stride)
                       .setId('b')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterBiasTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('y')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());

    // Define the bias operation
    auto biasDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, conv_pad)
                        .setPostPadding(convDim, conv_pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(xTensor)
                       .setwDesc(wTensor)
                       .setyDesc(afterConvTensor)
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create a Bias Node.
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(conv_op.getOutputTensor())
                       .setbDesc(bTensor)
                       .setyDesc(afterBiasTensor)
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an Operation Graph. In this case it is convolution bias activation
    std::array<cudnn_frontend::Operation const*, 2> ops = {&conv_op, &bias_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(2, ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim, conv_pad, convstride, dilation, w_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrW, devPtrB, devPtrY};
    int64_t uids[] = {'x', 'w', 'b', 'y'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(4, data_ptrs)
                           .setUids(4, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_conv_bias_mask_relu(int64_t* x_dim, int64_t* w_dim, int64_t* y_dim, int64_t* conv_pad, int64_t* conv_stride,
                             int64_t* conv_dilation, cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW,
                             at::Half* devPtrB, int8_t* devPtrM, at::Half* devPtrY) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;

  try {
    int conv_dim = 2;
    float alpha = 1.0f;
    float beta = 0.0f;
    int64_t b_dim[] = {1, y_dim[1], 1, 1};

    // Creates the necessary tensor descriptors
    int64_t stride[4];
    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto xTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, x_dim)
                       .setStrides(4, stride)
                       .setId('x')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, xTensor.describe());

    generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto wTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, w_dim)
                       .setStrides(4, stride)
                       .setId('w')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, wTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto mTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, y_dim)
                       .setStrides(4, stride)
                       .setId('m')
                       .setAlignment(16)
                       .setDataType(CUDNN_DATA_INT8)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, wTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterConvTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('c')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterConvTensor.describe());

    generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto bTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, b_dim)
                       .setStrides(4, stride)
                       .setId('b')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterBiasTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('B')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterMaskTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('M')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterReLUTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('y')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterReLUTensor.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(conv_dim)
                        .setStrides(conv_dim, conv_stride)
                        .setPrePadding(conv_dim, conv_pad)
                        .setPostPadding(conv_dim, conv_pad)
                        .setDilation(conv_dim, conv_dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the bias operation
    auto biasDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // Define the mask operation
    auto maskDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();

    // Define the activation operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_FWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(xTensor)
                       .setwDesc(wTensor)
                       .setyDesc(afterConvTensor)
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create a Bias Node
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(conv_op.getOutputTensor())
                       .setbDesc(bTensor)
                       .setyDesc(afterBiasTensor)
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // create a Mask Node
    auto mask_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(bias_op.getOutputTensor())
                       .setbDesc(mTensor)
                       .setyDesc(afterMaskTensor)
                       .setpwDesc(maskDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, mask_op.describe());

    // Create an Activation Node
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(mask_op.getOutputTensor())
                      .setyDesc(afterReLUTensor)
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create an Operation Graph. In this case it is convolution bias activation
    std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &bias_op, &mask_op, &act_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(4, ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim, conv_pad, conv_stride, conv_dilation, w_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrW, devPtrB, devPtrM, devPtrY};
    int64_t uids[] = {'x', 'w', 'b', 'm', 'y'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(5, data_ptrs)
                           .setUids(5, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_conv_cscale_cbias_relu(int64_t* x_dim, int64_t* w_dim, int64_t* y_dim, int64_t* conv_pad, int64_t* conv_stride,
                                int64_t* conv_dilation, cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW,
                                at::Half* devPtrS, at::Half* devPtrB, at::Half* devPtrY) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;

  try {
    int conv_dim = 2;
    float alpha = 1.0f;
    float beta = 0.0f;
    int64_t s_dim[] = {1, y_dim[1], 1, 1};
    int64_t b_dim[] = {1, y_dim[1], 1, 1};

    // Creates the necessary tensor descriptors
    int64_t stride[4];
    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto xTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, x_dim)
                       .setStrides(4, stride)
                       .setId('x')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, xTensor.describe());

    generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto wTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, w_dim)
                       .setStrides(4, stride)
                       .setId('w')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, wTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterConvTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('c')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterConvTensor.describe());

    generateStrides(s_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto sTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, s_dim)
                       .setStrides(4, stride)
                       .setId('s')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, sTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterScaleTensor = cudnn_frontend::TensorBuilder()
                                .setDim(4, y_dim)
                                .setStrides(4, stride)
                                .setId('S')
                                .setAlignment(16)
                                .setDataType(CUDNN_DATA_FLOAT)
                                .setVirtual()
                                .build();
    DEBUG_CUDNN_MSG(log_buf, afterScaleTensor.describe());

    generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto bTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, b_dim)
                       .setStrides(4, stride)
                       .setId('b')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterBiasTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('B')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterReLUTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('y')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterReLUTensor.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(conv_dim)
                        .setStrides(conv_dim, conv_stride)
                        .setPrePadding(conv_dim, conv_pad)
                        .setPostPadding(conv_dim, conv_pad)
                        .setDilation(conv_dim, conv_dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the scale operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the bias operation
    auto biasDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // Define the activation operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_FWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(xTensor)
                       .setwDesc(wTensor)
                       .setyDesc(afterConvTensor)
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create a scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(conv_op.getOutputTensor())
                        .setbDesc(sTensor)
                        .setyDesc(afterScaleTensor)
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Bias Node.
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(scale_op.getOutputTensor())
                       .setbDesc(bTensor)
                       .setyDesc(afterBiasTensor)
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an Activation Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(bias_op.getOutputTensor())
                      .setyDesc(afterReLUTensor)
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create an Operation Graph. In this case it is convolution bias activation
    std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &scale_op, &bias_op, &act_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim, conv_pad, conv_stride, conv_dilation, w_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrW, devPtrS, devPtrB, devPtrY};
    int64_t uids[] = {'x', 'w', 's', 'b', 'y'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(5, data_ptrs)
                           .setUids(5, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_conv_bias_relu(int64_t* x_dim, int64_t* w_dim, int64_t* y_dim, int64_t* conv_pad, int64_t* conv_stride,
                        int64_t* conv_dilation, cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW,
                        at::Half* devPtrB, at::Half* devPtrY) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;

  try {
    int conv_dim = 2;
    float alpha = 1.0f;
    float beta = 0.0f;
    int64_t b_dim[] = {1, y_dim[1], 1, 1};

    // Creates the necessary tensor descriptors
    int64_t stride[4];
    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto xTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, x_dim)
                       .setStrides(4, stride)
                       .setId('x')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, xTensor.describe());

    generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto wTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, w_dim)
                       .setStrides(4, stride)
                       .setId('w')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, wTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterConvTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('c')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterConvTensor.describe());

    generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto bTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, b_dim)
                       .setStrides(4, stride)
                       .setId('b')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterBiasTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('B')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto afterReLUTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, y_dim)
                               .setStrides(4, stride)
                               .setId('y')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, afterReLUTensor.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(conv_dim)
                        .setStrides(conv_dim, conv_stride)
                        .setPrePadding(conv_dim, conv_pad)
                        .setPostPadding(conv_dim, conv_pad)
                        .setDilation(conv_dim, conv_dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the bias operation
    auto biasDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_ADD).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // Define the activation operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_FWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(xTensor)
                       .setwDesc(wTensor)
                       .setyDesc(afterConvTensor)
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create a Bias Node.
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(conv_op.getOutputTensor())
                       .setbDesc(bTensor)
                       .setyDesc(afterBiasTensor)
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an Activation Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(bias_op.getOutputTensor())
                      .setyDesc(afterReLUTensor)
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create an Operation Graph. In this case it is convolution bias activation
    std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &bias_op, &act_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(3, ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim, conv_pad, conv_stride, conv_dilation, w_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrW, devPtrB, devPtrY};
    int64_t uids[] = {'x', 'w', 'b', 'y'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(4, data_ptrs)
                           .setUids(4, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_drelu_dscale(int64_t* dy_dim, cudnnDataType_t dataType, at::Half* devPtrDY, at::Half* devPtrR,
                      at::Half* devPtrS, at::Half* devPtrDX) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;

  try {
    int convDim = 2;
    float alpha = 1.0f;
    float beta = 0.0f;
    int64_t s_dim[] = {1, dy_dim[1], 1, 1};

    // Creates the necessary tensor descriptors
    int64_t stride[4];
    generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto dyTensor = cudnn_frontend::TensorBuilder()
                        .setDim(4, dy_dim)
                        .setStrides(4, stride)
                        .setId('y')
                        .setAlignment(16)
                        .setDataType(dataType)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, dyTensor.describe());

    generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto rTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, dy_dim)
                       .setStrides(4, stride)
                       .setId('r')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, rTensor.describe());

    generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto inActGradTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, dy_dim)
                               .setStrides(4, stride)
                               .setId('R')
                               .setAlignment(16)
                               .setDataType(CUDNN_DATA_FLOAT)
                               .setVirtual()
                               .build();
    DEBUG_CUDNN_MSG(log_buf, inActGradTensor.describe());

    generateStrides(s_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto scaleTensor = cudnn_frontend::TensorBuilder()
                           .setDim(4, s_dim)
                           .setStrides(4, stride)
                           .setId('s')
                           .setAlignment(16)
                           .setDataType(dataType)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, scaleTensor.describe());

    generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto dxTensor = cudnn_frontend::TensorBuilder()
                        .setDim(4, dy_dim)
                        .setStrides(4, stride)
                        .setId('x')
                        .setAlignment(16)
                        .setDataType(dataType)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, dxTensor.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the bias backward operation
    auto scaleDesc =
        cudnn_frontend::PointWiseDescBuilder().setMode(CUDNN_POINTWISE_MUL).setMathPrecision(CUDNN_DATA_FLOAT).build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Create an relu backward Node
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(dyTensor)
                      .setxDesc(rTensor)
                      .setdxDesc(inActGradTensor)
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create bias node
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(inActGradTensor)
                        .setbDesc(scaleTensor)
                        .setyDesc(dxTensor)
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create an Operation Graph. In this case it is bias only
    std::array<cudnn_frontend::Operation const*, 2> ops = {&act_op, &scale_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    // creating unique dummy values
    int64_t pad_dummy[] = {40, 40};
    int64_t stride_dummy[] = {40, 40};
    int64_t dilation_dummy[] = {40, 40};
    auto cache_string =
        getConvFusionString(dy_dim, pad_dummy, stride_dummy, dilation_dummy, s_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrDY, devPtrR, devPtrS, devPtrDX};
    int64_t uids[] = {'y', 'r', 's', 'x'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(4, data_ptrs)
                           .setUids(4, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_drelu_dbias(int64_t* dy_dim, cudnnDataType_t dataType, at::Half* devPtrDY, at::Half* devPtrR,
                     at::Half* devPtrDR, float* devPtrDB) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;

  try {
    int convDim = 2;
    float alpha = 1.0f;
    float beta = 0.0f;
    int64_t b_dim[] = {1, dy_dim[1], 1, 1};

    // Creates the necessary tensor descriptors
    int64_t stride[4];
    generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto dyTensor = cudnn_frontend::TensorBuilder()
                        .setDim(4, dy_dim)
                        .setStrides(4, stride)
                        .setId('x')
                        .setAlignment(16)
                        .setDataType(dataType)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, dyTensor.describe());

    generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto rTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, dy_dim)
                       .setStrides(4, stride)
                       .setId('r')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, rTensor.describe());

    generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto inActGradTensor = cudnn_frontend::TensorBuilder()
                               .setDim(4, dy_dim)
                               .setStrides(4, stride)
                               .setId('R')
                               .setAlignment(16)
                               .setDataType(dataType)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, inActGradTensor.describe());

    generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto biasGradTensor = cudnn_frontend::TensorBuilder()
                              .setDim(4, b_dim)
                              .setStrides(4, stride)
                              .setId('y')
                              .setAlignment(16)
                              .setDataType(CUDNN_DATA_FLOAT)
                              .build();
    DEBUG_CUDNN_MSG(log_buf, biasGradTensor.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the bias backward operation
    auto biasDesc = cudnn_frontend::ReductionDescBuilder()
                        .setMathPrecision(CUDNN_DATA_FLOAT)
                        .setReductionOp(CUDNN_REDUCE_TENSOR_ADD)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // Create an relu backward Node
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(dyTensor)
                      .setxDesc(rTensor)
                      .setdxDesc(inActGradTensor)
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create bias node
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR)
                       .setxDesc(inActGradTensor)
                       .setyDesc(biasGradTensor)
                       .setreductionDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an Operation Graph. In this case it is bias only
    std::array<cudnn_frontend::Operation const*, 2> ops = {&act_op, &bias_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    // creating unique dummy values
    int64_t pad_dummy[] = {20, 20};
    int64_t stride_dummy[] = {20, 20};
    int64_t dilation_dummy[] = {20, 20};
    auto cache_string =
        getConvFusionString(dy_dim, pad_dummy, stride_dummy, dilation_dummy, b_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrDY, devPtrR, devPtrDR, devPtrDB};
    int64_t uids[] = {'x', 'r', 'R', 'y'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(4, data_ptrs)
                           .setUids(4, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_dconv_drelu_dbias(int64_t* x_dim, int64_t* w_dim, int64_t* y_dim, int64_t* pad, int64_t* convstride,
                           int64_t* dilation, cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW,
                           at::Half* devPtrR, at::Half* devPtrRg, float* devPtrY) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;
    float alpha = 1.0f;
    float beta = 0.0f;
    int64_t b_dim[] = {1, x_dim[1], 1, 1};

    int64_t stride[4];
    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto outConvGradTensor = cudnn_frontend::TensorBuilder()
                                 .setDim(4, y_dim)
                                 .setStrides(4, stride)
                                 .setId('x')
                                 .setAlignment(16)
                                 .setDataType(dataType)
                                 .build();
    DEBUG_CUDNN_MSG(log_buf, outConvGradTensor.describe());

    generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto wTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, w_dim)
                       .setStrides(4, stride)
                       .setId('w')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, wTensor.describe());

    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto inConvGradTensor = cudnn_frontend::TensorBuilder()
                                .setDim(4, x_dim)
                                .setStrides(4, stride)
                                .setId('A')
                                .setAlignment(16)
                                .setDataType(CUDNN_DATA_FLOAT)
                                .setVirtual()
                                .build();
    DEBUG_CUDNN_MSG(log_buf, inConvGradTensor.describe());

    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto rTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, x_dim)
                       .setStrides(4, stride)
                       .setId('r')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, rTensor.describe());

    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto inReLUGradTensor = cudnn_frontend::TensorBuilder()
                                .setDim(4, x_dim)
                                .setStrides(4, stride)
                                .setId('R')
                                .setAlignment(16)
                                .setDataType(dataType)
                                .build();
    DEBUG_CUDNN_MSG(log_buf, inReLUGradTensor.describe());

    generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto inBiasGradTensor = cudnn_frontend::TensorBuilder()
                                .setDim(4, b_dim)
                                .setStrides(4, stride)
                                .setId('y')
                                .setAlignment(16)
                                .setDataType(CUDNN_DATA_FLOAT)
                                .build();
    DEBUG_CUDNN_MSG(log_buf, inBiasGradTensor.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the bias backward operation
    auto biasDesc = cudnn_frontend::ReductionDescBuilder()
                        .setMathPrecision(CUDNN_DATA_FLOAT)
                        .setReductionOp(CUDNN_REDUCE_TENSOR_ADD)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdyDesc(outConvGradTensor)
                       .setwDesc(wTensor)
                       .setdxDesc(inConvGradTensor)
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create an relu backward Node
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(inConvGradTensor)
                      .setxDesc(rTensor)
                      .setdxDesc(inReLUGradTensor)
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create bias node
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR)
                       .setxDesc(inReLUGradTensor)
                       .setyDesc(inBiasGradTensor)
                       .setreductionDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an Operation Graph. In this case it is bias only
    std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &act_op, &bias_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim, pad, convstride, dilation, w_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrW, devPtrR, devPtrRg, devPtrY};
    int64_t uids[] = {'x', 'w', 'r', 'R', 'y'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(5, data_ptrs)
                           .setUids(5, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_dconv(int64_t* x_dim, int64_t* w_dim, int64_t* y_dim, int64_t* conv_pad, int64_t* conv_stride,
               int64_t* conv_dilation, cudnnDataType_t dataType, at::Half* devPtrX, at::Half* devPtrW,
               at::Half* devPtrY, cudnnBackendDescriptorType_t mode) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;

  try {
    int conv_dim = 2;
    float alpha = 1.0f;
    float beta = 0.0f;

    // Define the convolution problem
    int64_t stride[4];
    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto xTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, x_dim)
                       .setStrides(4, stride)
                       .setId('x')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, xTensor.describe());

    generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto wTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, w_dim)
                       .setStrides(4, stride)
                       .setId('w')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, wTensor.describe());

    generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto yTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, y_dim)
                       .setStrides(4, stride)
                       .setId('y')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, yTensor.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(conv_dim)
                        .setStrides(conv_dim, conv_stride)
                        .setPrePadding(conv_dim, conv_pad)
                        .setPostPadding(conv_dim, conv_pad)
                        .setDilation(conv_dim, conv_dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Create a convolution node
    // mode should be one of following
    // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
    // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
    auto conv_op_builder = cudnn_frontend::OperationBuilder(mode);
    if (mode == CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
      conv_op_builder.setdxDesc(xTensor).setwDesc(wTensor).setdyDesc(yTensor).setcDesc(convDesc);
    } else {
      conv_op_builder.setxDesc(xTensor).setdwDesc(wTensor).setdyDesc(yTensor).setcDesc(convDesc);
    }
    auto conv_op = conv_op_builder.setAlpha(alpha).setBeta(beta).build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 1> ops = {&conv_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    auto cache_string =
        getConvFusionString(x_dim, conv_pad, conv_stride, conv_dilation, w_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrW, devPtrY};
    int64_t uids[] = {'x', 'w', 'y'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(3, data_ptrs)
                           .setUids(3, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

void run_dbias(int64_t* x_dim, cudnnDataType_t dataType, at::Half* devPtrX, float* devPtrY) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;
    int64_t b_dim[] = {1, x_dim[1], 1, 1};

    int64_t stride[4];
    generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto xTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, x_dim)
                       .setStrides(4, stride)
                       .setId('x')
                       .setAlignment(16)
                       .setDataType(dataType)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, xTensor.describe());

    generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
    auto yTensor = cudnn_frontend::TensorBuilder()
                       .setDim(4, b_dim)
                       .setStrides(4, stride)
                       .setId('y')
                       .setAlignment(16)
                       .setDataType(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, yTensor.describe());

    // Define the bias backward operation
    auto biasDesc = cudnn_frontend::ReductionDescBuilder()
                        .setMathPrecision(CUDNN_DATA_FLOAT)
                        .setReductionOp(CUDNN_REDUCE_TENSOR_ADD)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // Create bias node
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR)
                       .setxDesc(xTensor)
                       .setyDesc(yTensor)
                       .setreductionDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an Operation Graph. In this case it is bias only
    std::array<cudnn_frontend::Operation const*, 1> ops = {&bias_op};

    auto opGraph =
        cudnn_frontend::OperationGraphBuilder().setHandle(handle_).setOperationGraph(ops.size(), ops.data()).build();

    // Create string encoding for plan caching
    int64_t pad_dummy[] = {10, 10};
    int64_t stride_dummy[] = {10, 10};
    int64_t dilation_dummy[] = {10, 10};
    auto cache_string =
        getConvFusionString(x_dim, pad_dummy, stride_dummy, dilation_dummy, b_dim, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY};
    int64_t uids[] = {'x', 'y'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(2, data_ptrs)
                           .setUids(2, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

std::vector<at::Tensor> conv_bias_mask_relu_forward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t x_dim[] = {0, 0, 0, 0};
  int64_t w_dim[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] = {0, 1, 2, 3};
  for (int dim = 0; dim < 4; dim++) {
    x_dim[dim] = inputs[0].size(axis[dim]);
    w_dim[dim] = inputs[1].size(axis[dim]);
  }

  // output dim in n,c,h,w used by backend
  int64_t y_dim[] = {0, 0, 0, 0};

  // use these fixed values
  int64_t conv_pad[] = {padding, padding};
  int64_t conv_stride[] = {stride, stride};
  int64_t conv_dilation[] = {1, 1};

  // compute output from pad/stride/dilation
  y_dim[0] = x_dim[0];
  y_dim[1] = w_dim[0];
  for (int dim = 0; dim < 2; dim++) {
    y_dim[dim + 2] =
        getFwdConvOutputDim(x_dim[dim + 2], conv_pad[dim], w_dim[dim + 2], conv_stride[dim], conv_dilation[dim]);
  }

  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* b = inputs[2].data_ptr<at::Half>();
  int8_t* m = inputs[3].data_ptr<int8_t>();
  auto out = at::empty(y_dim, inputs[0].type(), output_format);
  at::Half* y = out.data_ptr<at::Half>();

  run_conv_bias_mask_relu(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, x, w, b, m, y);

  DEBUG_MSG("[DEBUG] conv-bias-mask-relu : " << y.to(at::kFloat).sum().item<float>());

  outputs.push_back(out);

  return outputs;
}

at::Tensor conv_cscale_cbias_relu_forward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
  std::cout << std::fixed;

  // setup dimensions
  int64_t x_dim[] = {0, 0, 0, 0};
  int64_t w_dim[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] = {0, 1, 2, 3};
  for (int dim = 0; dim < 4; dim++) {
    x_dim[dim] = inputs[0].size(axis[dim]);
    w_dim[dim] = inputs[1].size(axis[dim]);
  }

  // output dim in n,c,h,w used by backend
  int64_t y_dim[] = {0, 0, 0, 0};

  // use these fixed values
  int64_t conv_pad[] = {padding, padding};
  int64_t conv_stride[] = {stride, stride};
  int64_t conv_dilation[] = {1, 1};

  // compute output from pad/stride/dilation
  y_dim[0] = x_dim[0];
  y_dim[1] = w_dim[0];
  for (int dim = 0; dim < 2; dim++) {
    y_dim[dim + 2] =
        getFwdConvOutputDim(x_dim[dim + 2], conv_pad[dim], w_dim[dim + 2], conv_stride[dim], conv_dilation[dim]);
  }

  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* s = inputs[2].data_ptr<at::Half>();
  at::Half* b = inputs[3].data_ptr<at::Half>();
  auto out = at::empty(y_dim, inputs[0].type(), at::MemoryFormat::ChannelsLast);
  at::Half* y = out.data_ptr<at::Half>();

  run_conv_cscale_cbias_relu(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, x, w, s, b, y);

  DEBUG_MSG("[DEBUG] conv-cscale-cbias-relu : " << y.to(at::kFloat).sum().item<float>());

  return out;
}

std::vector<at::Tensor> conv_cscale_cbias_relu_backward(std::vector<at::Tensor> inputs, int64_t padding,
                                                        int64_t stride) {
  bool requires_grad = inputs[0].requires_grad();

  for (int i = 0; i <= 4; i++) {
    CHECK_INPUT(inputs[i]);
  }

  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t x_dim[] = {0, 0, 0, 0};
  int64_t w_dim[] = {0, 0, 0, 0};
  int64_t y_dim[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] = {0, 1, 2, 3};
  for (int dim = 0; dim < 4; dim++) {
    x_dim[dim] = inputs[0].size(axis[dim]);
    w_dim[dim] = inputs[1].size(axis[dim]);
    y_dim[dim] = inputs[3].size(axis[dim]);
  }

  int64_t b_dim[] = {1, y_dim[1], 1, 1};

  int64_t conv_pad[] = {padding, padding};
  int64_t conv_stride[] = {stride, stride};
  int64_t conv_dilation[] = {1, 1};

  // run
  // drelu-dbias
  at::Half* dy = inputs[4].data_ptr<at::Half>();
  at::Half* r = inputs[3].data_ptr<at::Half>();
  auto s = inputs[2].data_ptr<at::Half>();
  auto dscale = at::empty_like(inputs[4]);
  at::Half* ds = dscale.data_ptr<at::Half>();

  auto options =
      at::TensorOptions().dtype(at::kFloat).layout(inputs[0].layout()).device(inputs[0].device()).requires_grad(false);
  run_drelu_dscale(y_dim, CUDNN_DATA_HALF, dy, r, s, ds);

  // conv wgrad
  at::Half* x = inputs[0].data_ptr<at::Half>();
  auto wgrad = at::empty_like(inputs[1]);
  at::Half* dw = wgrad.data_ptr<at::Half>();
  run_dconv(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, x, dw, ds,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // conv dgrad
  at::Half* w = inputs[1].data_ptr<at::Half>();
  auto dgrad = at::empty_like(inputs[0]);
  at::Half* dx = dgrad.data_ptr<at::Half>();
  run_dconv(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, dx, w, ds,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);

  outputs.push_back(dgrad);
  outputs.push_back(wgrad);

  return outputs;
}

std::vector<at::Tensor> conv_bias_relu_forward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t x_dim[] = {0, 0, 0, 0};
  int64_t w_dim[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] = {0, 1, 2, 3};
  for (int dim = 0; dim < 4; dim++) {
    x_dim[dim] = inputs[0].size(axis[dim]);
    w_dim[dim] = inputs[1].size(axis[dim]);
  }

  // output dim in n,c,h,w used by backend
  int64_t y_dim[] = {0, 0, 0, 0};

  // use these fixed values
  int64_t conv_pad[] = {padding, padding};
  int64_t conv_stride[] = {stride, stride};
  int64_t conv_dilation[] = {1, 1};

  // compute output from pad/stride/dilation
  y_dim[0] = x_dim[0];
  y_dim[1] = w_dim[0];
  for (int dim = 0; dim < 2; dim++) {
    y_dim[dim + 2] =
        getFwdConvOutputDim(x_dim[dim + 2], conv_pad[dim], w_dim[dim + 2], conv_stride[dim], conv_dilation[dim]);
  }

  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* b = inputs[2].data_ptr<at::Half>();
  auto out = at::empty(y_dim, inputs[0].type(), output_format);
  at::Half* y = out.data_ptr<at::Half>();

  run_conv_bias_relu(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, x, w, b, y);

  DEBUG_MSG("[DEBUG] conv-bias-relu : " << y.to(at::kFloat).sum().item<float>());

  outputs.push_back(out);

  return outputs;
}

std::vector<at::Tensor> conv_bias_relu_backward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
  bool requires_grad = inputs[0].requires_grad();

  for (int i = 0; i <= 3; i++) {
    CHECK_INPUT(inputs[i]);
  }

  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t x_dim[] = {0, 0, 0, 0};
  int64_t w_dim[] = {0, 0, 0, 0};
  int64_t y_dim[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] = {0, 1, 2, 3};
  for (int dim = 0; dim < 4; dim++) {
    x_dim[dim] = inputs[0].size(axis[dim]);
    w_dim[dim] = inputs[1].size(axis[dim]);
    y_dim[dim] = inputs[3].size(axis[dim]);
  }

  int64_t b_dim[] = {1, y_dim[1], 1, 1};

  int64_t conv_pad[] = {padding, padding};
  int64_t conv_stride[] = {stride, stride};
  int64_t conv_dilation[] = {1, 1};

  // run
  // drelu-dbias
  at::Half* dy = inputs[3].data_ptr<at::Half>();
  at::Half* r = inputs[2].data_ptr<at::Half>();
  auto drelu = at::empty_like(inputs[2]);
  at::Half* dr = drelu.data_ptr<at::Half>();
  auto options =
      at::TensorOptions().dtype(at::kFloat).layout(inputs[0].layout()).device(inputs[0].device()).requires_grad(false);
  auto bgrad = at::empty(b_dim, options, output_format);
  float* db = bgrad.data_ptr<float>();
  run_drelu_dbias(y_dim, CUDNN_DATA_HALF, dy, r, dr, db);

  // conv wgrad
  at::Half* x = inputs[0].data_ptr<at::Half>();
  auto wgrad = at::empty_like(inputs[1]);
  at::Half* dw = wgrad.data_ptr<at::Half>();
  run_dconv(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, x, dw, dr,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // conv dgrad
  at::Half* w = inputs[1].data_ptr<at::Half>();
  auto dgrad = at::empty_like(inputs[0]);
  at::Half* dx = dgrad.data_ptr<at::Half>();
  run_dconv(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, dx, w, dr,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);

  outputs.push_back(dgrad);
  outputs.push_back(wgrad);
  outputs.push_back(bgrad);

  return outputs;
}

std::vector<at::Tensor> conv_bias_forward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t x_dim[] = {0, 0, 0, 0};
  int64_t w_dim[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] = {0, 1, 2, 3};
  for (int dim = 0; dim < 4; dim++) {
    x_dim[dim] = inputs[0].size(axis[dim]);
    w_dim[dim] = inputs[1].size(axis[dim]);
  }

  // output dim in n,c,h,w used by backend
  int64_t y_dim[] = {0, 0, 0, 0};

  // use these fixed values
  int64_t conv_pad[] = {padding, padding};
  int64_t conv_stride[] = {stride, stride};
  int64_t conv_dilation[] = {1, 1};

  // compute output from pad/stride/dilation
  y_dim[0] = x_dim[0];
  y_dim[1] = w_dim[0];
  for (int dim = 0; dim < 2; dim++) {
    y_dim[dim + 2] =
        getFwdConvOutputDim(x_dim[dim + 2], conv_pad[dim], w_dim[dim + 2], conv_stride[dim], conv_dilation[dim]);
  }

  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* b = inputs[2].data_ptr<at::Half>();
  auto out = at::empty(y_dim, inputs[0].type(), output_format);
  at::Half* y = out.data_ptr<at::Half>();

  run_conv_bias(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, x, w, b, y);

  DEBUG_MSG("[DEBUG] conv-bias : " << y.to(at::kFloat).sum().item<float>());

  outputs.push_back(out);

  return outputs;
}

std::vector<at::Tensor> conv_bias_backward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
  bool requires_grad = inputs[0].requires_grad();

  for (int i = 0; i <= 2; i++) {
    CHECK_INPUT(inputs[i]);
  }

  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t x_dim[] = {0, 0, 0, 0};
  int64_t w_dim[] = {0, 0, 0, 0};
  int64_t y_dim[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] = {0, 1, 2, 3};
  for (int dim = 0; dim < 4; dim++) {
    x_dim[dim] = inputs[0].size(axis[dim]);
    w_dim[dim] = inputs[1].size(axis[dim]);
    y_dim[dim] = inputs[2].size(axis[dim]);
  }

  int64_t b_dim[] = {1, y_dim[1], 1, 1};

  int64_t conv_pad[] = {padding, padding};
  int64_t conv_stride[] = {stride, stride};
  int64_t conv_dilation[] = {1, 1};

  // run
  // dbias
  at::Half* dy = inputs[2].data_ptr<at::Half>();
  auto options =
      at::TensorOptions().dtype(at::kFloat).layout(inputs[0].layout()).device(inputs[0].device()).requires_grad(false);
  auto bgrad = at::empty(b_dim, options, output_format);
  float* db = bgrad.data_ptr<float>();
  run_dbias(y_dim, CUDNN_DATA_HALF, dy, db);

  // conv wgrad
  at::Half* x = inputs[0].data_ptr<at::Half>();
  auto wgrad = at::empty_like(inputs[1]);
  at::Half* dw = wgrad.data_ptr<at::Half>();
  run_dconv(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, x, dw, dy,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // conv dgrad
  at::Half* w = inputs[1].data_ptr<at::Half>();
  auto dgrad = at::empty_like(inputs[0]);
  at::Half* dx = dgrad.data_ptr<at::Half>();
  run_dconv(x_dim, w_dim, y_dim, conv_pad, conv_stride, conv_dilation, CUDNN_DATA_HALF, dx, w, dy,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);

  outputs.push_back(dgrad);
  outputs.push_back(wgrad);
  outputs.push_back(bgrad);

  return outputs;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &conv_bias_relu_forward, "Fused Conv-Bias-ReLU forward", py::call_guard<py::gil_scoped_release>());
  m.def("backward", &conv_bias_relu_backward, "Fused Conv-Bias-ReLU backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("forward_no_relu", &conv_bias_forward, "Fused Conv-Bias forward", py::call_guard<py::gil_scoped_release>());
  m.def("backward_no_relu", &conv_bias_backward, "Fused Conv-Bias backward", py::call_guard<py::gil_scoped_release>());
  m.def("forward_mask", &conv_bias_mask_relu_forward, "Fused Conv-Bias-Mask-ReLU forward",
        py::call_guard<py::gil_scoped_release>());
  m.def("forward_cscale_cbias_relu", &conv_cscale_cbias_relu_forward, "Fused Conv-(const)Scale-(const)Bias-ReLU",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_cscale_cbias_relu", &conv_cscale_cbias_relu_backward,
        "Fused Conv-(const)Scale-(const)Bias-ReLU backward", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/cudnn_gbn/cudnn_gbn.cpp
================================================
#include <ATen/ATen.h>
#include <torch/extension.h>
#include <torch/torch.h>

#include <iostream>
#include <vector>

#include "norm_sample.h"

// define this enum:
enum bn_type { BN_FWD, BN_BWD };

// this is a global variable
static std::map<std::vector<int64_t>, cudnn_frontend::ExecutionPlan> gbn_plan_cache;

at::Tensor gbn_forward(const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_var, const at::Tensor& minibatch_mean,
                       const at::Tensor& minibatch_inv_var, const float momentum, const float epsilon,
                       const int64_t bn_group, const int rank_id, const std::vector<int64_t>& peer_buffers) {
  int64_t N = x.size(0);
  int64_t C = x.size(1);
  int64_t H = x.size(2);
  int64_t W = x.size(3);

  int64_t tensorDims[] = {N, C, H, W};
  int64_t peerDims[] = {bn_group, 4 * C, 1, 1};
  int64_t perChannelDims[] = {1, C, 1, 1};
  int64_t epsilonDims[] = {1, 1, 1, 1};

  // Allocate output tensor
  at::Tensor y = at::empty_like(x);

  std::vector<void*> void_peer_buffers;
  for (int64_t addr : peer_buffers) {
    void_peer_buffers.push_back((void*)addr);
  }

  // we need the peer size for the buffer reset
  size_t peer_size = 1;
  for (size_t i = 0; i < 4; ++i) {
    peer_size *= peerDims[i];
  }

  // sanity check
  assert(bn_group == void_peer_buffers.size());

  // check if plan already exists
  std::vector<int64_t> fv = {(int64_t)BN_FWD, N, C, H, W, bn_group, (int64_t)CUDNN_DATA_HALF};
  if (gbn_plan_cache.find(fv) == gbn_plan_cache.end()) {
    auto plan = run_batch_norm_forward(tensorDims, perChannelDims, epsilonDims, peerDims, CUDNN_DATA_HALF);
    gbn_plan_cache.emplace(fv, std::move(plan));
  }

  // get plan and handle
  auto plan = gbn_plan_cache.find(fv)->second;

  // execute
  execute_batch_norm_forward(plan, x.data_ptr(), y.data_ptr(), scale.data_ptr(), bias.data_ptr(),
                             running_mean.data_ptr(), running_var.data_ptr(), running_mean.data_ptr(),
                             running_var.data_ptr(), minibatch_mean.data_ptr(), minibatch_inv_var.data_ptr(),
                             void_peer_buffers, static_cast<double>(epsilon), static_cast<double>(momentum), peer_size,
                             rank_id);

  return y;
}

std::vector<at::Tensor> gbn_backward(const at::Tensor& x, const at::Tensor& dy, const at::Tensor& scale,
                                     const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                                     const float epsilon, const int64_t bn_group, const int rank_id,
                                     const std::vector<int64_t>& peer_buffers) {
  int64_t N = x.size(0);
  int64_t C = x.size(1);
  int64_t H = x.size(2);
  int64_t W = x.size(3);

  int64_t tensorDims[] = {N, C, H, W};
  int64_t peerDims[] = {bn_group, 4 * C, 1, 1};
  int64_t perChannelDims[] = {1, C, 1, 1};
  int64_t epsilonDims[] = {1, 1, 1, 1};

  // Allocate output tensor
  // outputs
  at::Tensor x_grad, scale_grad, bias_grad;

  // Allocate outputs
  x_grad = at::empty_like(x);
  scale_grad = at::empty_like(scale);
  bias_grad = at::empty_like(scale);

  std::vector<void*> void_peer_buffers;
  for (int64_t addr : peer_buffers) {
    void_peer_buffers.push_back((void*)addr);
  }

  // we need the peer size for the buffer reset
  size_t peer_size = 1;
  for (size_t i = 0; i < 4; ++i) {
    peer_size *= peerDims[i];
  }

  assert(bn_group == void_peer_buffers.size());

  std::vector<int64_t> fv = {(int64_t)BN_BWD, N, C, H, W, bn_group, (int64_t)CUDNN_DATA_HALF};
  if (gbn_plan_cache.find(fv) == gbn_plan_cache.end()) {
    auto plan = run_batch_norm_backward(tensorDims, perChannelDims, epsilonDims, peerDims, CUDNN_DATA_HALF);
    gbn_plan_cache.emplace(fv, std::move(plan));
  }

  // get plan and handle
  auto plan = gbn_plan_cache.find(fv)->second;

  // execute
  execute_batch_norm_backward(plan, x.data_ptr(), dy.data_ptr(), scale.data_ptr(), minibatch_mean.data_ptr(),
                              minibatch_inv_var.data_ptr(), void_peer_buffers, x_grad.data_ptr(), scale_grad.data_ptr(),
                              bias_grad.data_ptr(), static_cast<double>(epsilon), peer_size, rank_id);

  return std::vector<at::Tensor>{x_grad, scale_grad, bias_grad};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &gbn_forward, "Group batch norm forward", py::call_guard<py::gil_scoped_release>());
  m.def("backward", &gbn_backward, "Group batch backward", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/cudnn_gbn/norm_sample.cpp
================================================
/*
 * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "norm_sample.h"

#include <ATen/cudnn/Handle.h>  // for getcudnnhandle
#include <cudnn_frontend.h>
#include <torch/extension.h>
#include <torch/torch.h>

#include "cudnn_backend.h"

// some helpers
int64_t checkCudaError(cudaError_t code, const char* expr, const char* file, int line) {
  if (code) {
    printf("CUDA error at %s:%d, code=%d (%s) in '%s'", file, line, (int)code, cudaGetErrorString(code), expr);
    return 1;
  }
  return 0;
}

int64_t checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
  if (code) {
    printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
    return 1;
  }
  return 0;
}

bool AllowAll(cudnnBackendDescriptor_t engine_config) {
  (void)engine_config;
  return false;
}

void generateStrides(const int64_t* dimA, int64_t* strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat) {
  // For INT8x4 and INT8x32 we still compute standard strides here to input
  // into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
  if (filterFormat == CUDNN_TENSOR_NCHW) {
    strideA[nbDims - 1] = 1;
    for (int64_t d = nbDims - 2; d >= 0; d--) {
      strideA[d] = strideA[d + 1] * dimA[d + 1];
    }
  } else {
    // Here we assume that the format is CUDNN_TENSOR_NHWC
    strideA[1] = 1;
    strideA[nbDims - 1] = strideA[1] * dimA[1];
    for (int64_t d = nbDims - 2; d >= 2; d--) {
      strideA[d] = strideA[d + 1] * dimA[d + 1];
    }
    strideA[0] = strideA[2] * dimA[2];
  }
}

// runtime
cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t* tensorDims, int64_t* perChannelSum, int64_t* epsilon,
                                                     int64_t* peerDims, cudnnDataType_t data_type) {
  // get the cudnn handle
  cudnnHandle_t handle = torch::native::getCudnnHandle();

  // Creates the necessary tensor descriptors
  int64_t tensor_stride[4];
  int64_t stride[4];
  int64_t peer_stride[4];

  // NHWC format. GenerateStrides() takes care of this. Howeever, tensor dims should still be NCHW
  generateStrides(tensorDims, tensor_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
  generateStrides(peerDims, peer_stride, (int64_t)4, CUDNN_TENSOR_NHWC);

  auto tensor_create = [&tensor_stride, &tensorDims](cudnnDataType_t type, int64_t id) {
    return cudnn_frontend::TensorBuilder()
        .setDim(4, tensorDims)
        .setStrides(4, tensor_stride)
        .setId(id)
        .setAlignment(16)
        .setDataType(type)
        .build();
  };

  auto peer_tensor_create = [&peer_stride, &tensorDims](cudnnDataType_t type, int64_t id) {
    return cudnn_frontend::TensorBuilder()
        .setDim(4, tensorDims)
        .setStrides(4, peer_stride)
        .setId(id)
        .setAlignment(16)
        .setDataType(type)
        .build();
  };

  generateStrides(perChannelSum, stride, (int64_t)4, CUDNN_TENSOR_NHWC);

  auto per_channel_tensor_create = [&stride, &perChannelSum](cudnnDataType_t type, int64_t id) {
    return cudnn_frontend::TensorBuilder()
        .setDim(4, perChannelSum)
        .setStrides(4, stride)
        .setId(id)
        .setAlignment(16)
        .setDataType(type)
        .build();
  };

  auto xTensor = tensor_create(data_type, 100);
  auto yTensor = tensor_create(data_type, 101);
  auto scaleTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 102);
  auto biasTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 103);
  auto inMeanTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 104);
  auto inVarTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 105);
  auto outMeanTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 106);
  auto outVarTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 107);
  auto savedMeanTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 108);
  auto savedInvVarTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 109);

  int64_t epsilon_stride[4];
  generateStrides(epsilon, epsilon_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
  auto scalar_tensor_create = [&epsilon_stride, &epsilon](cudnnDataType_t type, int64_t id) {
    return cudnn_frontend::TensorBuilder()
        .setDim(4, epsilon)
        .setStrides(4, epsilon_stride)
        .setId(id)
        .setAlignment(16)
        .setDataType(type)
        .setByValue(true)
        .build();
  };

  auto epsilonTensor = scalar_tensor_create(CUDNN_DATA_DOUBLE, 110);
  auto expDecayTensor = scalar_tensor_create(CUDNN_DATA_DOUBLE, 111);

  // Create the two peer stat tensors. Jump IDs in case we need to add more tensors with UIDs
  std::vector<cudnn_frontend::Tensor_v8> peerStatTensors;
  for (size_t i = 112; i < 112 + peerDims[0]; ++i) {
    peerStatTensors.push_back(peer_tensor_create(CUDNN_DATA_FLOAT, i));
  }

#if (CUDNN_VERSION >= 8500)
  // Batch normalization
  cudnnBackendNormMode_t normalizationMode = CUDNN_BATCH_NORM;

  // Forward training
  cudnnBackendNormFwdPhase_t phase = CUDNN_NORM_FWD_TRAINING;

  // Create a Finalize node
  auto batch_norm_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR)
                           .setNormalizationMode(normalizationMode)
                           .setNormFwdPhase(phase)
                           .setxDesc(xTensor)
                           .setScaleAndBias(scaleTensor, biasTensor)
                           .setPrevRunningMeanAndVar(inMeanTensor, inVarTensor)
                           .setNextRunningMeanAndVar(outMeanTensor, outVarTensor)
                           .setSavedMeanAndInvVar(savedMeanTensor, savedInvVarTensor)
                           .setEpsilonTensor(epsilonTensor)
                           .setExpDecayFactorTensor(expDecayTensor)
                           .setPeerStatTensor(peerStatTensors)
                           .setyDesc(yTensor)
                           .build();

  std::array<cudnn_frontend::Operation const*, 1> ops = {&batch_norm_op};
#else
  std::array<cudnn_frontend::Operation const*, 0> ops = {};
#endif
  auto opGraph =
      cudnn_frontend::OperationGraphBuilder().setHandle(handle).setOperationGraph(ops.size(), ops.data()).build();
  // std::cout << opGraph.describe() << std::endl;

  cudnn_frontend::EngineConfigList filtered_configs;
  auto statuses = cudnn_frontend::get_heuristics_list<2>({"heuristics_instant", "heuristics_fallback"}, opGraph,
                                                         ::AllowAll, filtered_configs, true);

  // std::cout << "get_heuristics_list Statuses: ";
  // for (auto i = 0u ; i < statuses.size(); i++) {
  //   std::cout << cudnn_frontend::to_string(statuses[i]) << " ";
  // }
  // std::cout << std::endl;
  // std::cout << "Filter config list has " << filtered_configs.size() << " configurations " << std::endl;

  // some verbose printing:
  // std::cout << "Tensor shape: (" << tensorDims[0] << ", " << tensorDims[1] << ", " << tensorDims[2] << ", " <<
  // tensorDims[3] << ")" << std::endl;

  auto plan_builder = [&filtered_configs, &opGraph, &handle]() {
    for (auto i = 0u; i < filtered_configs.size(); i++) {
      try {
        auto plan = cudnn_frontend::ExecutionPlanBuilder()
                        .setHandle(handle)
                        .setEngineConfig(filtered_configs[i], opGraph.getTag())
                        .build();
        return plan;
      } catch (cudnn_frontend::cudnnException& e) {
        continue;
      }
    }
    return cudnn_frontend::ExecutionPlanBuilder()
        .setHandle(handle)
        .setEngineConfig(filtered_configs[0], opGraph.getTag())
        .build();
  };

  assert(filtered_configs.size() > 0);
  auto plan = plan_builder();

  return plan;
}

void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void* xDevPtr, void* yDevPtr, void* scaledevPtr,
                                void* biasdevPtr, void* in_meandevPtr, void* in_vardevPtr, void* out_meandevPtr,
                                void* out_vardevPtr, void* saved_meandevPtr, void* saved_inv_vardevPtr,
                                const std::vector<void*>& peer_devPtrs, double epsilon_val,
                                double exponential_decay_factor, size_t peer_size, int rank_id) {
  // get handle
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();

  // get stream
  cudaStream_t stream;
  cudnnGetStream(handle_, &stream);

  try {
    // allocate workspace
    auto workspace_size = plan.getWorkspaceSize();
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    void* workPtr = nullptr;
    if (workspace_size > 0) {
      workPtr = workspace_tensor.data_ptr<float>();
    }

    // first the data pointers
    std::vector<void*> data_ptrs{
        xDevPtr,        yDevPtr,       scaledevPtr,      biasdevPtr,          in_meandevPtr, in_vardevPtr,
        out_meandevPtr, out_vardevPtr, saved_meandevPtr, saved_inv_vardevPtr, &epsilon_val,  &exponential_decay_factor};
    data_ptrs.insert(data_ptrs.end(), peer_devPtrs.begin(), peer_devPtrs.end());
    // then the uids
    std::vector<int64_t> uids;
    for (size_t i = 100; i < 100 + data_ptrs.size(); ++i) {
      uids.push_back(i);
    }
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workPtr)
                           .setDataPointers(data_ptrs.size(), data_ptrs.data())
                           .setUids(uids.size(), uids.data())
                           .build();
    // std::cout << "variantPack " << variantPack.describe() << std::endl;
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);

    // Reset local communication buffer
    cudaMemsetAsync(peer_devPtrs[rank_id], 0, peer_size * 4, stream);

  } catch (cudnn_frontend::cudnnException& e) {
    struct cudaDeviceProp prop;
    checkCudaErr(cudaGetDeviceProperties(&prop, 0));
    if (prop.major == 8) {
      std::cout << "[ERROR] Exception " << e.what() << std::endl;
      assert(false);
    }
  }
}

cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t* tensorDims, int64_t* perChannelSum, int64_t* epsilon,
                                                      int64_t* peerDims, cudnnDataType_t data_type) {
  // get cudnn handle
  cudnnHandle_t handle = torch::native::getCudnnHandle();

  // Creates the necessary tensor descriptors
  int64_t tensor_stride[4];
  int64_t stride[4];
  int64_t peer_stride[4];

  // NHWC format. GenerateStrides() takes care of this. Howeever, tensor dims should still be NCHW
  generateStrides(tensorDims, tensor_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
  generateStrides(peerDims, peer_stride, (int64_t)4, CUDNN_TENSOR_NHWC);

  auto tensor_create = [&tensor_stride, &tensorDims](cudnnDataType_t type, int64_t id) {
    return cudnn_frontend::TensorBuilder()
        .setDim(4, tensorDims)
        .setStrides(4, tensor_stride)
        .setId(id)
        .setAlignment(16)
        .setDataType(type)
        .build();
  };

  auto peer_tensor_create = [&peer_stride, &peerDims](cudnnDataType_t type, int64_t id) {
    return cudnn_frontend::TensorBuilder()
        .setDim(4, peerDims)
        .setStrides(4, peer_stride)
        .setId(id)
        .setAlignment(16)
        .setDataType(type)
        .build();
  };

  generateStrides(perChannelSum, stride, (int64_t)4, CUDNN_TENSOR_NHWC);

  auto per_channel_tensor_create = [&stride, &perChannelSum](cudnnDataType_t type, int64_t id) {
    return cudnn_frontend::TensorBuilder()
        .setDim(4, perChannelSum)
        .setStrides(4, stride)
        .setId(id)
        .setAlignment(16)
        .setDataType(type)
        .build();
  };

  auto xTensor = tensor_create(data_type, 100);
  auto dyTensor = tensor_create(data_type, 101);
  auto scaleTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 102);
  auto savedMeanTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 103);
  auto savedInvVarTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 104);
  auto dxTensor = tensor_create(data_type, 105);
  auto dScaleTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 106);
  auto dBiasTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 107);

  int64_t epsilon_stride[4];
  generateStrides(epsilon, epsilon_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
  auto scalar_tensor_create = [&epsilon_stride, &epsilon](cudnnDataType_t type, int64_t id) {
    return cudnn_frontend::TensorBuilder()
        .setDim(4, epsilon)
        .setStrides(4, epsilon_stride)
        .setId(id)
        .setAlignment(16)
        .setDataType(type)
        .setByValue(true)
        .build();
  };

  auto epsilonTensor = scalar_tensor_create(CUDNN_DATA_DOUBLE, 108);

  std::vector<cudnn_frontend::Tensor_v8> peerStatTensors;
  for (size_t i = 109; i < 109 + peerDims[0]; ++i) {
    peerStatTensors.push_back(peer_tensor_create(CUDNN_DATA_FLOAT, i));
  }

#if (CUDNN_VERSION >= 8500)
  // Batch normalization
  cudnnBackendNormMode_t normalizationMode = CUDNN_BATCH_NORM;

  // Create a Finalize node
  auto batch_norm_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR)
                           .setNormalizationMode(normalizationMode)
                           .setxDesc(xTensor)
                           .setSavedMeanAndInvVar(savedMeanTensor, savedInvVarTensor)
                           .setdyDesc(dyTensor)
                           .setScale(scaleTensor)
                           .setEpsilonTensor(epsilonTensor)
                           .setDScaleAndDBias(dScaleTensor, dBiasTensor)
                           .setdxDesc(dxTensor)
                           .setPeerStatTensor(peerStatTensors)
                           .build();

  std::array<cudnn_frontend::Operation const*, 1> ops = {&batch_norm_op};
#else
  std::array<cudnn_frontend::Operation const*, 0> ops = {};
#endif

  auto opGraph =
      cudnn_frontend::OperationGraphBuilder().setHandle(handle).setOperationGraph(ops.size(), ops.data()).build();
  // std::cout << opGraph.describe() << std::endl;

  cudnn_frontend::EngineConfigList filtered_configs;
  auto statuses = cudnn_frontend::get_heuristics_list<2>({"heuristics_instant", "heuristics_fallback"}, opGraph,
                                                         ::AllowAll, filtered_configs, true);

  auto plan_builder = [&filtered_configs, &opGraph, &handle]() {
    for (auto i = 0u; i < filtered_configs.size(); i++) {
      try {
        auto plan = cudnn_frontend::ExecutionPlanBuilder()
                        .setHandle(handle)
                        .setEngineConfig(filtered_configs[i], opGraph.getTag())
                        .build();
        return plan;
      } catch (cudnn_frontend::cudnnException& e) {
        continue;
      }
    }
    return cudnn_frontend::ExecutionPlanBuilder()
        .setHandle(handle)
        .setEngineConfig(filtered_configs[0], opGraph.getTag())
        .build();
  };

  assert(filtered_configs.size() > 0);
  auto plan = plan_builder();

  return plan;
}

void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void* xDevPtr, void* dyDevPtr, void* scaledevPtr,
                                 void* saved_meandevPtr, void* saved_inv_vardevPtr,
                                 const std::vector<void*>& peer_devPtrs, void* dxDevPtr, void* dscaledevPtr,
                                 void* dbiasdevPtr, double epsilon_val, size_t peer_size, int rank_id) {
  // get handle
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();

  // get stream
  cudaStream_t stream;
  cudnnGetStream(handle_, &stream);

  try {
    // allocate workspace
    auto workspace_size = plan.getWorkspaceSize();
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    void* workPtr = nullptr;
    if (workspace_size > 0) {
      workPtr = workspace_tensor.data_ptr<float>();
    }

    // create helper arrays
    std::vector<void*> data_ptrs{xDevPtr,  dyDevPtr,     scaledevPtr, saved_meandevPtr, saved_inv_vardevPtr,
                                 dxDevPtr, dscaledevPtr, dbiasdevPtr, &epsilon_val};
    data_ptrs.insert(data_ptrs.end(), peer_devPtrs.begin(), peer_devPtrs.end());
    std::vector<int64_t> uids;
    for (size_t i = 100; i < 100 + data_ptrs.size(); ++i) {
      uids.push_back(i);
    }

    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workPtr)
                           .setDataPointers(data_ptrs.size(), data_ptrs.data())
                           .setUids(uids.size(), uids.data())
                           .build();
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());

    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);

    // Reset local communication buffer
    cudaMemsetAsync(peer_devPtrs[rank_id], 0, peer_size * 4, stream);

  } catch (cudnn_frontend::cudnnException& e) {
    struct cudaDeviceProp prop;
    checkCudaErr(cudaGetDeviceProperties(&prop, 0));
    if (prop.major == 8) {
      std::cout << "[ERROR] Exception " << e.what() << std::endl;
      assert(false);
    }
  }
}


================================================
FILE: apex/contrib/csrc/cudnn_gbn/norm_sample.h
================================================
#pragma once

/*
 * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#pragma once

#include <assert.h>
#include <ctype.h>
#include <cudnn.h>
#include <cudnn_frontend.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>

#include <functional>
#include <iostream>
#include <tuple>

/* some helpers
 */
void generateStrides(const int64_t* dimA, int64_t* strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat);

int64_t checkCudaError(cudaError_t code, const char* expr, const char* file, int line);
int64_t checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line);

#define checkCudaErr(...)                                                        \
  do {                                                                           \
    int64_t err = checkCudaError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
    assert(err == 0);                                                            \
  } while (0)

#define checkCudnnErr(...)                                                        \
  do {                                                                            \
    int64_t err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
    assert(err == 0);                                                             \
  } while (0)

/**
 * @brief Run a Group BN forward sample with 2 peer stat tensors.
 *
 * @param tensorDims an array with shape (N, C, H, W) for input tensor dims. Stride in NHWC or NCHW will take care of
 memory format
 * @param perChannelSum an array with shape (1, C, 1, 1) to denote the sum values for each channel in the input tensor
 * @param epsilon a scalar array with shape (1, 1, 1, 1) to represent the epsilon value for the BN
 * @param peerDims an array with shape (num GPUs, 2 * C, 1, 1) to denote the tensor dimensions for peer stat tensor in
 GBN

 *
 */
cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t* tensorDims, int64_t* perChannelSum, int64_t* epsilon,
                                                     int64_t* peerDims, cudnnDataType_t in_out_data_type);
/**
 * @param xDevPtr input tensor device pointer
 * @param yDevPtr output tensor device pointer
 * @param scaledevPtr input scale device pointer for BN scaling
 * @param biasdevPtr input scale device pointer for BN bias
 * @param in_meandevPtr Input mean device pointer
 * @param in_vardevPtr Input variance device pointer
 * @param out_meandevPtr output mean device pointer
 * @param out_vardevPtr output variance device pointer
 * @param saved_meandevPtr saved mean device pointer for BN backward
 * @param saved_inv_vardevPtr saved inverse variance device pointer for BN backward
 * @param peer_devPtr1 peer stat tensor 1 device pointer
 * @param peer_devPtr2 peer stat tensor 2 device pointer
 * @param epsilon_val episilon value as a double
 * @param exponential_decay_factor exponential_decay_factor as a value
 *
 **/
void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void* xDevPtr, void* yDevPtr, void* scaledevPtr,
                                void* biasdevPtr, void* in_meandevPtr, void* in_vardevPtr, void* out_meandevPtr,
                                void* out_vardevPtr, void* saved_meandevPtr, void* saved_inv_vardevPtr,
                                const std::vector<void*>& peer_devPtrs, double epsilon_val,
                                double exponential_decay_factor, size_t peer_size, int rank_id);

/**
 * @brief Run a Group BN backward sample with 2 peer stat tensors.
 *
 * @param tensorDims an array with shape (N, C, H, W) for input tensor dims. Stride in NHWC or NCHW will take care of
 * memory format
 * @param perChannelSum an array with shape (1, C, 1, 1) to denote the sum values for each channel in the input tensor
 * @param epsilon a scalar array with shape (1, 1, 1, 1) to represent the epsilon value for the BN
 * @param peerDims an array with shape (num GPUs, 2 * C, 1, 1) to denote the tensor dimensions for peer stat tensor in
 * GBN
 *
 */
cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t* tensorDims, int64_t* perChannelSum, int64_t* epsilon,
                                                      int64_t* peerDims, cudnnDataType_t data_type);

/**
 * @brief Run a Group BN backward sample with 2 peer stat tensors.
 *
 * @param xDevPtr input tensor device pointer
 * @param yDevPtr output tensor device pointer
 * @param scaledevPtr input scale device pointer for BN scaling
 * @param biasdevPtr input scale device pointer for BN bias
 * @param in_meandevPtr Input mean device pointer
 * @param in_vardevPtr Input variance device pointer
 * @param out_meandevPtr output mean device pointer
 * @param out_vardevPtr output variance device pointer
 * @param saved_meandevPtr saved mean device pointer for BN backward
 * @param saved_inv_vardevPtr saved inverse variance device pointer for BN backward
 * @param peer_devPtr1 peer stat tensor 1 device pointer
 * @param peer_devPtr2 peer stat tensor 2 device pointer
 * @param epsilon_val episilon value as a double
 *
 */
void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void* xDevPtr, void* dyDevPtr, void* scaledevPtr,
                                 void* saved_meandevPtr, void* saved_inv_vardevPtr,
                                 const std::vector<void*>& peer_devPtrs, void* dxDevPtr, void* dscaledevPtr,
                                 void* dbiasdevPtr, double epsilon_val, size_t peer_size, int rank_id);


================================================
FILE: apex/contrib/csrc/fmha/fmha_api.cpp
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>

#include "fmha.h"

extern at::Tensor& mha_fill(at::Tensor& self, const at::Tensor& start_index);
void set_params(Fused_multihead_attention_fprop_params& params,
                // sizes
                const size_t b, const size_t s, const size_t h, const size_t d,
                // device pointers
                void* qkv_packed_d, void* cu_seqlens_d, void* o_packed_d, void* s_d, float p_dropout) {
  Data_type acc_type = DATA_TYPE_FP32;
  Data_type data_type = DATA_TYPE_FP16;

  // Reset the parameters
  memset(&params, 0, sizeof(params));

  // Set the pointers and strides.
  params.qkv_ptr = qkv_packed_d;
  params.qkv_stride_in_bytes = get_size_in_bytes(h * 3 * d, data_type);
  params.o_ptr = o_packed_d;
  params.o_stride_in_bytes = get_size_in_bytes(h * d, data_type);

  params.cu_seqlens = static_cast<int*>(cu_seqlens_d);

  // S = softmax(P)
  params.s_ptr = s_d;
  params.s_stride_in_bytes = get_size_in_bytes(b * h * s, data_type);

  // Set the dimensions.
  params.b = b;
  params.h = h;
  params.s = s;
  params.d = d;

  // Set the different scale values.
  const float scale_bmm1 = 1.f / sqrtf(d);
  constexpr float scale_softmax = 1.f;
  constexpr float scale_bmm2 = 1.f;

  set_alpha(params.scale_bmm1, scale_bmm1, data_type);
  set_alpha(params.scale_softmax, scale_softmax, acc_type);
  set_alpha(params.scale_bmm2, scale_bmm2, data_type);

  // Set this to probability of keeping an element to simplify things.
  params.p_dropout = 1.f - p_dropout;
  params.rp_dropout = 1.f / params.p_dropout;
  TORCH_CHECK(p_dropout < 1.f);
  set_alpha(params.scale_dropout, params.rp_dropout, data_type);
}

std::vector<at::Tensor> mha_fwd(
    const at::Tensor& qkv,         // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
    const at::Tensor& cu_seqlens,  // b+1
    const float p_dropout, const int max_seq_len, const bool is_training, const bool is_nl, const bool zero_tensors,
    c10::optional<at::Generator> gen_) {
  using namespace torch::indexing;
  auto dprops = at::cuda::getCurrentDeviceProperties();
  TORCH_CHECK((dprops->major == 8 && dprops->minor == 0) || (dprops->major == 9 && dprops->minor == 0) ||
              (dprops->major == 10 && dprops->minor == 0) || (dprops->major == 12 && dprops->minor == 0));
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  Launch_params<Fused_multihead_attention_fprop_params> launch_params(dprops, stream, is_training, is_nl);

  int seq_len = 512;
  auto launch = &run_fmha_fp16_512_64_sm80;
  if (max_seq_len <= 128) {
    seq_len = 128;
    launch = &run_fmha_fp16_128_64_sm80;
  } else if (max_seq_len <= 256) {
    seq_len = 256;
    launch = &run_fmha_fp16_256_64_sm80;
  } else if (max_seq_len <= 384) {
    seq_len = 384;
    launch = &run_fmha_fp16_384_64_sm80;
  } else if (max_seq_len <= 512) {
    seq_len = 512;
    launch = &run_fmha_fp16_512_64_sm80;
  } else {
    TORCH_CHECK(false);
  }

  TORCH_CHECK(qkv.is_cuda())
  TORCH_CHECK(cu_seqlens.is_cuda())

  TORCH_CHECK(qkv.is_contiguous())
  TORCH_CHECK(cu_seqlens.is_contiguous())

  TORCH_CHECK(cu_seqlens.dim() == 1);
  TORCH_CHECK(qkv.dim() == 4);

  const auto sizes = qkv.sizes();

  TORCH_CHECK(sizes[THREE_DIM] == 3);

  const int batch_size = cu_seqlens.numel() - 1;
  const int total = sizes[TOTAL_DIM];
  const int num_heads = sizes[H_DIM];
  const int head_size = sizes[D_DIM];
  TORCH_CHECK(batch_size > 0);
  TORCH_CHECK(head_size == 64);
  auto opts = qkv.options();

  auto ctx = torch::empty({total, num_heads, head_size}, opts);

  auto s = torch::empty({batch_size, num_heads, seq_len, seq_len}, opts);

  if (zero_tensors) {
    mha_fill(ctx, cu_seqlens.index({Slice(-1, None)}));
  }

  auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(gen_, at::cuda::detail::getDefaultCUDAGenerator());

  set_params(launch_params.params, batch_size, seq_len, num_heads, head_size, qkv.data_ptr(), cu_seqlens.data_ptr(),
             ctx.data_ptr(), s.data_ptr(), p_dropout);

  launch(launch_params, /*configure=*/true);
  // number of times random will be generated per thread, to offset philox counter in thc random
  // state
  int64_t counter_offset = launch_params.elts_per_thread;
  at::PhiloxCudaState rng_engine_inputs;

  if (is_training) {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen->mutex_);
    launch_params.params.philox_args = gen->philox_cuda_state(counter_offset);
  }

  launch(launch_params, /*configure=*/false);

  return {ctx, s};
}

std::vector<at::Tensor> mha_bwd(
    const at::Tensor& dout,        // total x num_heads, x head_size
    const at::Tensor& qkv,         // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
    at::Tensor& softmax,           // b x h x s x s softmax and dmask - will be overwritten with dP
    const at::Tensor& cu_seqlens,  // b+1
    const float p_dropout,         // probability to drop
    const int max_seq_len,         // max sequence length to choose the kernel
    const bool zero_tensors) {
  using namespace torch::indexing;
  auto dprops = at::cuda::getCurrentDeviceProperties();
  TORCH_CHECK((dprops->major == 8 && dprops->minor == 0) || (dprops->major == 9 && dprops->minor == 0) ||
              (dprops->major == 10 && dprops->minor == 0) || (dprops->major == 12 && dprops->minor == 0));
  int seq_len = 512;
  auto launch = &run_fmha_dgrad_fp16_512_64_sm80;
  if (max_seq_len <= 128) {
    seq_len = 128;
    launch = &run_fmha_dgrad_fp16_128_64_sm80;
  } else if (max_seq_len <= 256) {
    seq_len = 256;
    launch = &run_fmha_dgrad_fp16_256_64_sm80;
  } else if (max_seq_len <= 384) {
    seq_len = 384;
    launch = &run_fmha_dgrad_fp16_384_64_sm80;
  } else if (max_seq_len <= 512) {
    seq_len = 512;
    launch = &run_fmha_dgrad_fp16_512_64_sm80;
  } else {
    TORCH_CHECK(false);
  }

  auto stream = at::cuda::getCurrentCUDAStream().stream();

  TORCH_CHECK(qkv.dtype() == torch::kFloat16);
  TORCH_CHECK(dout.dtype() == torch::kFloat16);
  TORCH_CHECK(softmax.dtype() == torch::kFloat16);
  TORCH_CHECK(cu_seqlens.dtype() == torch::kInt32);

  TORCH_CHECK(qkv.is_cuda());
  TORCH_CHECK(cu_seqlens.is_cuda());

  TORCH_CHECK(qkv.is_contiguous());
  TORCH_CHECK(cu_seqlens.is_contiguous());

  TORCH_CHECK(cu_seqlens.dim() == 1);
  TORCH_CHECK(qkv.dim() == 4);

  const auto sizes = qkv.sizes();

  TORCH_CHECK(sizes[THREE_DIM] == 3);

  const int batch_size = cu_seqlens.numel() - 1;
  const int num_heads = sizes[H_DIM];
  const int head_size = sizes[D_DIM];
  TORCH_CHECK(batch_size > 0);
  TORCH_CHECK(head_size == 64);

  auto dqkv = torch::empty_like(qkv);

  if (zero_tensors) {
    mha_fill(dqkv, cu_seqlens.index({Slice(-1, None)}));
  }

  Fused_multihead_attention_fprop_params params;

  set_params(params, batch_size, seq_len, num_heads, head_size, qkv.data_ptr(), cu_seqlens.data_ptr(),
             dout.data_ptr(),     // we set o_ptr to dout
             softmax.data_ptr(),  // softmax gets overwritten by dP!
             p_dropout);

  // we're re-using these scales
  Data_type acc_type = DATA_TYPE_FP32;
  set_alpha(params.scale_bmm1, 1.f, acc_type);
  set_alpha(params.scale_softmax, 1.f / sqrtf(head_size), acc_type);
  set_alpha(params.scale_bmm2, 1.f, DATA_TYPE_FP16);
  params.dqkv_ptr = dqkv.data_ptr();

  launch(params, stream);
  return {dqkv, softmax};
}

std::vector<at::Tensor> mha_bwd_nl(
    const at::Tensor& dout,        // total x num_heads, x head_size
    const at::Tensor& qkv,         // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
    at::Tensor& softmax,           // b x h x s x s softmax and dmask - will be overwritten with dP
    const at::Tensor& cu_seqlens,  // b+1
    const float p_dropout,         // probability to drop
    const int max_seq_len,         // max sequence length to choose the kernel
    const bool zero_tensors) {
  auto stream = at::cuda::getCurrentCUDAStream().stream();

  TORCH_CHECK(qkv.is_cuda())
  TORCH_CHECK(cu_seqlens.is_cuda())

  TORCH_CHECK(qkv.is_contiguous())
  TORCH_CHECK(cu_seqlens.is_contiguous())

  TORCH_CHECK(cu_seqlens.dim() == 1);

  TORCH_CHECK(qkv.dim() == 4);

  const auto sizes = qkv.sizes();

  TORCH_CHECK(sizes[THREE_DIM] == 3);

  const int batch_size = cu_seqlens.numel() - 1;

  const int total = sizes[TOTAL_DIM];
  const int num_heads = sizes[H_DIM];
  const int head_size = sizes[D_DIM];
  TORCH_CHECK(batch_size > 0);
  TORCH_CHECK(head_size == 64);

  int seq_len = 512;
  auto launch = &run_fmha_dgrad_fp16_512_64_sm80_nl;

  auto opts = qkv.options();

  auto dqkv = torch::empty_like(qkv);

  if (zero_tensors) {
    dqkv.zero_();
  }

  int num_chunks = 2;
  if (batch_size == 1) {
    num_chunks = 4;
  } else if (batch_size == 2) {
    num_chunks = 3;
  }
  auto dkv = torch::empty({total, num_chunks, 2, num_heads, head_size}, opts);

  Fused_multihead_attention_fprop_params params;

  set_params(params, batch_size, seq_len, num_heads, head_size, qkv.data_ptr(), cu_seqlens.data_ptr(),
             dout.data_ptr(),     // o_ptr = dout
             softmax.data_ptr(),  // softmax gets overwritten by dP!
             p_dropout);

  params.dkv_ptr = dkv.data_ptr();

  Data_type acc_type = DATA_TYPE_FP32;
  set_alpha(params.scale_bmm1, 1.f, acc_type);
  set_alpha(params.scale_softmax, 1.f / sqrtf(head_size), acc_type);
  set_alpha(params.scale_bmm2, 1.f, DATA_TYPE_FP16);
  params.dqkv_ptr = dqkv.data_ptr();

  launch(params, num_chunks, stream);

  // SPLIT-K reduction of num_chunks dK, dV parts

  // The equivalent of the following Pytorch code:
  // using namespace torch::indexing;
  // at::Tensor view_out = dqkv.index({Slice(), Slice(1, None, None)});
  // torch::sum_out(view_out, dkv, 1);

  const int hidden_size = num_heads * head_size;
  fmha_run_noloop_reduce(dqkv.data_ptr(), dkv.data_ptr(), cu_seqlens.data_ptr<int>(), hidden_size, batch_size, total,
                         num_chunks, stream);

  return {dqkv, softmax, dkv};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.doc() = "Fused Multi-head Self-attention for BERT";
  m.def("fwd", &mha_fwd, "Forward pass", py::call_guard<py::gil_scoped_release>());
  m.def("bwd", &mha_bwd, "Backward pass", py::call_guard<py::gil_scoped_release>());
  m.def("bwd_nl", &mha_bwd_nl, "Backward pass (small-batch)", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha/gemm.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <fmha/utils.h>

#define FMHA_DIV_UP(m, n) (((m) + (n) - 1) / (n))

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Data_type_, int NUM_ELTS_, int BITS_PER_ELT_, int ALIGNMENT_>
struct Fragment_base_ {
  // The data type.
  using Data_type = Data_type_;
  // default input type
  using Input_type_ = Data_type_;
  // Does it store the array of elements.
  enum { HAS_ELTS = BITS_PER_ELT_ >= 8 };
  // The number of elements.
  enum { NUM_ELTS = NUM_ELTS_ };
  // The size of element in bits.
  enum { BITS_PER_ELT = BITS_PER_ELT_ };
  // The size of byte of a single register.
  enum { BYTES_PER_REG = 4 };
  // The size in bits.
  enum { BITS_PER_REG = BYTES_PER_REG * 8 };
  // The number of registers needed to store the fragment.
  enum { NUM_REGS = Div_up<NUM_ELTS * BITS_PER_ELT, BITS_PER_REG>::VALUE };
  // The size in bytes (as returned by sizeof(Fragment_base<>).
  enum { SIZE_IN_BYTES = NUM_REGS * BYTES_PER_REG };
  // The alignment.
  enum { ALIGNMENT = ALIGNMENT_ > 0 ? ALIGNMENT_ : Min<NUM_REGS * BYTES_PER_REG, 16>::VALUE };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The type of the elements.
    typename Data_type_,
    // The number of elements.
    int NUM_ELTS_,
    // The alignment if you want to force a value -- use 0 otherwise.
    int ALIGNMENT_ = 0,
    // The base class.
    typename Base_ = Fragment_base_<Data_type_, NUM_ELTS_, 8 * sizeof(Data_type_), ALIGNMENT_> >
struct alignas(static_cast<int>(Base_::ALIGNMENT)) Fragment : public Base_ {
  // The size of a load/store.
  enum { BYTES_PER_LOAD_STORE = Base_::NUM_REGS * sizeof(uint32_t) };

  // Clear the fragment. Using PTX in that code seems to produce better SASS...
  inline __device__ void clear() {
#pragma unroll
    for (int ii = 0; ii < Base_::NUM_REGS; ++ii) {
      asm volatile("mov.u32 %0, 0; \n" : "=r"(this->reg(ii)) :);
    }
  }

  // Immutable access to a register.
  inline __device__ const uint32_t& reg(int ii) const { return this->regs_[ii]; }

  // Mutable access to a register.
  inline __device__ uint32_t& reg(int ii) { return this->regs_[ii]; }

  uint32_t regs_[Base_::NUM_REGS];

  // Immutable access to the elements.
  inline __device__ const Data_type_& elt(int ii) const {
    return reinterpret_cast<const Data_type_*>(&this->regs_[0])[ii];
  }

  // Mutable access to the elements.
  inline __device__ Data_type_& elt(int ii) { return reinterpret_cast<Data_type_*>(&this->regs_[0])[ii]; }

  // Immutable access to the elements with a cast.
  template <typename Cast_type>
  inline __device__ const Cast_type& elt_as(int ii) const {
    return reinterpret_cast<const Cast_type*>(&this->regs_[0])[ii];
  }

  // Mutable access to the elements.
  template <typename Cast_type>
  inline __device__ Cast_type& elt_as(int ii) {
    return reinterpret_cast<Cast_type*>(&this->regs_[0])[ii];
  }

  // Add another fragment.
  inline __device__ void add(const Fragment& other) {
#pragma unroll
    for (int ii = 0; ii < NUM_ELTS_; ++ii) {
      this->elt(ii) += other.elt(ii);
    }
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Layout>
struct Fragment_a : public Fragment<uint16_t, 8> {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Layout>
struct Fragment_b : public Fragment<uint16_t, 8> {};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Fragment_accumulator : public Fragment<float, 8> {
  // The base class.
  using Base = Fragment<float, 8>;

  // Add two fragments.
  template <typename Other_fragment_>
  inline __device__ void add(const Other_fragment_& other) {
    for (int ii = 0; ii < Base::NUM_ELTS; ++ii) {
      this->elt(ii) = this->elt(ii) + other.elt(ii);
    }
  }

  // Do the HMMA.
  template <typename Layout_a, typename Layout_b>
  inline __device__ void mma(const Fragment_a<Layout_a>& a, const Fragment_b<Layout_b>& b) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n"
        "    {%0, %1, %2, %3}, \n"
        "    {%4, %5, %6, %7}, \n"
        "    {%8, %9}, \n"
        "    {%0, %1, %2, %3}; \n"
        : "+f"(elt(0)), "+f"(elt(1)), "+f"(elt(2)), "+f"(elt(3))
        : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(0)), "r"(b.reg(1)));
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n"
        "    {%0, %1, %2, %3}, \n"
        "    {%4, %5, %6, %7}, \n"
        "    {%8, %9}, \n"
        "    {%0, %1, %2, %3}; \n"
        : "+f"(elt(4)), "+f"(elt(5)), "+f"(elt(6)), "+f"(elt(7))
        : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(2)), "r"(b.reg(3)));
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Fragment, int M, int N>
inline __device__ void clear(Fragment (&frag)[M][N]) {
#pragma unroll
  for (int mi = 0; mi < M; ++mi) {
#pragma unroll
    for (int ni = 0; ni < N; ++ni) {
      frag[mi][ni].clear();
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Accumulator_type, int WARPS_K>
struct Clear_accumulator {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int WARPS_K>
struct Clear_accumulator<float, WARPS_K> {
  template <typename Acc, int M, int N>
  static inline __device__ void apply(Acc (&acc)[M][N], bool = false) {
    fmha::clear(acc);
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Acc, typename A, typename B, int M, int N>
inline __device__ void gemm(Acc (&acc)[M][N], const A (&a)[M], const B (&b)[N]) {
#pragma unroll
  for (int mi = 0; mi < M; ++mi) {
#pragma unroll
    for (int ni = 0; ni < N; ++ni) {
      acc[mi][ni].mma(a[mi], b[ni]);
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The number of rows in the CTA tile.
    int M_,
    // The number of cols in the CTA tile.
    int N_,
    // The number of elements in the the K dimension of the GEMM loop.
    int K_,
    // The number of rows of warps.
    int WARPS_M_,
    // The number of cols of warps.
    int WARPS_N_,
    // The number of warps in the K dimension of the GEMM loop.
    int WARPS_K_>
struct Cta_tile_ {
  enum { M = M_, N = N_, K = K_ };
  // The number of warps.
  enum { WARPS_M = WARPS_M_, WARPS_N = WARPS_N_, WARPS_K = WARPS_K_ };
  // The number of warps per CTA.
  enum { WARPS_PER_CTA = WARPS_M * WARPS_N * WARPS_K };
  // The number of threads per warp.
  enum { THREADS_PER_WARP = 32 };
  // The number of threads per CTA.
  enum { THREADS_PER_CTA = WARPS_PER_CTA * THREADS_PER_WARP };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile>
struct Hmma_tile {
  // The number of elements computed with a single warp-MMA.
  enum { M_PER_MMA = 16, N_PER_MMA = 16, K_PER_MMA = 16 };

  // The number of elements computed with a single CTA-MMA.
  enum {
    M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARPS_M,
    N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARPS_N,
    K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARPS_K
  };

  // The number of MMAs needed to compute the GEMM.
  enum {
    MMAS_M = Div_up<Cta_tile::M, M_PER_MMA_PER_CTA>::VALUE,
    MMAS_N = Div_up<Cta_tile::N, N_PER_MMA_PER_CTA>::VALUE,
    MMAS_K = Div_up<Cta_tile::K, K_PER_MMA_PER_CTA>::VALUE,
  };

  // The number of elements computed per warp.
  enum {
    M_PER_WARP = MMAS_M * M_PER_MMA,
    N_PER_WARP = MMAS_N * N_PER_MMA,
    K_PER_WARP = MMAS_K * K_PER_MMA,
  };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

using A_type = uint16_t;
using B_type = uint16_t;
using C_type = uint16_t;
using Accumulator_type = float;
using Epilogue_type = float;

constexpr int BITS_PER_ELEMENT_A = sizeof(A_type) * 8;
constexpr int BITS_PER_ELEMENT_B = sizeof(B_type) * 8;
constexpr int BITS_PER_ELEMENT_C = sizeof(C_type) * 8;

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int M, int N, int K, int WARPS_M, int WARPS_N, int WARPS_K>
using Cta_tile_extd = Cta_tile_<M, N, K, WARPS_M, WARPS_N, WARPS_K>;

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile_>
using Cta_tile_with_k_with_padding = Cta_tile_extd<Cta_tile_::M, Cta_tile_::N, Next_power_of_two<Cta_tile_::K>::VALUE,
                                                   Cta_tile_::WARPS_M, Cta_tile_::WARPS_N, Cta_tile_::WARPS_K>;

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha/gmem_tile.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The number of bits per element.
    int BITS_PER_ELEMENT,
    // The number of rows of Q, K or V loaded by this tile.
    int ROWS,
    // The number of columns.
    int COLS,
    // The number of matrics.
    int NUM_MATS = 3>
struct Gmem_tile_qkv {
  // The size of each LDG.
  enum { BYTES_PER_LDG = 16 };
  // The size of a row in bytes.
  enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 };

  // The number of threads to load a "row" of the matrix.
  enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG };

  // The number of "rows" loaded per LDG.
  enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
  // The number of LDGs needed to load a chunk of the Q matrix.
  enum { LDGS = fmha::Div_up<ROWS, ROWS_PER_LDG>::VALUE };

  // Ctor.
  template <typename Params, typename BInfo>
  inline __device__ Gmem_tile_qkv(const Params& params, const int qkv_offset, const BInfo& binfo, const int tidx)
      : params_qkv_stride_in_bytes_(params.qkv_stride_in_bytes),
        actual_seqlen(binfo.actual_seqlen),
        qkv_ptr_(reinterpret_cast<char*>(params.qkv_ptr)) {
    // Compute the position in the sequence (within the CTA for the moment).
    int row = tidx / THREADS_PER_ROW;
    // Compute the position of the thread in the row.
    int col = tidx % THREADS_PER_ROW;

    // Store the row as we need it to disable the loads.
    row_ = row;

    // The row offset in the batched GEMM. For each seq element, we store QKV in that order.
    int64_t row_offset = (int64_t)row * params.qkv_stride_in_bytes;
    // Add the block index.
    row_offset += (int64_t)((binfo.sum_s * NUM_MATS + qkv_offset) * binfo.h + binfo.bidh) * BYTES_PER_ROW;

    // Assemble the final pointer.
    qkv_ptr_ += row_offset + col * BYTES_PER_LDG;
  }

  // Store data to shared memory.
  template <typename Smem_tile>
  inline __device__ void commit(Smem_tile& smem_tile) {
    smem_tile.store(fetch_);
  }

  // Load data from memory.
  template <typename Smem_tile>
  inline __device__ void load(Smem_tile& smem_tile) {
    const void* ptrs[LDGS];
    uint32_t preds[LDGS];
#pragma unroll
    for (int ii = 0; ii < LDGS; ++ii) {
      ptrs[ii] = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_;
      preds[ii] = ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen));
      fetch_[ii] = make_uint4(0, 0, 0, 0);
    }

    // not packing predicates removes restrictions (e.g. FP16 384, 4 warps)
    Ldg_functor<uint4, LDGS> fct(fetch_, ptrs);
#pragma unroll
    for (int ii = 0; ii < LDGS; ++ii) {
      fct.load(ii, preds[ii]);
    }
  }

  // Store data to memory.
  inline __device__ void store(const uint4 (&data)[LDGS]) {
#pragma unroll
    for (int ii = 0; ii < LDGS; ++ii) {
      char* ptr = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_;
      if ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen)) {
        fmha::stg(ptr, data[ii]);
      }
    }
  }

  // Move the pointer to the next location.
  inline __device__ void move() {
    qkv_ptr_ += (int64_t)ROWS * params_qkv_stride_in_bytes_;
    actual_seqlen -= ROWS;
  }

  inline __device__ void move(int steps) {
    qkv_ptr_ += (int64_t)ROWS * params_qkv_stride_in_bytes_ * steps;
    actual_seqlen -= ROWS * steps;
  }

  // The stride between rows for the QKV matrice.
  int64_t params_qkv_stride_in_bytes_;
  // The pointer.
  char* qkv_ptr_;
  // The fetch registers.
  uint4 fetch_[LDGS];
  // Keep track of the row the thread is processing as we move the tile.
  int row_;
  // The length of the sequence loaded by that memory tile.
  int actual_seqlen;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile>
struct Gmem_tile_o {
  // The mma tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;

  // The size of each element.
  enum { BYTES_PER_ELEMENT = 2 };
  // The size of a row in bytes.
  enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT };

  // The number of threads to store a "row" of the matrix.
  enum { THREADS_PER_ROW = 16 };
  // The size of each STG.
  enum { BYTES_PER_STG = BYTES_PER_ROW / THREADS_PER_ROW };

  // The number of "rows" stored per iteration of the loop. The output of 1 MMA.
  enum { ROWS = Cta_tile::M };
  // The number of "rows" stored per iteration of the loop. The output of 1 MMA.
  enum { ROWS_PER_LOOP = ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA };
  // The number of outter loop for the stores.
  enum { LOOPS = ROWS / ROWS_PER_LOOP };

  // The number of "rows" stored per STG.
  enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
  // Do we have to guard against partial writes/reads.
  enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 };
  // The number of STGs needed to store a chunk of the Q matrix.
  enum { STGS_PER_LOOP = fmha::Div_up<ROWS_PER_LOOP, ROWS_PER_STG>::VALUE };
  // The number of STGs needed to store a chunk of the Q matrix in total.
  enum { STGS = STGS_PER_LOOP * LOOPS };

  // Ctor.
  template <typename Params, typename BInfo>
  inline __device__ Gmem_tile_o(const Params& params, const BInfo& binfo, int tidx)
      : params_o_stride_in_bytes_(params.o_stride_in_bytes),
        actual_seqlen_(binfo.actual_seqlen),
        o_ptr_(reinterpret_cast<char*>(params.o_ptr)) {
    // Compute the position in the sequence (within the CTA for the moment).
    int row = tidx / THREADS_PER_ROW;
    // Compute the position of the thread in the row.
    int col = tidx % THREADS_PER_ROW;

    // Store the row as we need it to disable loads.
    row_ = row;

    // The row offset in the batched GEMM.
    int64_t row_offset = (int64_t)row * params.o_stride_in_bytes + binfo.bidx * BYTES_PER_ROW;
    // Assemble the final pointer.
    o_ptr_ += row_offset + col * BYTES_PER_STG;

    // Is that thread active on the last STG?
    if (HAS_INCOMPLETE_STG) {
      is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M;
    }
  }

  // Store data to global memory.
  inline __device__ void store(const uint4 (&src)[STGS_PER_LOOP], int mi) {
#pragma unroll
    for (int ii = 0; ii < STGS_PER_LOOP; ++ii) {
      int jj = mi * STGS_PER_LOOP + ii;
      if (this->row_ + jj * ROWS_PER_STG >= this->actual_seqlen_) {
        break;
      }

      float x = reinterpret_cast<const float&>(src[ii].x);
      float y = reinterpret_cast<const float&>(src[ii].y);
      float z = reinterpret_cast<const float&>(src[ii].z);
      float w = reinterpret_cast<const float&>(src[ii].w);
      uint2 out = float4_to_half4(x, y, z, w);
      if (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_)) {
        fmha::stg(this->o_ptr_ + jj * ROWS_PER_STG * this->params_o_stride_in_bytes_, out);
      }
    }
  }

  // Move the pointer to the next location.
  inline __device__ void move() {
    row_ += ROWS;
    o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_;
  }

  inline __device__ void move(const int steps) {
    row_ += ROWS * steps;
    o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_ * steps;
  }

  // The stride between rows for the QKV matrice.
  int64_t params_o_stride_in_bytes_;
  // The pointer.
  char* o_ptr_;
  // Is the thread active for the last STG?
  int is_active_for_last_stg_;
  // Keep track of the row to disable loads.
  int row_;
  // The length of the sequence loaded by that memory tile.
  int actual_seqlen_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile, int BYTES_PER_ELEMENT>
struct Gmem_tile_mma_sd {
  // The mma tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;

  // Each STG stores 8 elements.
  enum { BYTES_PER_STG = BYTES_PER_ELEMENT * 8 };
  // The number of MMAs in the M dimension.
  enum { MMAS_M = Mma_tile::MMAS_M };
  // The number of MMAs in the N dimension.
  enum { MMAS_N = Mma_tile::MMAS_N };
  // The number of rows computed per MMA per thread block.
  enum { M_PER_MMA_PER_CTA = Mma_tile::M_PER_MMA_PER_CTA };
  // The number of cols computed per MMA per thread block.
  enum { N_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA };
  // The number of threads per block.
  enum { THREADS_PER_CTA = Cta_tile::THREADS_PER_CTA };
  // The size of each row in bytes. I.e. how many bytes are stored per STG.
  enum { BYTES_PER_ROW = THREADS_PER_CTA * BYTES_PER_STG };
  // The fixed sequence length.
  enum { SEQLEN = Cta_tile::N };
  // The distance between two blocks (in bytes).
  enum { BLOCK_STRIDE_BYTES = SEQLEN * SEQLEN * BYTES_PER_ELEMENT };
  // The distance between elements stored per loop (in bytes).
  enum { LOOP_STRIDE_BYTES = MMAS_M * MMAS_N * BYTES_PER_ROW };

  // The type of elements stored per STG.
  using Type = typename fmha::Uint_from_size_in_bytes<BYTES_PER_STG>::Type;

  // Ctor.
  template <typename Params>
  inline __device__ Gmem_tile_mma_sd(void* ptr, const Params& params, const int bidb, const int bidh, const int tidx)
      : ptr_(static_cast<char*>(ptr)) {
    // The block index.
    size_t bidx = bidb * params.h + bidh;

    // Set store location for each thread at the beginning of the loop
    ptr_ += bidx * BLOCK_STRIDE_BYTES + tidx * BYTES_PER_STG;
  }

  // Store to global memory.
  inline __device__ void store(const Type& data, const int mi, const int ni) {
    size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW;
    fmha::stg(ptr_ + offset, data);
  }

  // Load from global memory.
  inline __device__ void load(Type& data, const int mi, const int ni) {
    size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW;
    fmha::ldg(data, ptr_ + offset);
  }

  // Move to the next tile.
  inline __device__ void move() { ptr_ += LOOP_STRIDE_BYTES; }
  inline __device__ void move(const int steps) { ptr_ += LOOP_STRIDE_BYTES * steps; }

  // The pointer in global memory.
  char* ptr_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile, typename Base = Gmem_tile_mma_sd<Cta_tile, sizeof(uint16_t)> >
struct Gmem_tile_mma_s : public Base {
  // The number of mmas in the vertical dimension.
  enum { M = Base::MMAS_M };
  // The number of mmas in the horizontal dimension.
  enum { N = Base::MMAS_N };
  // The type of the vectors stored by each STG.
  using Type = typename Base::Type;

  // Ctor.
  template <typename Params, typename Block_info>
  inline __device__ Gmem_tile_mma_s(const Params& params, const Block_info& binfo, const int tidx)
      : Base(params.s_ptr, params, binfo.bidb, binfo.bidh, tidx) {}

  // Store to global memory.
  template <typename Mask>
  inline __device__ void store(const float (&softmax)[2 * M][4 * N], const Mask& mask) {
#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ni = 0; ni < N; ni++) {
        float tmp00 = softmax[2 * mi + 0][4 * ni + 0];
        float tmp01 = softmax[2 * mi + 0][4 * ni + 1];
        float tmp02 = softmax[2 * mi + 0][4 * ni + 2];
        float tmp03 = softmax[2 * mi + 0][4 * ni + 3];

        float tmp10 = softmax[2 * mi + 1][4 * ni + 0];
        float tmp11 = softmax[2 * mi + 1][4 * ni + 1];
        float tmp12 = softmax[2 * mi + 1][4 * ni + 2];
        float tmp13 = softmax[2 * mi + 1][4 * ni + 3];

        uint4 dst;
        dst.x = fmha::float2_to_half2(tmp00, tmp01);
        dst.y = fmha::float2_to_half2(tmp02, tmp03);
        dst.z = fmha::float2_to_half2(tmp10, tmp11);
        dst.w = fmha::float2_to_half2(tmp12, tmp13);
        if (mask.is_valid(mi, ni, 0, 0)) {
          Base::store(dst, mi, ni);
        }
      }
    }
  }

  // Store to global memory.
  template <typename Mask, typename Fragment>
  inline __device__ void store(const Fragment (&frag)[N][M], const Mask& mask) {
#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ni = 0; ni < N; ni++) {
        uint4 dst;
        dst.x = frag[ni][mi].reg(0);
        dst.y = frag[ni][mi].reg(2);
        dst.z = frag[ni][mi].reg(1);
        dst.w = frag[ni][mi].reg(3);
        if (mask.any_valid(mi, ni)) {
          Base::store(dst, mi, ni);
        }
      }
    }
  }

  // Load from global memory.
  template <typename Mask>
  inline __device__ void load(uint4 (&regs)[M][N], const Mask& mask) {
#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ni = 0; ni < N; ni++) {
        regs[mi][ni] = make_uint4(0, 0, 0, 0);
        if (mask.any_valid(mi, ni)) {
          Base::load(regs[mi][ni], mi, ni);
        }
      }
    }
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The base class.
    typename Base = fmha::Gmem_tile_qkv<Cta_tile, fmha::BITS_PER_ELEMENT_A, Cta_tile::M, Cta_tile::K> >
struct Gmem_tile_dout : public Base {
  // Ctor.
  template <typename Params, typename BInfo>
  inline __device__ Gmem_tile_dout(const Params& params, const BInfo& binfo, int tidx) : Base(params, 0, binfo, tidx) {
    this->qkv_ptr_ = reinterpret_cast<char*>(params.o_ptr);
    this->params_qkv_stride_in_bytes_ = params.o_stride_in_bytes;  // needed for move

    // Compute the position of the thread in the row.
    int col = tidx % Base::THREADS_PER_ROW;

    // The row offset in the batched GEMM. For each seq element, we store O in that order.
    int64_t row_offset = (int64_t)this->row_ * params.o_stride_in_bytes + binfo.bidx * Base::BYTES_PER_ROW;

    // Assemble the final pointer.
    this->qkv_ptr_ += row_offset + col * Base::BYTES_PER_LDG;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile, typename Base = fmha::Gmem_tile_o<Cta_tile> >
struct Gmem_tile_dq : public Base {
  // Ctor.
  template <typename Params, typename BInfo>
  inline __device__ Gmem_tile_dq(const Params& params, const BInfo& binfo, int tidx) : Base(params, binfo, tidx) {
    this->o_ptr_ = reinterpret_cast<char*>(params.dqkv_ptr);
    this->params_o_stride_in_bytes_ = params.qkv_stride_in_bytes;  // needed for move

    // Compute the position of the thread in the row.
    int col = tidx % Base::THREADS_PER_ROW;

    // The row offset in the batched GEMM. For each seq element, we store O in that order.
    int64_t row_offset = (int64_t)this->row_ * params.qkv_stride_in_bytes +
                         (binfo.sum_s * 3 * binfo.h + binfo.bidh) * Base::BYTES_PER_ROW;

    // Assemble the final pointer.
    this->o_ptr_ += row_offset + col * Base::BYTES_PER_STG;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha/kernel_traits.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include "gmem_tile.h"
#include "smem_tile.h"

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int S, int D, int STEP, int WARPS_M, int WARPS_N, uint32_t FLAGS = 0x08u>
struct FMHA_kernel_traits {
  // The CTA description for the 1st GEMM.
  using Cta_tile_p = fmha::Cta_tile_extd<STEP, S, D, WARPS_M, WARPS_N, 1>;
  // The CTA description for the 2nd GEMM.
  using Cta_tile_o = fmha::Cta_tile_extd<STEP, D, S, WARPS_M, 1, WARPS_N>;

  // Do we use one buffer for K and V.
  enum { SHARE_SMEM_FOR_K_AND_V = (FLAGS & 0x08u) != 0u };
  // Do we keep K in registers.
  enum { K_IN_REGS = (FLAGS & 0x10u) == 0u };

  // The global memory tile to load Q.
  using Gmem_tile_q = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_A, STEP, D>;

  // The shared memory tile to swizzle Q.
  using Smem_tile_q = fmha::Smem_tile_a<Cta_tile_p, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 1>;

  // The global memory tile to load K.
  using Gmem_tile_k = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_B, S, D>;
  // The shared memory tile to swizzle K.
  using Smem_tile_k = fmha::Smem_tile_b<Cta_tile_p, fmha::Col>;

  // The global memory tile to load V.
  using Gmem_tile_v = fmha::Gmem_tile_qkv<Cta_tile_o, fmha::BITS_PER_ELEMENT_B, S, D>;
  // The shared memory tile to swizzle V.
  using Smem_tile_v = fmha::Smem_tile_v<Cta_tile_o>;

  // The global memory tile to store O.
  using Gmem_tile_o = fmha::Gmem_tile_o<Cta_tile_o>;
  // The shared memory tile for O.
  using Smem_tile_o = fmha::Smem_tile_o<Cta_tile_o>;

  // The global memory tile to load/store S.
  using Gmem_tile_s = fmha::Gmem_tile_mma_s<Cta_tile_p>;

  // The shared memory tile to transpose S.
  using Smem_tile_st = fmha::Smem_tile_mma_transposed<Cta_tile_p>;

  using Gmem_tile_do = fmha::Gmem_tile_dout<Cta_tile_p>;

  // Make sure the number of threads match.
  static_assert((int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, "");

  // The number of threads.
  enum { THREADS = Cta_tile_p::THREADS_PER_CTA };
  // Make sure the number of threads matches both CTAs.
  static_assert((int)THREADS == (int)Cta_tile_o::THREADS_PER_CTA, "");

  // The amount of shared memory needed to load Q and K.
  enum { BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE };
  // The extra amount of shared memory needed to load V.
  enum { BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE };
  // The amount of shared memory needed for Q, K and V..
  enum { BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V };
  // The amount of shared memory needed to load Q and store O.
  enum { BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE };

  // The amount of shared memory needed for Q, K, V and O.
  enum { BYTES_PER_SMEM = fmha::Max<BYTES_PER_SMEM_QKV, BYTES_PER_SMEM_QO>::VALUE };
  // Make sure we have enough shared memory.
  static_assert(Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE <= BYTES_PER_SMEM, "");
};

////////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/fmha/src/fmha/mask.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

namespace fmha {

template <typename Cta_tile>
struct Mask {
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;

  template <typename Params, typename BInfo>
  __device__ Mask(const Params& params, const BInfo& blockInfo, int tidx) {
    actual_seqlen = blockInfo.actual_seqlen;

    const int warp = tidx / Cta_tile::THREADS_PER_WARP;
    const int lane = tidx % Cta_tile::THREADS_PER_WARP;

    static_assert(Cta_tile::WARPS_K == 1, "");

    // find the warp in the Cta tile
    const int warp_n = (warp / Cta_tile::WARPS_M);
    const int warp_m = (warp % Cta_tile::WARPS_M);
    // decompose warp into 8x4 tile
    const int quad = lane / 4;
    const int tid = (lane % 4) * 2;
    row = warp_m * 16 + quad;
    col = warp_n * 16 + tid;
  }

  inline __device__ bool is_valid(const int mi, const int ni, const int ii, const int jj) const {
    // ii and jj iterate over the 2x4 fragment
    const bool col_valid = (ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1)) < actual_seqlen;
    //&& (row + mi * Mma_tile::M_PER_MMA_PER_CTA + ii * 8) < actual_seqlen;
    return col_valid;
    // return row_valid && col_valid;
  }

  // BERT Mask: if upper left is invalid, none are valid
  inline __device__ bool any_valid(int mi, int ni) const { return is_valid(mi, ni, 0, 0); }

  inline __device__ void load(int it) { row_offset = it * Cta_tile::M + row; }
  int row_offset;

  int row;
  int col;
  int actual_seqlen;
};

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha/smem_tile.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <fmha/gemm.h>
#include <fmha/utils.h>

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The description of the tile computed by this CTA.
    typename Cta_tile,
    // The number of rows in the 2D shared memory buffer.
    int M_,
    // The number of cols.
    int N_,
    // The size in bits of each element.
    int BITS_PER_ELEMENT_,
    // The number of bytes per STS.
    int BYTES_PER_STS_ = 16,
    // The number of buffers. (Used in multistage and double buffer cases.)
    int BUFFERS_PER_TILE_ = 1,
    // Do we enable the fast path for LDS.128 and friends.
    int ENABLE_LDS_FAST_PATH_ = 0,
    // The number of rows that are used for the XOR swizzling to allow fast STS/LDS.
    int ROWS_PER_XOR_PATTERN_ = 8,
    // The number of cols that are used for the XOR swizzling to allow fast STS/LDS.
    int COLS_PER_XOR_PATTERN_ = 1,
    // Use or not predicates
    bool USE_PREDICATES_ = true>
struct Smem_tile_without_skews {
  // The size in bits of each element.
  enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ };
  // The size in bytes of a single STS.
  enum { BYTES_PER_STS = BYTES_PER_STS_ };
  // The number of elements per STS.
  enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT };
  // To support arbitrary N, we pad some values to a power-of-2.
  enum { N_WITH_PADDING = Next_power_of_two<N_>::VALUE };
  // The number of bytes per row without packing of rows.
  enum { BYTES_PER_ROW_BEFORE_PACKING = N_WITH_PADDING * BITS_PER_ELEMENT / 8 };
  // The number of bytes per row -- we want at least 128B per row.
  enum { BYTES_PER_ROW = Max<BYTES_PER_ROW_BEFORE_PACKING, 128>::VALUE };
  // The number of rows in shared memory (two rows may be packed into a single one).
  enum { ROWS = M_ * BYTES_PER_ROW_BEFORE_PACKING / BYTES_PER_ROW };

  // The number of threads per row.
  enum { THREADS_PER_ROW_UNBOUNDED = BYTES_PER_ROW / BYTES_PER_STS };
  // The number of threads per row.
  enum { THREADS_PER_ROW = Min<Cta_tile::THREADS_PER_CTA, THREADS_PER_ROW_UNBOUNDED>::VALUE };

  // The number of STS per row.
  enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS };
  // It must be at least one.
  static_assert(STS_PER_ROW >= 1, "");
  // The number of rows written with a single STS.
  enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
  // Make sure we write to at least one row per STS. Thanks Dr. Obvious ;)
  static_assert(ROWS_PER_STS >= 1, "");
  // The number of STS needed to store all rows.
  enum { STS_PER_COL = Div_up<ROWS, ROWS_PER_STS>::VALUE };
  // The number of STS in total.
  enum { STS = STS_PER_COL * STS_PER_ROW };

  // The size of one buffer in bytes in shared memory.
  enum { BYTES_PER_BUFFER = STS * BYTES_PER_STS * Cta_tile::THREADS_PER_CTA };
  // The number of buffers.
  enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ };
  // The size in bytes of total buffers.
  enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE };
  // The boundary for smem_read_offset and smem_write_offset increment.
  enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER };

  // Do we enable the LDS.128 fast path?
  enum { ENABLE_LDS_FAST_PATH = ENABLE_LDS_FAST_PATH_ };
  static_assert(ENABLE_LDS_FAST_PATH == 0);
  // The number of rows that are used for the XOR swizzling to allow fast STS/LDS.
  enum { ROWS_PER_XOR_PATTERN = ROWS_PER_XOR_PATTERN_ };
  // The number of cols that are used for the XOR swizzling to allow fast STS/LDS.
  enum { COLS_PER_XOR_PATTERN = COLS_PER_XOR_PATTERN_ * 16 / BYTES_PER_STS };
  // Use or not predicates
  enum { USE_PREDICATES = USE_PREDICATES_ };

  // The type of elements that are stored in shared memory by each thread.
  using Store_type = typename Uint_from_size_in_bytes<BYTES_PER_STS>::Type;

  // Ctor.
  inline __device__ Smem_tile_without_skews(void* smem, int tidx) : smem_(__nvvm_get_smem_pointer(smem)) {
    // The row written by a thread. See doc/mma_smem_layout.xlsx.
    int smem_write_row = tidx / THREADS_PER_ROW;

    // The XOR pattern.
    int smem_write_xor = smem_write_row % ROWS_PER_XOR_PATTERN * COLS_PER_XOR_PATTERN;
    // Compute the column and apply the XOR pattern.
    int smem_write_col = (tidx % THREADS_PER_ROW) ^ smem_write_xor;

    // The offset.
    this->smem_write_offset_ = smem_write_row * BYTES_PER_ROW + smem_write_col * BYTES_PER_STS;

    // TODO: Why not merge it with the read offset?
    this->smem_read_buffer_ = __shfl_sync(0xffffffff, 0, 0);
    this->smem_write_buffer_ = __shfl_sync(0xffffffff, 0, 0);
  }

  // Compute the store pointers.
  template <int N>
  inline __device__ void compute_store_pointers(uint32_t (&ptrs)[N]) {
#pragma unroll
    for (int ii = 0; ii < N; ++ii) {
      // Decompose the STS into row/col.
      int row = ii / STS_PER_ROW;
      int col = ii % STS_PER_ROW;

      // Assemble the offset.
      int offset = smem_write_offset_ + row * ROWS_PER_STS * BYTES_PER_ROW;

      // Take the column into account.
      if (STS_PER_ROW > 1) {
        offset += col * THREADS_PER_ROW * BYTES_PER_STS;
      }

      // Apply the XOR pattern if needed.
      if (ROWS_PER_STS < ROWS_PER_XOR_PATTERN) {
        const int m = row * ROWS_PER_STS % ROWS_PER_XOR_PATTERN;
        offset ^= m * COLS_PER_XOR_PATTERN * BYTES_PER_STS;
      }

      // Assemble the final pointer :)
      ptrs[ii] = smem_ + offset + smem_write_buffer_;
    }
  }

  inline __device__ void debug_reset() {
    for (int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) {
      for (int row = 0; row < ROWS; ++row) {
        for (int col = 0; col < BYTES_PER_ROW; col += 4) {
          if (threadIdx.x == 0) {
            uint32_t val = 0x0;
            sts(val, smem_ + row * BYTES_PER_ROW + col + buffer);
          }
        }
      }
    }
  }

  // Print the content of the tile (only for debug ;)).
  inline __device__ void debug_print() const {
    for (int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) {
      for (int row = 0; row < ROWS; ++row) {
        for (int col = 0; col < BYTES_PER_ROW; col += 4) {
          if (threadIdx.x == 0) {
            uint32_t val;
            lds(val, smem_ + row * BYTES_PER_ROW + col + buffer);
            printf("block=(x=%2d, y=%2d, z=%2d) (smem_=%2d, buffer=%2d, row=%2d, byte=%4d)=0x%08x\n", blockIdx.x,
                   blockIdx.y, blockIdx.z, smem_, buffer, row, col, val);
          }
        }
      }
    }
  }

  // Move the read offset to next buffer.
  inline __device__ void move_to_next_read_buffer() {
    if (BUFFERS_PER_TILE > 1 && smem_read_buffer_ >= BYTES_PER_TILE_INC_BOUNDARY) {
      this->smem_read_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY;
    } else if (BUFFERS_PER_TILE > 1) {
      this->smem_read_buffer_ += BYTES_PER_BUFFER;
    }
  }

  // Move the read offset to next buffer. TODO: Remove this member function!!!
  inline __device__ void move_next_read_buffer() { this->move_to_next_read_buffer(); }

  // Move the read offset to next N buffer (circular-buffer).
  inline __device__ void move_to_next_read_buffer(int N) {
    if (BUFFERS_PER_TILE > 1) {
      this->smem_read_buffer_ += N * BYTES_PER_BUFFER;
      this->smem_read_buffer_ -= smem_read_buffer_ >= BYTES_PER_TILE ? BYTES_PER_TILE : 0;
    }
  }

  // Move the read offset to next N buffer (circular-buffer). TODO: Remove this member function!!!
  inline __device__ void move_next_read_buffer(int N) { this->move_to_next_read_buffer(N); }

  // Move the write offset to next buffer.
  inline __device__ void move_to_next_write_buffer() {
    if (BUFFERS_PER_TILE > 1 && smem_write_buffer_ >= BYTES_PER_TILE_INC_BOUNDARY) {
      this->smem_write_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY;
    } else if (BUFFERS_PER_TILE > 1) {
      this->smem_write_buffer_ += BYTES_PER_BUFFER;
    }
  }

  // Move the write offset to next buffer. TODO: Remove that member function!
  inline __device__ void move_next_write_buffer() { this->move_to_next_write_buffer(); }

  // Move the read offset.
  inline __device__ void move_read_offset(int delta) { this->smem_read_offset_ += delta; }

  // Move the write offset.
  inline __device__ void move_write_offset(int delta) { this->smem_write_offset_ += delta; }

  // Store to the tile in shared memory.
  template <int N>
  inline __device__ void store(const Store_type (&data)[N], uint64_t = 0) {
    uint32_t smem_ptrs[N];
    this->compute_store_pointers(smem_ptrs);
    sts(smem_ptrs, data);
  }

  // Store to the tile in shared memory.
  template <int N, int M>
  inline __device__ void store(const Store_type (&data)[N], uint32_t (&preds)[M], uint64_t = 0) {
    uint32_t smem_ptrs[N];
    this->compute_store_pointers(smem_ptrs);
    sts(smem_ptrs, data, preds);
  }

  // Store to the tile in shared memory.
  template <int N>
  inline __device__ void store(const Store_type (&data)[N], uint32_t preds, uint64_t = 0) {
    this->store(data, preds);
  }

  // Store to the tile in shared memory.
  template <int N>
  inline __device__ void store(const void* (&gmem_ptrs)[N], uint32_t preds, uint64_t = 0) {
    uint32_t tmp[1] = {preds};
    this->store(gmem_ptrs, tmp);
  }

  // The shared memory pointer.
  uint32_t smem_;
  // The read offset. Reserve 4 offsets if needed.
  int smem_read_offset_;
  // The write offset.
  int smem_write_offset_;
  // The buffer base offset for read.
  int smem_read_buffer_;
  // The buffer base offset for write.
  int smem_write_buffer_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The layout of the tile.
    typename Layout,
    // The size of the STS.
    int BYTES_PER_STS = 16,
    // The number of buffers per tile.
    int BUFFERS_PER_TILE = 1,
    // Use or not predicates
    bool USE_PREDICATES = true>
struct Smem_tile_a {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int MMAS_K, int MMAS_K_WITH_PADDING>
struct Compute_reset_mask {
  // The potential mask.
  enum { HALF = MMAS_K_WITH_PADDING / 2 };
  // The remainder.
  enum { MOD = MMAS_K % HALF };
  // The final value.
  enum { VALUE = (MMAS_K == MOD ? 0 : HALF) | Compute_reset_mask<MOD, HALF>::VALUE };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int MMAS_K_WITH_PADDING>
struct Compute_reset_mask<0, MMAS_K_WITH_PADDING> {
  enum { VALUE = 0 };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int MMAS_K>
struct Compute_reset_mask<MMAS_K, MMAS_K> {
  enum { VALUE = MMAS_K - 1 };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
struct Rows_per_xor_pattern_a {
  // The size in bits.
  enum { N_IN_BITS = N * fmha::BITS_PER_ELEMENT_A };
  // The number of rows.
  enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8) };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
struct Rows_per_xor_pattern_row_a : public Rows_per_xor_pattern_a<N> {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The size of the STS.
    int BYTES_PER_STS,
    // The number of buffers per tile.
    int BUFFERS_PER_TILE,
    // How many rows to use for the XOR pattern to avoid bank conflicts?
    int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_row_a<Cta_tile::K>::VALUE>
struct Smem_tile_row_a : public Smem_tile_without_skews<Cta_tile, Cta_tile::M, Cta_tile::K, fmha::BITS_PER_ELEMENT_A,
                                                        BYTES_PER_STS, BUFFERS_PER_TILE, 0, ROWS_PER_XOR_PATTERN_, 1> {
  // The MMA tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;
  // The base class.
  using Base = Smem_tile_without_skews<Cta_tile, Cta_tile::M, Cta_tile::K, fmha::BITS_PER_ELEMENT_A, BYTES_PER_STS,
                                       BUFFERS_PER_TILE, 0, ROWS_PER_XOR_PATTERN_, 1>;
  // The fragment.
  using Fragment = Fragment_a<Row>;

  // When we use padding to reach a power of two, special care has to be taken.
  using Cta_tile_with_padding = Cta_tile_with_k_with_padding<Cta_tile>;
  // The number of MMAs.
  using Mma_tile_with_padding = fmha::Hmma_tile<Cta_tile_with_padding>;

  // The size of a single LDS in bytes.
  enum { BYTES_PER_LDS = 16 };

  // Ctor.
  inline __device__ Smem_tile_row_a(void* smem, int tidx) : Base(smem, tidx) {
    // For documentation on the layout, see doc/mma_smem_layout.xlsx.

    // The number of warps.
    const int WARPS_M = Cta_tile::WARPS_M;
    const int WARPS_N = Cta_tile::WARPS_N;
    const int WARPS_K = Cta_tile::WARPS_K;

    static_assert(WARPS_M == 1);
    static_assert(WARPS_N == 4 || WARPS_N == 8);
    static_assert(WARPS_K == 1);
    static_assert(Base::ROWS_PER_XOR_PATTERN == 8);

    // The row and column read by the thread.
    int smem_read_row = (tidx & 0x0f);
    int smem_read_col = (tidx & 0x07);
    smem_read_col ^= (tidx & 0x10) / 16;

    // The shared memory offset.
    this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS;
  }

  // Rewind smem_read_offset for last LDS phase in main loop.
  inline __device__ void reverse_smem_read_offset(int ki = 0) {
    // Undo the pointer increment for the next ni.
    // Should match the load function below for ki = 0.
    if (Mma_tile_with_padding::MMAS_K >= 2) {
      this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
    }
  }

  // Load from shared memory.
  inline __device__ void load(Fragment (&a)[Mma_tile::MMAS_M], int ki) {
#pragma unroll
    for (int mi = 0; mi < Mma_tile::MMAS_M; ++mi) {
      // Jump by as many matrix rows as needed (a row in smem may pack multiple matrix rows).
      int offset = mi * Mma_tile::M_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING;

      // Load using LDSM.M88.4.
      uint4 tmp;
      ldsm(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset);

      // Store the value into the fragment.
      a[mi].reg(0) = tmp.x;
      a[mi].reg(1) = tmp.y;
      a[mi].reg(2) = tmp.z;
      a[mi].reg(3) = tmp.w;
    }

    // Move the offset to the next possition. See doc/mma_smem_layout.xlsx.
    static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented");
    if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) {
      this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2;
    } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) {
      this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2;
    } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) {
      this->smem_read_offset_ ^= 7 * BYTES_PER_LDS * 2;
    } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) {
      this->smem_read_offset_ ^= 3 * BYTES_PER_LDS * 2;
    } else if (Mma_tile_with_padding::MMAS_K >= 2) {
      this->smem_read_offset_ ^= 1 * BYTES_PER_LDS * 2;
    }
  }

  // Reset the read offset.
  inline __device__ void reset_read_offset() {
    // The number of MMAs in the K dimension.
    enum { MMAS_K = Mma_tile::MMAS_K };
    // The number of MMAs in the K dimension when we include padding.
    enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K };
    // Assemble the mask.
    enum { MASK = Compute_reset_mask<MMAS_K, MMAS_K_WITH_PADDING>::VALUE };

    // Reset the read offset.
    this->smem_read_offset_ ^= MASK * BYTES_PER_LDS * 2;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The size of the STS.
    int BYTES_PER_STS,
    // The number of buffers per tile.
    int BUFFERS_PER_TILE>
struct Smem_tile_a<Cta_tile, Row, BYTES_PER_STS, BUFFERS_PER_TILE>
    : public Smem_tile_row_a<Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE> {
  // The base class.
  using Base = Smem_tile_row_a<Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE>;

  // Ctor.
  inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {}
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The layout of the tile.
    typename Layout,
    // The size of the STS.
    int BYTES_PER_STS = 16,
    // The number of buffers per tile.
    int BUFFERS_PER_TILE = 1,
    // Use or not predicates
    bool USE_PREDICATES = true>
struct Smem_tile_b {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
struct Rows_per_xor_pattern_b {
  // The size in bits.
  enum { N_IN_BITS = N * fmha::BITS_PER_ELEMENT_B };
  // The number of rows.
  enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8) };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
struct Rows_per_xor_pattern_col_b : public Rows_per_xor_pattern_b<N> {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The size of the STS.
    int BYTES_PER_STS,
    // The number of buffers per tile.
    int BUFFERS_PER_TILE,
    // How many rows to use for the XOR pattern to avoid bank conflicts?
    int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_col_b<Cta_tile::K>::VALUE>
struct Smem_tile_col_b : public Smem_tile_without_skews<Cta_tile, Cta_tile::N, Cta_tile::K, fmha::BITS_PER_ELEMENT_B,
                                                        BYTES_PER_STS, BUFFERS_PER_TILE, 0, ROWS_PER_XOR_PATTERN_, 1> {
  // The MMA tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;
  // The base class.
  using Base = Smem_tile_without_skews<Cta_tile, Cta_tile::N, Cta_tile::K, fmha::BITS_PER_ELEMENT_B, BYTES_PER_STS,
                                       BUFFERS_PER_TILE, 0, ROWS_PER_XOR_PATTERN_, 1>;
  // The fragment.
  using Fragment = Fragment_b<Col>;

  // When we use padding to reach a power of two, special care has to be taken.
  using Cta_tile_with_padding = Cta_tile_with_k_with_padding<Cta_tile>;
  // The number of MMAs.
  using Mma_tile_with_padding = fmha::Hmma_tile<Cta_tile_with_padding>;

  // The size of a single LDS in bytes.
  enum { BYTES_PER_LDS = 16 };

  // The number of STS per thread
  enum { STS_PER_THREAD_ = Base::ROWS * Base::THREADS_PER_ROW / Cta_tile::THREADS_PER_CTA };
  // The number of STS per thread must be at least 1.
  enum { STS_PER_THREAD = Max<1, STS_PER_THREAD_>::VALUE };

  // Ctor.
  inline __device__ Smem_tile_col_b(void* smem, int tidx) : Base(smem, tidx) {
    // For documentation on the layout, see doc/mma_smem_layout.xlsx.

    // The number of warps.
    const int WARPS_M = Cta_tile::WARPS_M;
    const int WARPS_N = Cta_tile::WARPS_N;
    const int WARPS_K = Cta_tile::WARPS_K;
    static_assert(Base::ROWS_PER_XOR_PATTERN == 8);
    static_assert(WARPS_M == 1);
    static_assert(WARPS_N == 4 || WARPS_N == 8);
    static_assert(WARPS_K == 1);

    // The masks to select the warps.
    const int WARP_MASK_N = Warp_masks<WARPS_M, WARPS_N, WARPS_K>::N;

    // The divisor for the warps.
    const int WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP;

    // The row and column read by the thread.
    int smem_read_row = (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA + (tidx & 0x07) + (tidx & 0x10) / 2;
    int smem_read_col = (tidx & 0x07);
    smem_read_col ^= (tidx & 0x08) / 8;
    // The shared memory offset.
    this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS;
  }

  // Rewind smem_read_offset for last LDS phase in main loop.
  inline __device__ void reverse_smem_read_offset(int ki = 0) {
    // Undo the pointer increment for the next ni.
    // Should match the load function below for ki = 0.
    if (Mma_tile_with_padding::MMAS_K >= 2) {
      this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
    }
  }

  // Load from shared memory.
  inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) {
#pragma unroll
    for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) {
      // Jump by as many matrix rows as needed (a row in smem may pack multiple matrix rows).
      int offset = ni * Mma_tile::N_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING;

      // Load using LDSM.M88.4.
      uint4 tmp;
      ldsm(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset);

      // Store the value into the fragment.
      b[ni].reg(0) = tmp.x;
      b[ni].reg(1) = tmp.y;
      b[ni].reg(2) = tmp.z;
      b[ni].reg(3) = tmp.w;
    }

    // Move the offset to the next possition. See doc/mma_smem_layout.xlsx.
    static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented");
    if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) {
      this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2;
    } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) {
      this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2;
    } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) {
      this->smem_read_offset_ ^= 7 * BYTES_PER_LDS * 2;
    } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) {
      this->smem_read_offset_ ^= 3 * BYTES_PER_LDS * 2;
    } else if (Mma_tile_with_padding::MMAS_K >= 2) {
      this->smem_read_offset_ ^= 1 * BYTES_PER_LDS * 2;
    }
  }

  // Reset the read offset.
  inline __device__ void reset_read_offset() {
    // The number of MMAs in the K dimension.
    enum { MMAS_K = Mma_tile::MMAS_K };
    // The number of MMAs in the K dimension when we include padding.
    enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K };
    // Assemble the mask.
    enum { MASK = Compute_reset_mask<MMAS_K, MMAS_K_WITH_PADDING>::VALUE };

    // Reset the read offset.
    this->smem_read_offset_ ^= MASK * BYTES_PER_LDS * 2;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The size of the STS.
    int BYTES_PER_STS,
    // The number of buffers per tile.
    int BUFFERS_PER_TILE>
struct Smem_tile_b<Cta_tile, Col, BYTES_PER_STS, BUFFERS_PER_TILE>
    : public Smem_tile_col_b<Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE> {
  // The base class.
  using Base = Smem_tile_col_b<Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE>;

  // Ctor.
  inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {}
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
struct Rows_per_xor_pattern_row_b : public Rows_per_xor_pattern_b<N> {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The size of the STS.
    int BYTES_PER_STS,
    // The number of buffers per tile.
    int BUFFERS_PER_TILE,
    // How many rows to use for the XOR pattern to avoid bank conflicts?
    int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_row_b<Cta_tile::N>::VALUE,
    // How many cols to use for the XOR pattern to avoid bank conflicts?
    int COLS_PER_XOR_PATTERN_ = 1>
struct Smem_tile_row_b
    : public Smem_tile_without_skews<Cta_tile, Cta_tile::K, Cta_tile::N, fmha::BITS_PER_ELEMENT_B, BYTES_PER_STS,
                                     BUFFERS_PER_TILE, 0, ROWS_PER_XOR_PATTERN_, COLS_PER_XOR_PATTERN_> {
  // The MMA tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;
  // The base class.
  using Base = Smem_tile_without_skews<Cta_tile, Cta_tile::K, Cta_tile::N, fmha::BITS_PER_ELEMENT_B, BYTES_PER_STS,
                                       BUFFERS_PER_TILE, 0, ROWS_PER_XOR_PATTERN_, COLS_PER_XOR_PATTERN_>;
  // The fragment.
  using Fragment = Fragment_b<Row>;

  // Can we use LDSM? No if the data type is 32-bit large.
  enum { USE_LDSMT = fmha::BITS_PER_ELEMENT_B == 16 };
  // The size of a single LDS in bytes.
  enum { BYTES_PER_LDS = USE_LDSMT ? 16 : 4 };
  // The number of elements per LDS.
  enum { ELEMENTS_PER_LDS = BYTES_PER_LDS * 8 / fmha::BITS_PER_ELEMENT_B };

  // The number of STS per thread
  enum { STS_PER_THREAD_ = Base::ROWS * Base::THREADS_PER_ROW / Cta_tile::THREADS_PER_CTA };
  // The number of STS per thread must be at least 1.
  enum { STS_PER_THREAD = Max<1, STS_PER_THREAD_>::VALUE };

  // Ctor.
  inline __device__ Smem_tile_row_b(void* smem, int tidx) : Base(smem, tidx) {
    // The number of warps.
    const int WARPS_M = Cta_tile::WARPS_M;
    const int WARPS_N = Cta_tile::WARPS_N;
    const int WARPS_K = Cta_tile::WARPS_K;
    static_assert(WARPS_K == 1);
    static_assert(WARPS_M == 4 || WARPS_M == 8);
    static_assert(WARPS_N == 1);

    // The masks to select the warps.
    const int WARP_MASK_N = Warp_masks<WARPS_M, WARPS_N, WARPS_K>::N;
    const int WARP_MASK_K = Warp_masks<WARPS_M, WARPS_N, WARPS_K>::K;

    // The divisor for the warps.
    const int WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP;
    const int WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP;

    // The row/col read by the thread.
    int smem_read_row, smem_read_col;

    static_assert(USE_LDSMT);
    static_assert(Base::ROWS_PER_XOR_PATTERN == 8);

    smem_read_row = (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile::MMAS_K * 16 + (tidx & 0x07) + (tidx & 0x08);
    smem_read_col = (tidx & 0x07);
    smem_read_col ^= (tidx & WARP_MASK_N) / WARP_DIV_N * 2 + (tidx & 0x10) / 16;

    // The shared memory offset.
    this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS;

    // Fill zeroes for group conv
  }

  // Rewind smem_read_offset for last LDS phase in main loop.
  inline __device__ void reverse_smem_read_offset(int ki = 0) {
    // The size of each element in bits.
    const int BITS_PER_ELT = fmha::BITS_PER_ELEMENT_B;
    // The size in bytes of the data needed to compute an MMA per CTA.
    const int BYTES_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA * BITS_PER_ELT / 8;

#pragma unroll
    for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) {
      // Undo the pointer increment for the next ni.
      // Should match the load function below for ki = 0.
      if (BYTES_PER_MMA_PER_CTA >= 128) {
        // Nothing to do!
      } else if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1) {
        this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA;
      } else if (BYTES_PER_MMA_PER_CTA == 64) {
        // Nothing to do!
      } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 4) {
        this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6);
      } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 2) {
        this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
      }
    }

    // Reset smem_read_offset for odd MMAS_N > 1 (npo2 kernels)
    if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 && Mma_tile::MMAS_N % 2 == 1) {
      this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA;
    }
  }

  // Load from shared memory.
  inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) {
    // The size of each element in bits.
    const int BITS_PER_ELT = fmha::BITS_PER_ELEMENT_B;
    // The size in bytes of the data needed to compute an MMA per CTA.
    const int BYTES_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA * BITS_PER_ELT / 8;

#pragma unroll
    for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) {
      // Prepare the offset.
      int offset = ki * Base::ROWS_PER_XOR_PATTERN * 2 * Base::BYTES_PER_ROW;
      if (BYTES_PER_MMA_PER_CTA == 32) {
        offset += this->smem_read_offset_;
      } else if (BYTES_PER_MMA_PER_CTA == 64) {
        offset += this->smem_read_offset_ + (ni / 2) * BYTES_PER_MMA_PER_CTA * 2;
      } else {
        offset += this->smem_read_offset_ + (ni)*BYTES_PER_MMA_PER_CTA;
      }

      // Load the data using LDSM.MT88.2.
      uint32_t ptr = this->smem_ + this->smem_read_buffer_ + offset;
      uint4 tmp;
      if (USE_LDSMT) {
        ldsmt(tmp, ptr);
      } else {
        lds(tmp.x, (ptr) + 0 * Base::BYTES_PER_ROW);
        lds(tmp.y, (ptr) + 4 * Base::BYTES_PER_ROW);
        lds(tmp.z, (ptr ^ 32) + 0 * Base::BYTES_PER_ROW);
        lds(tmp.w, (ptr ^ 32) + 4 * Base::BYTES_PER_ROW);
      }

      // Store those values in the fragment.
      b[ni].reg(0) = tmp.x;
      b[ni].reg(1) = tmp.y;
      b[ni].reg(2) = tmp.z;
      b[ni].reg(3) = tmp.w;

      // Move the pointer for the next ni. I expect the compiler to not recompute those.
      if (BYTES_PER_MMA_PER_CTA >= 128) {
        // Nothing to do!
      } else if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1) {
        this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA;
      } else if (BYTES_PER_MMA_PER_CTA == 64) {
        // Nothing to do!
      } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 4) {
        this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6);
      } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 2) {
        this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
      }
    }

    // Reset smem_read_offset for odd MMAS_N > 1 (npo2 kernels)
    if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 && Mma_tile::MMAS_N % 2 == 1) {
      this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA;
    }
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    // The dimensions of the tile computed by the CTA.
    typename Cta_tile,
    // The size of the STS.
    int BYTES_PER_STS,
    // The number of buffers per tile.
    int BUFFERS_PER_TILE>
struct Smem_tile_b<Cta_tile, Row, BYTES_PER_STS, BUFFERS_PER_TILE>
    : public Smem_tile_row_b<Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE> {
  // The base class.
  using Base = Smem_tile_row_b<Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE>;

  // Ctor.
  inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {}
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile>
struct Smem_tile_v : public fmha::Smem_tile_without_skews<Cta_tile, Cta_tile::K, Cta_tile::N, 16, 16, 1, 0, 8, 1> {
  // The base class.
  using Base = Smem_tile_without_skews<Cta_tile, Cta_tile::K, Cta_tile::N, 16, 16, 1, 0, 8, 1>;
  // The MMA tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;
  // The fragment.
  using Fragment = Fragment_b<fmha::Col>;

  // The size of a single LDS in bytes.
  enum { BYTES_PER_LDS = 16 };

  // Ctor.
  inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {
    // The row/col read by the thread.
    int read_row, read_col;

    static_assert(Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 1 &&
                  (Cta_tile::WARPS_K == 4 || Cta_tile::WARPS_K == 8));

    read_row = (tidx & 0xe0) / 2 + (tidx & 0x0f);
    read_col = (tidx & 0x07);
    read_col ^= (tidx & 0x10) / 16;

    // The shared memory offset.
    this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW + read_col * BYTES_PER_LDS;
  }

  // Load from shared memory.
  inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) {
#pragma unroll
    for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) {
      // Jump by 16 * #warps row.
      int row = ki * 16 * Cta_tile::WARPS_K;

      // Load the data using LDSM.MT88.2.
      uint4 tmp;
      fmha::ldsmt(tmp, this->smem_ + this->smem_read_offset_ + row * Base::BYTES_PER_ROW);
      b[ni].reg(0) = tmp.x;
      b[ni].reg(1) = tmp.y;
      b[ni].reg(2) = tmp.z;
      b[ni].reg(3) = tmp.w;

      // Move the pointer for the next ni. I expect the compiler to not recompute those.
      if (Mma_tile::MMAS_N == 4) {
        this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6);
      } else {
        assert(false);  // Not implemented!
      }
    }
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile>
struct Smem_tile_o {
  // The MMA tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;
  // The accumulators.
  using Accumulator = fmha::Fragment_accumulator;
  // The accumulators.
  using Data_type = typename Accumulator::Data_type;

  // The size of each element.
  enum { BYTES_PER_ELEMENT = sizeof(Data_type) };
  // The size of each STS.
  enum { BYTES_PER_STS = 8 };
  // The size of each row in shared memory.
  enum { BYTES_PER_ROW = Cta_tile::N * Cta_tile::WARPS_K * BYTES_PER_ELEMENT };

  // The size of each LDS.
  enum { BYTES_PER_LDS = 16 };
  enum { THREADS_PER_ROW = 16 };

  // The number of rows.
  enum { ROWS = Cta_tile::M };
  // The number of "rows" to process per loop iteration (in the "epilogue").
  enum { ROWS_PER_LOOP = ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA };
  // The number of outer loops.
  enum { LOOPS = ROWS / ROWS_PER_LOOP };
  // Make sure it matches our expectations.
  static_assert(LOOPS == 1 || LOOPS == (int)Mma_tile::MMAS_M, "");

  // The number of rows loaded per LDS.
  enum { ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
  // Do we have to guard against partial writes/reads.
  enum { HAS_INCOMPLETE_LDS = ROWS_PER_LOOP % ROWS_PER_LDS != 0 };
  // The total number of LDS per loop.
  enum { LDS_PER_LOOP = fmha::Div_up<ROWS_PER_LOOP, ROWS_PER_LDS>::VALUE };

  // The amount of shared memory.
  enum { BYTES_PER_TILE = ROWS_PER_LOOP * BYTES_PER_ROW };

  // The write pointer.
  uint32_t smem_write_, smem_read_;
  // Is the thread active for the last LDS of the series?
  int is_active_for_last_lds_;

  static_assert(BYTES_PER_ROW == 64 * 4 * Cta_tile::WARPS_K);
  static_assert(LOOPS == 1 || LOOPS == (int)Mma_tile::MMAS_M, "");

  // Ctor.
  inline __device__ Smem_tile_o(void* smem, int tidx) {
    // Get a 32-bit value for the shared memory address.
    uint32_t smem_ = __nvvm_get_smem_pointer(smem);

    static_assert(Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 1 &&
                  (Cta_tile::WARPS_K == 4 || Cta_tile::WARPS_K == 8));

    int write_row = (tidx & 0x1c) / 4;
    int write_col = (tidx);

    // Assemble the write pointer.
    smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS;

    // The element read by each thread.
    int read_row = tidx / THREADS_PER_ROW;
    int read_col = tidx % THREADS_PER_ROW;

    // Take the XOR pattern into account for the column.
    read_col ^= 2 * (read_row & 0x7);

    // Assemble the read pointer.
    this->smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;

    // Is that thread active on the last LDS?
    if (HAS_INCOMPLETE_LDS) {
      this->is_active_for_last_lds_ = read_row + (LDS_PER_LOOP - 1) * ROWS_PER_LDS < Cta_tile::M;
    }
  }

  // Load the output fragments.
  inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const {
#pragma unroll
    for (int ii = 0; ii < LDS_PER_LOOP; ++ii) {
      // Load the elements before the reduction (split-K).
      uint4 tmp[Cta_tile::WARPS_K];
#pragma unroll
      for (int jj = 0; jj < Cta_tile::WARPS_K; ++jj) {
        int imm = ii * ROWS_PER_LDS * BYTES_PER_ROW + jj * Cta_tile::N * BYTES_PER_ELEMENT;
        if (!HAS_INCOMPLETE_LDS || (ii < LDS_PER_LOOP - 1 || this->is_active_for_last_lds_)) {
          fmha::lds(tmp[jj], this->smem_read_ + imm);
        }
      }

      // Perform the reduction.
      out[ii] = tmp[0];
#pragma unroll
      for (int jj = 1; jj < Cta_tile::WARPS_K; ++jj) {
        out[ii] = fmha::fadd4(out[ii], tmp[jj]);
      }
    }
  }
  // Store the accumulators.
  template <int M, int N>
  inline __device__ void store(const Accumulator (&acc)[M][N], int mi) {
    enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA };
#pragma unroll
    for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) {
      // The number of MMAs that are stored per loop iteration.
      enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS };

// Store 1st column of the different MMAs.
#pragma unroll
      for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) {
        // Precompute the immediates to jump between rows.
        int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW;
        int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW;
        uint2 tmp0, tmp1;
        tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(0);
        tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(1);

        tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(2);
        tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(3);

        // Store.
        fmha::sts(this->smem_write_ + row_0, tmp0);
        fmha::sts(this->smem_write_ + row_1, tmp1);
      }

      // Swizzle the write pointer using a XOR of 16B.
      this->smem_write_ ^= 32;

// Store 2nd column of the different MMAs.
#pragma unroll
      for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) {
        // Precompute the immediates to jump between rows.
        int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW;
        int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW;

        uint2 tmp0, tmp1;
        tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(4);
        tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(5);

        tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(6);
        tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(7);
        // Store.
        fmha::sts(this->smem_write_ + row_0, tmp0);
        fmha::sts(this->smem_write_ + row_1, tmp1);
      }

      // Cancel the previous XOR of 1 + swizzle the write pointer using a XOR of 32B or 64B.
      this->smem_write_ ^= (ni & 1) ? 7 * 32 : 3 * 32;
    }
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile>
struct Smem_tile_mma {
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;
  using Fragment = fmha::Fragment_a<fmha::Col>;

  enum { COLS = Cta_tile::N };
  enum { BYTES_PER_ELT = 2 };
  enum { BYTES_PER_STS = 4 };
  enum { BYTES_PER_ROW = COLS * BYTES_PER_ELT };  // TODO
  enum { BYTES_PER_TILE = Cta_tile::M * BYTES_PER_ROW };

  enum { WARPS_M = Cta_tile::WARPS_M };
  enum { WARPS_N = Cta_tile::WARPS_N };
  enum { WARPS_K = Cta_tile::WARPS_K };

  static_assert(WARPS_K == 1);
  inline __device__ Smem_tile_mma(char* smem, int tidx) {
    smem_ = __nvvm_get_smem_pointer(smem);

    int write_col, write_row;
    static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8) || (WARPS_M == 4 || WARPS_N == 8) || WARPS_N == 1);
    if (WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8)) {
      write_row = (tidx & 0x1c) / 4;
      write_col = (tidx & 0xe0) / 4 + (tidx & 0x03);
    } else {
      write_row = (tidx & 0xe0) / 2 + (tidx & 0x1c) / 4;
      write_col = (tidx & 0x03);
    }
    write_col ^= (write_row & 0x07) * 4;

    write_offset_ = write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS;
  }

  template <int M, int N>
  inline __device__ void store(const uint4 (&regs)[M][N]) {
    static_assert(COLS == Cta_tile::N);
    for (int mi = 0; mi < M; mi++) {
      for (int ni = 0; ni < N; ni++) {
        size_t offset = write_offset_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * WARPS_N * 16 * BYTES_PER_ELT;
        fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].x);
        fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].z);
        offset ^= 4 * BYTES_PER_STS;
        fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].y);
        fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].w);
      }
    }
  }

  uint32_t smem_;
  uint32_t write_offset_;
  uint32_t warp_m;
  uint32_t warp_n;
  uint32_t lane;
};

template <typename Cta_tile, typename Base = Smem_tile_mma<Cta_tile>>
struct Smem_tile_mma_transposed : public Base {
  enum { BYTES_PER_LDS = 16 };
  enum { BYTES_PER_ROW = Base::BYTES_PER_ROW };
  enum { BYTES_PER_ELT = Base::BYTES_PER_ELT };
  enum { WARPS_M = Base::WARPS_M };
  enum { WARPS_N = Base::WARPS_N };
  static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8));
  using Fragment = typename Base::Fragment;
  inline __device__ Smem_tile_mma_transposed(char* smem, int tidx) : Base(smem, tidx) {
    static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8));
    int read_row, read_col;
    read_row = (tidx & 0x0f);
    read_col = (tidx & 0xe0) / 16 + (tidx & 0x1c) / 16;

    read_col ^= (read_row & 0x07);
    read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
  }

  template <int M, int N>
  inline __device__ void load(Fragment (&frag)[M][N]) {
    static_assert(Base::COLS == Cta_tile::N);
    for (int mi = 0; mi < M; mi++) {
      for (int ni = 0; ni < N; ni++) {
        size_t offset = read_offset_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * WARPS_N * 16 * BYTES_PER_ELT;
        uint4 dst;
        fmha::ldsmt(dst, this->smem_ + offset);
        frag[mi][ni].reg(0) = dst.x;
        frag[mi][ni].reg(1) = dst.z;  // Fragment A regs col major!
        frag[mi][ni].reg(2) = dst.y;
        frag[mi][ni].reg(3) = dst.w;
      }
    }
  }

  uint32_t read_offset_;
};

template <typename Cta_tile, typename Base = Smem_tile_mma<Cta_tile>>
struct Smem_tile_mma_epilogue : public Base {
  enum { BYTES_PER_LDS = 16 };
  enum { BYTES_PER_ROW = Base::BYTES_PER_ROW };
  enum { BYTES_PER_ELT = Base::BYTES_PER_ELT };
  enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDS };
  static_assert(THREADS_PER_ROW * BYTES_PER_LDS == BYTES_PER_ROW);
  enum { ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
  enum { NUM_LDS = Cta_tile::M / ROWS_PER_LDS };
  static_assert(NUM_LDS * ROWS_PER_LDS == Cta_tile::M);
  enum { WARPS_M = Base::WARPS_M };
  enum { WARPS_N = Base::WARPS_N };
  static_assert((WARPS_M == 4 || WARPS_N == 8) || WARPS_N == 1);

  using Acc = fmha::Fragment_accumulator;

  inline __device__ Smem_tile_mma_epilogue(char* smem, int tidx) : Base(smem, tidx) {
    const int read_row = tidx / THREADS_PER_ROW;
    int read_col = tidx % THREADS_PER_ROW;
    read_col ^= (read_row & 0x07);
    read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
  }

  inline __device__ void load(uint4 (&data)[NUM_LDS]) {
    for (int ii = 0; ii < NUM_LDS; ii++) {
      size_t offset = read_offset_ + ii * ROWS_PER_LDS * BYTES_PER_ROW;
      fmha::lds(data[ii], this->smem_ + offset);
    }
  }

  template <int M, int N>
  inline __device__ void store(const Acc (&acc)[M][N]) {
#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ni = 0; ni < N; ni++) {
        // 1st row - 4 elements per row.
        float tmp00 = acc[mi][ni].elt(0);
        float tmp01 = acc[mi][ni].elt(1);
        float tmp02 = acc[mi][ni].elt(4);
        float tmp03 = acc[mi][ni].elt(5);
        // 2nd row - 4 elements per row.
        float tmp10 = acc[mi][ni].elt(2);
        float tmp11 = acc[mi][ni].elt(3);
        float tmp12 = acc[mi][ni].elt(6);
        float tmp13 = acc[mi][ni].elt(7);

        uint32_t x = fmha::float2_to_half2(tmp00, tmp01);
        uint32_t y = fmha::float2_to_half2(tmp02, tmp03);
        uint32_t z = fmha::float2_to_half2(tmp10, tmp11);
        uint32_t w = fmha::float2_to_half2(tmp12, tmp13);

        size_t offset = (this->write_offset_ ^ (ni * 32)) + mi * WARPS_M * 16 * BYTES_PER_ROW;
        fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, x);
        fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, z);
        offset ^= 4 * Base::BYTES_PER_STS;
        fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, y);
        fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, w);
      }
    }
  }

  template <int M, int N>
  inline __device__ void store(const uint4 (&regs)[M][N]) {
    for (int mi = 0; mi < M; mi++) {
      for (int ni = 0; ni < N; ni++) {
        size_t offset = (this->write_offset_ ^ (ni * 32)) + mi * WARPS_M * 16 * BYTES_PER_ROW;
        fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].x);
        fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].z);
        offset ^= 4 * Base::BYTES_PER_STS;
        fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].y);
        fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].w);
      }
    }
  }

  uint32_t read_offset_;
};

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha/softmax.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Sum_ {
  enum { IS_SUM = 1 };
  static inline __device__ float apply(float x, float y) { return x + y; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Max_ {
  enum { IS_SUM = 0 };
  static inline __device__ float apply(float x, float y) { return x > y ? x : y; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ float apply_exp_(float x, float max) { return __expf(x - max); }

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int COLS>
struct ReadType {};
template <>
struct ReadType<4> {
  using T = float;
};
template <>
struct ReadType<8> {
  using T = float2;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile, typename Kernel_traits>
struct Smem_tile_reduce {
  // Helper class to distribute MMA tiles reduced over rows per warp over quads.

  // The Mma tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;

  // The number of MMAs in M/N dimensions.
  enum { MMAS_M = Mma_tile::MMAS_M };
  enum { MMAS_N = Mma_tile::MMAS_N };

  enum { WARPS_M = Cta_tile::WARPS_M };
  enum { WARPS_N = Cta_tile::WARPS_N };

  static constexpr int ROWS = WARPS_M * MMAS_M * 16;
  static constexpr int COLS = WARPS_N;
  static_assert(COLS == 4 || COLS == 8);
  static constexpr int ROWS_PER_XOR_PATTERN = (COLS == 8) ? 4 : 8;
  static constexpr int BYTES_PER_TILE = ROWS * COLS * sizeof(float);
  static constexpr int ELTS_PER_TILE = ROWS * COLS;

  static constexpr int THREADS_PER_GROUP = Kernel_traits::Gmem_tile_o::THREADS_PER_ROW;
  static_assert(THREADS_PER_GROUP == 16);  // DEBUG
  static constexpr int ROWS_PER_WARP = 32 / THREADS_PER_GROUP;
  static constexpr int LOOPS = Kernel_traits::Gmem_tile_o::LOOPS;
  static_assert(LOOPS == 1);

  using read_t = typename ReadType<COLS>::T;

  __device__ inline Smem_tile_reduce(float* smem_, const int tidx) {
    int lane = tidx % 32;
    int warp = tidx / 32;

    int warp_m = warp % WARPS_M;
    int warp_n = warp / WARPS_M;

    qid_ = lane % 4;
    int qp = lane / 4;

    // Swizzle the column to avoid 2-fold bank conflicts when we have 8 warps.
    // This won't affect reading as we assume commutative reduction ops.
    const int col = warp_n ^ (qp / ROWS_PER_XOR_PATTERN);
    smem_write_ = &smem_[warp_m * 16 * MMAS_M * WARPS_N + qp * WARPS_N + col];
    smem_read_ = &reinterpret_cast<read_t*>(smem_)[warp_m * 16 * MMAS_M * 4 + qp * 4 + qid_];
  }

  __device__ inline void store(float (&frag)[2 * MMAS_M]) {
    if (qid_ == 0) {
#pragma unroll
      for (int mi = 0; mi < MMAS_M; mi++) {
        int offset = mi * 16 * WARPS_N;
        smem_write_[offset + 0 * 8 * WARPS_N] = frag[mi * 2 + 0];
        smem_write_[offset + 1 * 8 * WARPS_N] = frag[mi * 2 + 1];
      }
    }
  }

  __device__ inline void load(read_t (&frag)[2 * MMAS_M]) {
#pragma unroll
    for (int mi = 0; mi < MMAS_M; mi++) {
      int offset = mi * 16 * 4;
      frag[mi * 2 + 0] = smem_read_[offset + 0 * 8 * 4];
      frag[mi * 2 + 1] = smem_read_[offset + 1 * 8 * 4];
    }
  }

  int qid_;
  float* smem_write_;
  read_t* smem_read_;
};

template <typename Cta_tile, typename Kernel_traits>
struct Softmax_base {
  // The Mma tile.
  using Mma_tile = fmha::Hmma_tile<Cta_tile>;

  // The number of MMAs in M/N dimensions.
  enum { MMAS_M = Mma_tile::MMAS_M };
  enum { MMAS_N = Mma_tile::MMAS_N };

  // The number of groups of warp such that we have at most 4 warps writing consecutive elements.
  enum { GROUPS = fmha::Div_up<Cta_tile::WARPS_N, 4>::VALUE };
  // The number of elements that we are going to store per row.
  enum { ELEMENTS_PER_ROW = Cta_tile::WARPS_N / GROUPS };
  // The number of rows.
  enum { ROWS = Cta_tile::M * GROUPS };
  // The total number of elements.
  enum { ELEMENTS = ROWS * ELEMENTS_PER_ROW };

  // Ctor.
  template <typename Params>
  inline __device__ Softmax_base(const Params& params, void* smem, int bidb, int tidx)
      :  // packed_mask_ptr_(reinterpret_cast<const char*>(params.packed_mask_ptr)),
        smem_(reinterpret_cast<float*>(smem)),
        tidx_(tidx) {
    // Move to the 1st mask loaded by the thread+ tidx;
    // packed_mask_ptr_ += bidb * params.packed_mask_stride_in_bytes + tidx * sizeof(uint32_t);

    // Extract the position in the warp.
    int warp = tidx / Cta_tile::THREADS_PER_WARP;
    int lane = tidx % Cta_tile::THREADS_PER_WARP;

    // Decompose the warp index into M and N.
    int warp_m = warp % Cta_tile::WARPS_M;
    int warp_n = warp / Cta_tile::WARPS_M;

    // Decompose the warp-n index into group/position-inside-the-group.
    int warp_g = warp_n / ELEMENTS_PER_ROW;
    int warp_i = warp_n % ELEMENTS_PER_ROW;

    // The location written by the threads.
    int write_row = warp_g * (ROWS / GROUPS) + warp_m * Mma_tile::M_PER_MMA + lane / 4;
    int write_col = warp_i;

    // Assemble the write pointer.
    smem_write_ = &smem_[write_row * ELEMENTS_PER_ROW + write_col];

    // Assemble the read pointer.
    smem_read_ = &smem_[warp_m * Mma_tile::M_PER_MMA + lane / 4];
  }

  template <typename Mask>
  inline __device__ void apply_mask(const Mask& mask) {
#pragma unroll
    for (int mi = 0; mi < MMAS_M; ++mi) {
#pragma unroll
      for (int ii = 0; ii < 2; ++ii) {
#pragma unroll
        for (int ni = 0; ni < MMAS_N; ++ni) {
#pragma unroll
          for (int jj = 0; jj < 4; ++jj) {
            if (!mask.is_valid(mi, ni, ii, jj)) {
              elt_[2 * mi + ii][4 * ni + jj] = -INFINITY;
            }
          }
        }
      }
    }
  }

  // Apply the exp to all the elements.
  inline __device__ void apply_exp(const float (&max)[MMAS_M * 2]) {
#pragma unroll
    for (int mi = 0; mi < MMAS_M * 2; ++mi) {
#pragma unroll
      for (int ni = 0; ni < MMAS_N * 4; ++ni) {
        elt_[mi][ni] = apply_exp_(elt_[mi][ni], max[mi]);
      }
    }
  }

  // Scale all the elements.
  inline __device__ void scale(const float (&sum)[MMAS_M * 2]) {
    // Precompute the inverse sum to normalize. Without -use_fast_math, it makes a huge deal.
    float inv_sum[MMAS_M * 2];
#pragma unroll
    for (int mi = 0; mi < MMAS_M * 2; ++mi) {
      inv_sum[mi] = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi];
    }

// Update the values.
#pragma unroll
    for (int mi = 0; mi < MMAS_M * 2; ++mi) {
#pragma unroll
      for (int ni = 0; ni < MMAS_N * 4; ++ni) {
        elt_[mi][ni] *= inv_sum[mi];
      }
    }
  }

  // The pointer to the mask.
  const char* packed_mask_ptr_;
  // Shared memory for the CTA-wide reduction.
  float *smem_, *smem_write_, *smem_read_;
  // The current thread index.
  int tidx_;
  // The elements.
  float elt_[MMAS_M * 2][MMAS_N * 4];
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Cta_tile, typename Kernel_traits>
struct Softmax : public Softmax_base<Cta_tile, Kernel_traits> {
  // The base class.
  using Base = Softmax_base<Cta_tile, Kernel_traits>;
  // The fragment.
  using Fragment_a = fmha::Fragment_a<fmha::Row>;

  static_assert(Fragment_a::NUM_REGS == 4);

  enum { WARPS_M = Cta_tile::WARPS_M };
  enum { WARPS_N = Cta_tile::WARPS_N };
  // The MMAs.
  enum { MMAS_M = Base::MMAS_M };
  enum { MMAS_N = Base::MMAS_N };

  // The accumulators.
  using Accumulator = fmha::Fragment_accumulator;
  using Accumulator_out = Fragment<uint16_t, 8>;
  static_assert(Accumulator_out::NUM_REGS == 4);

  static_assert(std::is_same<Accumulator::Data_type, float>::value);

  using Smem_tile_red = Smem_tile_reduce<Cta_tile, Kernel_traits>;
  static_assert(Smem_tile_red::ELTS_PER_TILE == Cta_tile::M * WARPS_N);
  // Ctor.
  template <typename Params>
  inline __device__ Softmax(const Params& params, void* smem, int bidb, int tidx)
      : Base(params, smem, bidb, tidx),
        params_scale_bmm1_(params.scale_bmm1),
        smem_sum_(static_cast<float*>(smem), tidx),
        smem_max_(static_cast<float*>(smem) + Smem_tile_red::ELTS_PER_TILE, tidx) {}

  // Pack the data to a fragment for the next GEMM.
  template <int K, int M>
  inline __device__ void pack(Fragment_a (&dst)[K][M]) const {
#pragma unroll
    for (int mi = 0; mi < M; ++mi) {
#pragma unroll
      for (int ki = 0; ki < K; ++ki) {
        // 1st row - 4 elements per row.
        float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0];
        float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1];
        float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2];
        float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3];

        // 2nd row - 4 elements per row.
        float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0];
        float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1];
        float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2];
        float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3];

        // Pack to 4 registers.
        dst[ki][mi].reg(0) = fmha::float2_to_half2(tmp_00, tmp_01);
        dst[ki][mi].reg(1) = fmha::float2_to_half2(tmp_10, tmp_11);
        dst[ki][mi].reg(2) = fmha::float2_to_half2(tmp_02, tmp_03);
        dst[ki][mi].reg(3) = fmha::float2_to_half2(tmp_12, tmp_13);
      }
    }
  }

  // Scale FP32 fragments
  inline __device__ void unpack(const Accumulator (&acc)[MMAS_M][MMAS_N]) {
    const float scalef = reinterpret_cast<const float&>(this->params_scale_bmm1_);

#pragma unroll
    for (int mi = 0; mi < MMAS_M; ++mi) {
#pragma unroll
      for (int ni = 0; ni < MMAS_N; ++ni) {
        // 1st row - 4 elements per row.
        this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scalef;
        this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scalef;
        this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scalef;
        this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scalef;
        // 2nd row - 4 elements per row.
        this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scalef;
        this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scalef;
        this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scalef;
        this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scalef;
      }
    }
  }
  // Scale FP32 fragments
  inline __device__ void unpack_noscale(const Accumulator (&acc)[MMAS_M][MMAS_N]) {
#pragma unroll
    for (int mi = 0; mi < MMAS_M; ++mi) {
#pragma unroll
      for (int ni = 0; ni < MMAS_N; ++ni) {
        // 1st row - 4 elements per row.
        this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0);
        this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1);
        this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4);
        this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5);
        // 2nd row - 4 elements per row.
        this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2);
        this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3);
        this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6);
        this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7);
      }
    }
  }

  template <typename Operator>
  __device__ inline void reduce_(float (&frag)[2 * MMAS_M], Operator& op, Smem_tile_red& smem_red) {
    for (int mi = 0; mi < 2 * MMAS_M; mi++) {
      frag[mi] = this->elt_[mi][0];
      for (int ni = 1; ni < 4 * MMAS_N; ni++) {
        frag[mi] = op(frag[mi], this->elt_[mi][ni]);
      }
    }
    quad_reduce(frag, frag, op);

    smem_red.store(frag);
    __syncthreads();
    typename Smem_tile_red::read_t tmp[2 * MMAS_M];
    smem_red.load(tmp);

    quad_allreduce(frag, tmp, op);
  }

  __device__ inline void reduce_max(float (&frag)[2 * MMAS_M]) {
    MaxOp<float> max;
    reduce_(frag, max, smem_max_);
  }

  __device__ inline void reduce_sum(float (&frag)[2 * MMAS_M]) {
    SumOp<float> sum;
    reduce_(frag, sum, smem_sum_);
  }

  const uint32_t params_scale_bmm1_;
  Smem_tile_red smem_max_;
  Smem_tile_red smem_sum_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha/utils.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void* ptr);

////////////////////////////////////////////////////////////////////////////////////////////////////

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Row {};
struct Col {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int M, bool = (M & (M - 1)) == 0>
struct Next_power_of_two {};

template <int M>
struct Next_power_of_two<M, true> {
  enum { VALUE = M };
};
template <>
struct Next_power_of_two<3, false> {
  enum { VALUE = 4 };
};
template <>
struct Next_power_of_two<5, false> {
  enum { VALUE = 8 };
};
template <>
struct Next_power_of_two<6, false> {
  enum { VALUE = 8 };
};
template <>
struct Next_power_of_two<7, false> {
  enum { VALUE = 8 };
};
template <>
struct Next_power_of_two<9, false> {
  enum { VALUE = 16 };
};
template <>
struct Next_power_of_two<10, false> {
  enum { VALUE = 16 };
};
template <>
struct Next_power_of_two<11, false> {
  enum { VALUE = 16 };
};
template <>
struct Next_power_of_two<12, false> {
  enum { VALUE = 16 };
};
template <>
struct Next_power_of_two<13, false> {
  enum { VALUE = 16 };
};
template <>
struct Next_power_of_two<14, false> {
  enum { VALUE = 16 };
};
template <>
struct Next_power_of_two<15, false> {
  enum { VALUE = 16 };
};
template <>
struct Next_power_of_two<24, false> {
  enum { VALUE = 32 };
};
template <>
struct Next_power_of_two<48, false> {
  enum { VALUE = 64 };
};
template <>
struct Next_power_of_two<80, false> {
  enum { VALUE = 128 };
};
template <>
struct Next_power_of_two<96, false> {
  enum { VALUE = 128 };
};
template <>
struct Next_power_of_two<112, false> {
  enum { VALUE = 128 };
};
template <>
struct Next_power_of_two<144, false> {
  enum { VALUE = 256 };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N, bool = (N & (N - 1)) == 0>
struct Prev_power_of_two {};

template <int N>
struct Prev_power_of_two<N, true> {
  enum { VALUE = N };
};
template <>
struct Prev_power_of_two<3, false> {
  enum { VALUE = 2 };
};
template <>
struct Prev_power_of_two<5, false> {
  enum { VALUE = 4 };
};
template <>
struct Prev_power_of_two<6, false> {
  enum { VALUE = 4 };
};
template <>
struct Prev_power_of_two<7, false> {
  enum { VALUE = 4 };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int M, int N>
struct Div_up {
  enum { VALUE = (M + N - 1) / N };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int A, int B>
struct Max {
  enum { VALUE = A >= B ? A : B };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int A, int B, int C>
struct Max_3 {
  enum { VALUE = Max<Max<A, B>::VALUE, C>::VALUE };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int A, int B>
struct Min {
  enum { VALUE = A <= B ? A : B };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int SIZE_IN_BYTES>
struct Uint_from_size_in_bytes {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <>
struct Uint_from_size_in_bytes<1> {
  using Type = uint8_t;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <>
struct Uint_from_size_in_bytes<2> {
  using Type = uint16_t;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <>
struct Uint_from_size_in_bytes<4> {
  using Type = uint32_t;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <>
struct Uint_from_size_in_bytes<8> {
  using Type = uint2;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <>
struct Uint_from_size_in_bytes<16> {
  using Type = uint4;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int WARPS_M, int WARPS_N, int WARPS_K>
struct Warp_masks {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <>
struct Warp_masks<8, 1, 1> {
  enum { M = 0xe0, N = 0x00, K = 0x00 };
};
template <>
struct Warp_masks<4, 2, 1> {
  enum { M = 0x60, N = 0x80, K = 0x00 };
};
template <>
struct Warp_masks<4, 1, 2> {
  enum { M = 0x60, N = 0x00, K = 0x80 };
};
template <>
struct Warp_masks<4, 1, 1> {
  enum { M = 0x60, N = 0x00, K = 0x00 };
};
template <>
struct Warp_masks<2, 4, 1> {
  enum { M = 0x20, N = 0xc0, K = 0x00 };
};
template <>
struct Warp_masks<2, 2, 2> {
  enum { M = 0x20, N = 0x40, K = 0x80 };
};
template <>
struct Warp_masks<2, 2, 1> {
  enum { M = 0x20, N = 0x40, K = 0x00 };
};
template <>
struct Warp_masks<2, 1, 2> {
  enum { M = 0x20, N = 0x00, K = 0x40 };
};
template <>
struct Warp_masks<2, 1, 1> {
  enum { M = 0x20, N = 0x00, K = 0x00 };
};
template <>
struct Warp_masks<1, 8, 1> {
  enum { M = 0x00, N = 0xe0, K = 0x00 };
};
template <>
struct Warp_masks<1, 4, 2> {
  enum { M = 0x00, N = 0x60, K = 0x80 };
};
template <>
struct Warp_masks<1, 4, 1> {
  enum { M = 0x00, N = 0x60, K = 0x00 };
};
template <>
struct Warp_masks<1, 2, 2> {
  enum { M = 0x00, N = 0x20, K = 0x40 };
};
template <>
struct Warp_masks<1, 2, 1> {
  enum { M = 0x00, N = 0x20, K = 0x00 };
};
template <>
struct Warp_masks<1, 1, 4> {
  enum { M = 0x00, N = 0x00, K = 0x60 };
};
template <>
struct Warp_masks<1, 1, 2> {
  enum { M = 0x00, N = 0x00, K = 0x20 };
};
template <>
struct Warp_masks<1, 1, 1> {
  enum { M = 0x00, N = 0x00, K = 0x00 };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
inline __device__ __host__ T div_up(T m, T n) {
  return (m + n - 1) / n;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int clz(int x) {
  for (int i = 31; i >= 0; --i) {
    if ((1 << i) & x) {
      return 31 - i;
    }
  }
  return 32;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int find_log_2(int x, bool round_up = false) {
  int a = 31 - clz(x);
  if (round_up) {
    a += (x & (x - 1)) ? 1 : 0;
  }
  return a;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t hadd2(uint32_t a, uint32_t b) {
  uint32_t c;
  asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t hmin2(uint32_t a, uint32_t b) {
  uint32_t c;
  asm volatile("min.f16x2 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b));
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t hmul2(uint32_t a, uint32_t b) {
  uint32_t c;
  asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint2 hmul4(uint2 a, uint2 b) {
  uint2 c;
  c.x = hmul2(a.x, b.x);
  c.y = hmul2(a.y, b.y);
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint4 hmul8(uint4 a, uint4 b) {
  uint4 c;
  c.x = hmul2(a.x, b.x);
  c.y = hmul2(a.y, b.y);
  c.z = hmul2(a.z, b.z);
  c.w = hmul2(a.w, b.w);
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint4 hmul8(uint32_t a, uint4 b) {
  uint4 c;
  c.x = hmul2(a, b.x);
  c.y = hmul2(a, b.y);
  c.z = hmul2(a, b.z);
  c.w = hmul2(a, b.w);
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t hrelu2(uint32_t x, uint32_t lb = 0) {
  uint32_t res;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  asm volatile("max.f16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(lb));
#else
  const uint32_t zero = 0u;
  asm volatile(
      "{\n"
      "\t .reg .f16x2 sela;\n"
      "\t set.gtu.u32.f16x2 sela, %1, %2;\n"
      "\t and.b32 %0, sela, %1;\n"
      "}\n"
      : "=r"(res)
      : "r"(x), "r"(zero));
#endif
  return res;
}
static inline __device__ uint32_t habs2(uint32_t x) {
  uint32_t res;
  asm volatile("abs.f16x2 %0, %1;\n" : "=r"(res) : "r"(x));
  return res;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//
template <typename T>
static inline __device__ T clamp(T x, T lb, T ub) {
  return x < lb ? lb : (x > ub ? ub : x);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint16_t clamp_to_zero(uint16_t x) {
  uint16_t mask;
  asm volatile("set.gtu %0, %1, 0;" : "=h"(mask) : "h"(x));
  return mask & x;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint16_t float_to_half(float f) {
  uint16_t h;
  asm volatile("cvt.rn.f16.f32 %0, %1;" : "=h"(h) : "f"(f));
  return h;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t float2_to_half2(float a, float b) {
  uint32_t c;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(c) : "f"(b), "f"(a));
#else
  uint16_t lo = float_to_half(a);
  uint16_t hi = float_to_half(b);
  asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(c) : "h"(lo), "h"(hi));
#endif
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t float_to_half2(float a) { return float2_to_half2(a, a); }

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t float2_to_half2(const float2& f) { return float2_to_half2(f.x, f.y); }

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint2 float4_to_half4(float x, float y, float z, float w) {
  uint2 d;
  d.x = float2_to_half2(x, y);
  d.y = float2_to_half2(z, w);
  return d;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t hfma2(uint32_t a, uint32_t b, uint32_t c) {
  uint32_t d;
  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
  return d;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t hfma2_relu(uint32_t a, uint32_t b, uint32_t c) {
  uint32_t d;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  asm volatile("fma.rn.f16x2.relu %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
#else
  d = hrelu2(hfma2(a, b, c));
#endif
  return d;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t h0_h0(uint32_t x) {
  uint32_t y;
  asm volatile("{.reg .f16 lo, hi; mov.b32 {lo, hi}, %1; mov.b32 %0, {lo, lo};}\n" : "=r"(y) : "r"(x));
  return y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ float h0_to_float(uint32_t h2) {
  float f;
  asm volatile(
      "{\n"
      ".reg .f16 lo, hi;\n"
      "mov.b32 {lo, hi}, %1;\n"
      "cvt.f32.f16 %0, lo;\n"
      "}\n"
      : "=f"(f)
      : "r"(h2));
  return f;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t h1_h1(uint32_t x) {
  uint32_t y;
  asm volatile("{.reg .f16 lo, hi; mov.b32 {lo, hi}, %1; mov.b32 %0, {hi, hi};}\n" : "=r"(y) : "r"(x));
  return y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint16_t hadd(uint16_t a, uint16_t b) {
  uint16_t d;
  asm volatile("add.f16 %0, %1, %2;" : "=h"(d) : "h"(a), "h"(b));
  return d;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint32_t hadd(uint32_t a, uint32_t b) { return hadd2(a, b); }

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint2 hadd4(uint2 a, uint2 b) {
  uint2 c;
  c.x = hadd2(a.x, b.x);
  c.y = hadd2(a.y, b.y);
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint2 hadd(uint2 a, uint2 b) { return hadd4(a, b); }

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint4 hadd8(uint4 a, uint4 b) {
  uint4 c;
  c.x = hadd2(a.x, b.x);
  c.y = hadd2(a.y, b.y);
  c.z = hadd2(a.z, b.z);
  c.w = hadd2(a.w, b.w);
  return c;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint4 fadd4(uint4 a, uint4 b) {
  float4 c;
  c.x = reinterpret_cast<const float&>(a.x) + reinterpret_cast<const float&>(b.x);
  c.y = reinterpret_cast<const float&>(a.y) + reinterpret_cast<const float&>(b.y);
  c.z = reinterpret_cast<const float&>(a.z) + reinterpret_cast<const float&>(b.z);
  c.w = reinterpret_cast<const float&>(a.w) + reinterpret_cast<const float&>(b.w);
  return reinterpret_cast<const uint4&>(c);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint4 hadd(uint4 a, uint4 b) { return hadd8(a, b); }

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ float half_to_float(uint16_t h) {
  float f;
  asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
  return f;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ float2 half2_to_float2(uint32_t x) {
  uint16_t lo, hi;
  asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(x));
  return make_float2(half_to_float(lo), half_to_float(hi));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ void half2_to_float2(float& x, float& y, uint32_t h) {
  float2 tmp = half2_to_float2(h);
  x = tmp.x;
  y = tmp.y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint16_t hfma(uint16_t a, uint16_t b, uint16_t c) {
  uint16_t d;
  asm volatile("fma.rn.f16 %0, %1, %2, %3;" : "=h"(d) : "h"(a), "h"(b), "h"(c));
  return d;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ uint16_t hmul(uint16_t a, uint16_t b) {
  uint16_t d;
  asm volatile("mul.f16 %0, %1, %2;" : "=h"(d) : "h"(a), "h"(b));
  return d;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ float sigmoid(float x) { return 1.f / (1.f + expf(-x)); }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void clear(uint16_t& dst) { dst = uint16_t(0); }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void clear(uint32_t& dst) { dst = 0u; }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void clear(uint2& dst) { dst = make_uint2(0u, 0u); }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void clear(uint4& dst) { dst = make_uint4(0u, 0u, 0u, 0u); }

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// P R E D I C A T E   P A C K I N G
//
////////////////////////////////////////////////////////////////////////////////////////////////////
enum { BYTES_PER_REG = 4, PREDS_PER_BYTE = 4, PREDS_PER_REG = BYTES_PER_REG * PREDS_PER_BYTE };

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// G E N E R I C   P R E D I C A T E D   L D G S T S
//
////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N, int M, typename Functor>
inline __device__ void load_(Functor& fct, const uint32_t (&preds)[M]) {
  // The number of complete bytes (where we use all the predicates in a byte).
  enum { COMPLETE = N / PREDS_PER_BYTE };
  // Make sure we did allocate enough predicates.
  static_assert(Div_up<COMPLETE, BYTES_PER_REG>::VALUE <= M, "");
  // The remainder.
  enum { REMAINDER = N - COMPLETE * PREDS_PER_BYTE };
  // Make sure we got the math right and the remainder is between 0 and 3.
  static_assert(REMAINDER >= 0 && REMAINDER <= 3, "");
  // The mask to extract the predicates.
  enum { COMPLETE_MASK = (1 << PREDS_PER_BYTE) - 1 };

// Clear the fetch registers.
#pragma unroll
  for (int ii = 0; ii < N; ++ii) {
    fct.clear(ii);
  }

  // Run complete steps.
  bool p[PREDS_PER_BYTE];
#pragma unroll
  for (int ii = 0; ii < COMPLETE; ++ii) {
    // The predicate.
    uint32_t reg = preds[ii / BYTES_PER_REG];

// Extract the predicates.
#pragma unroll
    for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) {
      uint32_t mask = 1u << (ii % BYTES_PER_REG * 8 + jj);
      p[jj] = (reg & mask) != 0u;
    }

// Issue the loads.
#pragma unroll
    for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) {
      fct.load(ii * PREDS_PER_BYTE + jj, p[jj]);
    }
  }

  // Skip the rest of the code if we do not have a remainder.
  if (REMAINDER > 0) {
    // The mask to extract the predicates.
    enum { REMAINDER_MASK = (1 << REMAINDER) - 1 };

    // The predicate register.
    uint32_t reg = preds[COMPLETE / BYTES_PER_REG];

// Extract the predicates.
#pragma unroll
    for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) {
      uint32_t mask = 1u << (COMPLETE % BYTES_PER_REG * 8 + jj);
      p[jj] = (reg & mask) != 0u;
    }

// Issue the loads.
#pragma unroll
    for (int ii = 0; ii < REMAINDER; ++ii) {
      fct.load(COMPLETE * PREDS_PER_BYTE + ii, p[ii]);
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int M, typename Functor>
inline __device__ void load_(Functor& fct, uint32_t preds) {
  uint32_t tmp[1] = {preds};
  load_<M>(fct, tmp);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// L D G
//
////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldg(uint8_t& dst, const void* ptr) { dst = *reinterpret_cast<const uint8_t*>(ptr); }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldg(uint16_t& dst, const void* ptr) { dst = *reinterpret_cast<const uint16_t*>(ptr); }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldg(uint32_t& dst, const void* ptr) { dst = *reinterpret_cast<const uint32_t*>(ptr); }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldg(uint2& dst, const void* ptr) { dst = *reinterpret_cast<const uint2*>(ptr); }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldg(uint4& dst, const void* ptr) { dst = *reinterpret_cast<const uint4*>(ptr); }

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Data_type, int N>
struct Ldg_functor {
  // Ctor.
  inline __device__ Ldg_functor(Data_type (&fetch)[N], const void* (&ptrs)[N]) : fetch_(fetch), ptrs_(ptrs) {}

  // Clear the element.
  inline __device__ void clear(int ii) { fmha::clear(fetch_[ii]); }

  // Trigger the loads.
  inline __device__ void load(int ii, bool p) {
    if (p) {
      ldg(fetch_[ii], ptrs_[ii]);
    }
  }

  // The fetch registers.
  Data_type (&fetch_)[N];
  // The pointers.
  const void* (&ptrs_)[N];
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Data_type, int N, int M>
inline __device__ void ldg_(Data_type (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
  Ldg_functor<Data_type, N> fct(fetch, ptrs);
  load_<N>(fct, preds);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N, int M>
inline __device__ void ldg(uint8_t (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
  ldg_<uint8_t, N>(fetch, ptrs, preds);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N, int M>
inline __device__ void ldg(uint16_t (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
  ldg_<uint16_t, N>(fetch, ptrs, preds);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N, int M>
inline __device__ void ldg(uint32_t (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
  ldg_<uint32_t, N>(fetch, ptrs, preds);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N, int M>
inline __device__ void ldg(uint2 (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
  ldg_<uint2, N>(fetch, ptrs, preds);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N, int M>
inline __device__ void ldg(uint4 (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
  ldg_<uint4, N>(fetch, ptrs, preds);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// L D S
//
////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void lds(uint16_t& dst, uint32_t ptr) {
  asm volatile("ld.shared.b16 %0, [%1];\n" : "=h"(dst) : "r"(ptr));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void lds(uint32_t& dst, uint32_t ptr) {
  asm volatile("ld.shared.b32 %0, [%1];\n" : "=r"(dst) : "r"(ptr));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void lds(uint2& dst, uint32_t ptr) {
  asm volatile("ld.shared.v2.b32 {%0, %1}, [%2];\n" : "=r"(dst.x), "=r"(dst.y) : "r"(ptr));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void lds(uint4& dst, uint32_t ptr) {
  asm volatile("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];\n"
               : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w)
               : "r"(ptr));
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// L D S M
//
////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldsm(uint32_t& dst, uint32_t ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
  asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" : "=r"(dst) : "r"(ptr));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldsmt(uint32_t& dst, uint32_t ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
  asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 {%0}, [%1];\n" : "=r"(dst) : "r"(ptr));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldsm(uint2& dst, uint32_t ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
  asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0, %1}, [%2];\n" : "=r"(dst.x), "=r"(dst.y) : "r"(ptr));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldsmt(uint2& dst, uint32_t ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
  asm volatile("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0, %1}, [%2];\n"
               : "=r"(dst.x), "=r"(dst.y)
               : "r"(ptr));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldsm(uint4& dst, uint32_t ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n"
               : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w)
               : "r"(ptr));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void ldsmt(uint4& dst, uint32_t ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
  asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0, %1, %2, %3}, [%4];\n"
               : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w)
               : "r"(ptr));
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// S T G
//
////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void stg(void* ptr, uint8_t val) { *reinterpret_cast<uint8_t*>(ptr) = val; }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void stg(void* ptr, uint16_t val) { *reinterpret_cast<uint16_t*>(ptr) = val; }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void stg(void* ptr, uint32_t val) { *reinterpret_cast<uint32_t*>(ptr) = val; }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void stg(void* ptr, uint2 val) { *reinterpret_cast<uint2*>(ptr) = val; }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void stg(void* ptr, uint4 val) { *reinterpret_cast<uint4*>(ptr) = val; }

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// S T S
//
////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void sts(uint32_t ptr, uint16_t val) {
  asm volatile("st.shared.b16 [%0], %1;\n" : : "r"(ptr), "h"(val));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void sts(uint32_t ptr, uint32_t val) {
  asm volatile("st.shared.b32 [%0], %1;\n" : : "r"(ptr), "r"(val));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void sts(uint32_t ptr, uint2 val) {
  asm volatile("st.shared.v2.b32 [%0], {%1, %2};\n" : : "r"(ptr), "r"(val.x), "r"(val.y));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void sts(uint32_t ptr, uint4 val) {
  asm volatile("st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n"
               :
               : "r"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Data_type, int N>
inline __device__ void sts_(uint32_t (&ptrs)[N], const Data_type (&data)[N]) {
#pragma unroll
  for (int ii = 0; ii < N; ++ii) {
    sts(ptrs[ii], data[ii]);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
inline __device__ void sts(uint32_t (&ptrs)[N], const uint16_t (&data)[N]) {
  sts_<uint16_t, N>(ptrs, data);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
inline __device__ void sts(uint32_t (&ptrs)[N], const uint32_t (&data)[N]) {
  sts_<uint32_t, N>(ptrs, data);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
inline __device__ void sts(uint32_t (&ptrs)[N], const uint2 (&data)[N]) {
  sts_<uint2, N>(ptrs, data);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
inline __device__ void sts(uint32_t (&ptrs)[N], const uint4 (&data)[N]) {
  sts_<uint4, N>(ptrs, data);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct MaxOp {
  __device__ inline T operator()(T const& x, T const& y) { return x > y ? x : y; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct SumOp {
  __device__ inline T operator()(T const& x, T const& y) { return x + y; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int THREADS>
struct Allreduce {
  static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
  template <typename T, typename Operator>
  static __device__ inline T run(T x, Operator& op) {
    constexpr int OFFSET = THREADS / 2;
    x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
    return Allreduce<OFFSET>::run(x, op);
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <>
struct Allreduce<2> {
  template <typename T, typename Operator>
  static __device__ inline T run(T x, Operator& op) {
    x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
    return x;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Operator, int M>
__device__ inline void quad_reduce(float (&dst)[M], float (&src)[M], Operator& op) {
#pragma unroll
  for (int mi = 0; mi < M; mi++) {
    dst[mi] = src[mi];
    dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 2));
    dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 1));
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Operator, int M>
__device__ inline void quad_reduce(float (&dst)[M], float2 (&src)[M], Operator& op) {
  float tmp[M];
#pragma unroll
  for (int mi = 0; mi < M; mi++) {
    tmp[mi] = op(src[mi].x, src[mi].y);
  }
  quad_reduce(dst, tmp, op);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Operator, int M>
__device__ inline void quad_allreduce(float (&dst)[M], float (&src)[M], Operator& op) {
#pragma unroll
  for (int mi = 0; mi < M; mi++) {
    dst[mi] = src[mi];
    dst[mi] = Allreduce<4>::run(dst[mi], op);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Operator, int M>
__device__ inline void quad_allreduce(float (&dst)[M], float2 (&src)[M], Operator& op) {
  float tmp[M];
#pragma unroll
  for (int mi = 0; mi < M; mi++) {
    tmp[mi] = op(src[mi].x, src[mi].y);
  }
  quad_allreduce(dst, tmp, op);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cuda.h>

#include <vector>

#ifdef OLD_GENERATOR_PATH
#include <ATen/CUDAGeneratorImpl.h>
#else
#include <ATen/cuda/CUDAGeneratorImpl.h>
#endif

#include <fmha_utils.h>

#include <ATen/cuda/CUDAGraphsUtils.cuh>

constexpr int TOTAL_DIM = 0;
constexpr int THREE_DIM = 1;
constexpr int H_DIM = 2;
constexpr int D_DIM = 3;

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Qkv_params {
  // The QKV matrices.
  void* __restrict__ qkv_ptr;

  // The stride between rows of the Q, K and V matrices.
  size_t qkv_stride_in_bytes;

  // The number of heads.
  int h;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Fused_multihead_attention_fprop_params : public Qkv_params {
  // The dQKV matrices.
  void* __restrict__ dqkv_ptr;

  // Temporary for dKV.
  void* __restrict__ dkv_ptr;

  // The O matrix (output).
  void* __restrict__ o_ptr;

  // The stride between rows of O.
  int64_t o_stride_in_bytes;

  // The pointer to the S matrix, overwritten by the dP matrix (bwd).
  void* __restrict__ s_ptr;
  // The stride between rows of the S matrix.
  int64_t s_stride_in_bytes;

  // The dimensions.
  int b, s, d;

  // The scaling factors for the kernel.
  uint32_t scale_bmm1, scale_softmax, scale_bmm2;

  // array of length b+1 holding starting offset of each sequence.
  int* __restrict__ cu_seqlens;

  // The dropout probability (probability of keeping an activation).
  float p_dropout;

  // Scale factor of 1 / (1 - p_dropout).
  float rp_dropout;

  // Scale factor of 1 / (1 - p_dropout), in half2.
  uint32_t scale_dropout;

  // Random state.
  at::PhiloxCudaState philox_args;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Kernel_params>
struct Launch_params {
  Launch_params(cudaDeviceProp* props_, cudaStream_t stream_, bool is_training_, bool is_nl_)
      : elts_per_thread(0), props(props_), stream(stream_), is_training(is_training_), is_nl(is_nl_) {}

  size_t elts_per_thread;

  cudaDeviceProp* props;

  cudaStream_t stream;

  bool is_training;

  Kernel_params params;
  int num_full_heads;
  int num_main_groups;
  int heads_last_wave;
  int main_steps;
  int rest_steps;
  bool is_nl;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

void run_fmha_fp16_128_64_sm80(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                               const bool configure);
void run_fmha_fp16_256_64_sm80(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                               const bool configure);
void run_fmha_fp16_384_64_sm80(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                               const bool configure);
void run_fmha_fp16_512_64_sm80(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                               const bool configure);

void run_fmha_dgrad_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params& params, cudaStream_t stream);
void run_fmha_dgrad_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params& params, cudaStream_t stream);
void run_fmha_dgrad_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params& params, cudaStream_t stream);
void run_fmha_dgrad_fp16_512_64_sm80(const Fused_multihead_attention_fprop_params& params, cudaStream_t stream);

void run_fmha_fp16_512_64_sm80_nl(const Fused_multihead_attention_fprop_params& params, const bool is_training,
                                  const int num_chunks, cudaStream_t stream);

void run_fmha_dgrad_fp16_512_64_sm80_nl(const Fused_multihead_attention_fprop_params& params, const int num_chunks,
                                        cudaStream_t stream);

void fmha_run_noloop_reduce(void* out, const void* in, const int* cu_seqlens, const int hidden_size,
                            const int batch_size, const int total, const int num_chunks, cudaStream_t stream);


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"
#include "fmha_dgrad_kernel_1xN_reload.h"

using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u>;

extern "C" __global__ void fmha_dgrad_fp16_128_64_sm80_kernel(Fused_multihead_attention_fprop_params params) {
  fmha::compute_dv_1xN<Kernel_traits>(params);
  fmha::compute_dq_dk_1xN<Kernel_traits>(params);
}

void run_fmha_dgrad_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params& params, cudaStream_t stream) {
  constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
  constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
  constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
  constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;

  using Smem_tile_s = fmha::Smem_tile_mma_transposed<Kernel_traits::Cta_tile_p>;
  constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
  static_assert(smem_size_s == 16 * 128 * 2);
  static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N);

  constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax;
  constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v;
  constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk);

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(fmha_dgrad_fp16_128_64_sm80_kernel,
                                         cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }
  dim3 grid(params.h, params.b);
  fmha_dgrad_fp16_128_64_sm80_kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params);
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"
#include "fmha_dgrad_kernel_1xN_reload.h"

using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u>;

extern "C" __global__ void fmha_dgrad_fp16_256_64_sm80_kernel(Fused_multihead_attention_fprop_params params) {
  fmha::compute_dv_1xN<Kernel_traits>(params);
  fmha::compute_dq_dk_1xN<Kernel_traits>(params);
}

void run_fmha_dgrad_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params& params, cudaStream_t stream) {
  constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
  constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
  constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
  constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;

  using Smem_tile_s = fmha::Smem_tile_mma_transposed<Kernel_traits::Cta_tile_p>;
  constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
  static_assert(smem_size_s == 16 * 256 * 2);
  static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N);

  constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax;
  constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v;
  constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk);

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(fmha_dgrad_fp16_256_64_sm80_kernel,
                                         cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }
  dim3 grid(params.h, params.b);
  fmha_dgrad_fp16_256_64_sm80_kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params);
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"
#include "fmha_dgrad_kernel_1xN_reload.h"

using Kernel_traits = FMHA_kernel_traits<384, 64, 16, 1, 8, 0x08u>;

extern "C" __global__ void fmha_dgrad_fp16_384_64_sm80_kernel(Fused_multihead_attention_fprop_params params) {
  fmha::compute_dv_1xN<Kernel_traits>(params);
  fmha::compute_dq_dk_1xN<Kernel_traits>(params);
}

void run_fmha_dgrad_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params& params, cudaStream_t stream) {
  constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
  constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
  constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
  constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;

  using Smem_tile_s = fmha::Smem_tile_mma_transposed<Kernel_traits::Cta_tile_p>;
  constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
  static_assert(smem_size_s == 16 * 384 * 2);
  static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N);

  constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax;
  constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v;
  constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk);

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(fmha_dgrad_fp16_384_64_sm80_kernel,
                                         cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }
  dim3 grid(params.h, params.b);
  fmha_dgrad_fp16_384_64_sm80_kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params);
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"
#include "fmha_dgrad_kernel_1xN_reload.h"
#include "fmha_dgrad_kernel_1xN_reload_nl.h"

using Kernel_traits = FMHA_kernel_traits<512, 64, 16, 1, 8, 0x08u>;

extern "C" __global__ void fmha_dgrad_fp16_512_64_sm80_kernel(Fused_multihead_attention_fprop_params params) {
  fmha::compute_dv_1xN<Kernel_traits>(params);
  fmha::compute_dq_dk_1xN<Kernel_traits>(params);
}

template <int CHUNKS>
__global__ void fmha_dgrad_fp16_512_64_sm80_nl_kernel(Fused_multihead_attention_fprop_params params) {
  fmha::compute_dv_1xN_nl<CHUNKS, Kernel_traits>(params);
  fmha::compute_dq_dk_1xN_nl<CHUNKS, Kernel_traits>(params);
}

void run_fmha_dgrad_fp16_512_64_sm80(const Fused_multihead_attention_fprop_params& params, cudaStream_t stream) {
  constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
  constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
  constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
  constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;

  using Smem_tile_s = fmha::Smem_tile_mma_transposed<Kernel_traits::Cta_tile_p>;
  constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
  static_assert(smem_size_s == 16 * 512 * 2);
  static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N);

  constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax;
  constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v;
  constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk);

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(fmha_dgrad_fp16_512_64_sm80_kernel,
                                         cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }
  dim3 grid(params.h, params.b);
  fmha_dgrad_fp16_512_64_sm80_kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params);
}

void run_fmha_dgrad_fp16_512_64_sm80_nl(const Fused_multihead_attention_fprop_params& params, const int num_chunks,
                                        cudaStream_t stream) {
  constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
  constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
  constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
  constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;

  using Smem_tile_s = fmha::Smem_tile_mma_transposed<Kernel_traits::Cta_tile_p>;
  constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
  static_assert(smem_size_s == 16 * 512 * 2);
  static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N);

  constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax;
  constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v;
  constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk);

  auto kernel = fmha_dgrad_fp16_512_64_sm80_nl_kernel<2>;

  if (num_chunks == 2) {
    kernel = fmha_dgrad_fp16_512_64_sm80_nl_kernel<2>;
  } else if (num_chunks == 3) {
    kernel = fmha_dgrad_fp16_512_64_sm80_nl_kernel<3>;
  } else {
    assert(false && "Unsupperted number of chunks");
  }

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }

  dim3 grid(params.h, params.b, num_chunks);

  kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params);

  FMHA_CHECK_CUDA(cudaPeekAtLastError());
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <fmha/gemm.h>
#include <fmha/kernel_traits.h>

#include "fmha_kernel.h"

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Kernel_traits, typename Params>
inline __device__ void compute_dv_1xN(const Params& params) {
  // The description of the CTA tile for the 1st batched GEMM.
  using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
  // The description of the CTA tile for the 2nd batched GEMM.
  using Cta_tile_dv =
      fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;

  static_assert(Cta_tile_dv::M == 512 || Cta_tile_dv::M == 384 || Cta_tile_dv::M == 256 || Cta_tile_dv::M == 128);
  static_assert(Cta_tile_dv::N == 64);
  static_assert(Cta_tile_dv::K == 16);

  // The MMA tile for the 1st GEMM.
  using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
  // The MMA tile for the 2nd GEMM.
  using Mma_tile_dv = fmha::Hmma_tile<Cta_tile_dv>;

  // The global memory tile to load Q.
  using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
  // The shared memory tile to swizzle Q.
  // using Smem_tile_q = typename Kernel_traits::Smem_tile_q;
  using Smem_tile_q = fmha::Smem_tile_a<Cta_tile_p, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
  // The shared memory tile to reload Q as fragment b.
  using Smem_tile_qt = fmha::Smem_tile_b<Cta_tile_dv, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;

  // The global memory tile to load K.
  using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;
  // The shared memory tile to swizzle K.
  using Smem_tile_k = typename Kernel_traits::Smem_tile_k;

  // The global memory tile to load V.
  using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle V.
  using Smem_tile_v = typename Kernel_traits::Smem_tile_v;

  // The global memory tile to store O.
  using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o;
  // The shared memory tile to swizzle O.
  using Smem_tile_o = typename Kernel_traits::Smem_tile_o;

  // The global memory tile to store dV.
  using Gmem_tile_dv = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle dV.
  using Smem_tile_dv = fmha::Smem_tile_mma_epilogue<Cta_tile_dv>;
  static_assert(Smem_tile_dv::NUM_LDS == Gmem_tile_dv::LDGS);
  static_assert(Smem_tile_dv::THREADS_PER_ROW == Gmem_tile_dv::THREADS_PER_ROW);

  using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;
  using Smem_tile_st = typename Kernel_traits::Smem_tile_st;
  using Gmem_tile_do = typename Kernel_traits::Gmem_tile_do;

  // Shared memory.
  extern __shared__ char smem_[];

  // The block index for the batch.
  const int bidb = blockIdx.y;
  // The block index for the head.
  const int bidh = blockIdx.x;
  // The thread index.
  const int tidx = threadIdx.x;

  const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
  if (binfo.stop_early()) return;
  Mask<Cta_tile_p> mask(params, binfo, tidx);

  // Allocate the global memory tile loader for Q.
  Gmem_tile_do gmem_q(params, binfo, tidx);  // treating dout as Q
  // Allocate the shared memory tile loader for Q.
  Smem_tile_q smem_q(&smem_[0], tidx);
  Smem_tile_qt smem_qt(&smem_[0], tidx);
  Smem_tile_st smem_s(&smem_[Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE], tidx);

  // Allocate the global memory tile loader for K.
  Gmem_tile_k gmem_k(params, 2, binfo, tidx);  // treating V as K
  // Allocate the shared memory tile loader for K.
  Smem_tile_k smem_k(&smem_[Smem_tile_q::BYTES_PER_TILE], tidx);

  // Trigger the loads for Q.
  gmem_q.load(smem_q);
  // Trigger the loads for K.
  gmem_k.load(smem_k);

  // Commit the data for Q and K to shared memory.
  gmem_q.commit(smem_q);
  gmem_k.commit(smem_k);

  // Make sure the data is in shared memory.
  __syncthreads();

  // Load the fragments for Q.
  typename Smem_tile_q::Fragment frag_q[2][Mma_tile_p::MMAS_M];
  smem_q.load(frag_q[0], 0);

  typename Smem_tile_qt::Fragment frag_qt[2][Mma_tile_dv::MMAS_N];
  static_assert(Smem_tile_qt::Fragment::NUM_REGS == 4);
  static_assert(Mma_tile_dv::MMAS_K == 1);
  smem_qt.load(frag_qt[0], 0);

  // Load the fragments for K. We keep the data in registers during the entire kernel.
  typename Smem_tile_k::Fragment frag_k[2][Mma_tile_p::MMAS_N];
  smem_k.load(frag_k[0], 0);

  enum { BITS_PER_ELT_S = sizeof(fmha::A_type) * 8 };

  Gmem_tile_s gmem_s(params, binfo, tidx);

  // Create the object to do the softmax.
  using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;
  Softmax softmax(params,
                  &smem_[Smem_tile_q::BYTES_PER_TILE + Smem_tile_st::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE],
                  bidb, tidx);

  enum { THREADS_PER_ROW = 32 };
  enum { M = Mma_tile_p::MMAS_M };
  enum { N = Mma_tile_p::MMAS_N };

  // Declare the accumulators for the 2nd gemm.
  fmha::Fragment_accumulator acc_dv[Mma_tile_dv::MMAS_M][Mma_tile_dv::MMAS_N];
  fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dv::WARPS_K>::apply(acc_dv);

  enum { STEPS = Cta_tile_p::N / Cta_tile_p::M };
  // Load over the entire sequence length.
  for (int l = 0; l < STEPS; l++) {
    const int loop = l * Cta_tile_p::M;
    if (loop >= binfo.actual_seqlen) break;

    // Load S
    uint4 s_regs[M][N];
    gmem_s.load(s_regs, mask);
    fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
    fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);
// Do this part of P^T = (Q * K^T)^T.
#pragma unroll
    for (int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      smem_q.load(frag_q[ki & 1], ki);
      smem_k.load(frag_k[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_p, frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
    }

    // Store s * dmask to smem for transpose
    smem_s.store(s_regs);

    // Declare the accumulators for the 1st gemm.
    // Do the final stage of math.
    {
      int ki = Mma_tile_p::MMAS_K;
      fmha::gemm(acc_p, frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
    }
    // Trigger the load for the next Q values. We're using double buffering, so reading qt is safe
    if (l < STEPS - 1) {
      smem_q.move_to_next_write_buffer();
      gmem_q.move();
      gmem_q.load(smem_q);
    }

    // Convert from the accumulator type to FP32 for Softmax.
    softmax.unpack(acc_p);

    float s_mat[2 * M][4 * N];

#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ni = 0; ni < N; ni++) {
        uint4& dst = s_regs[mi][ni];
        fmha::half2_to_float2(s_mat[2 * mi + 0][4 * ni + 0], s_mat[2 * mi + 0][4 * ni + 1], dst.x);
        fmha::half2_to_float2(s_mat[2 * mi + 0][4 * ni + 2], s_mat[2 * mi + 0][4 * ni + 3], dst.y);
        fmha::half2_to_float2(s_mat[2 * mi + 1][4 * ni + 0], s_mat[2 * mi + 1][4 * ni + 1], dst.z);
        fmha::half2_to_float2(s_mat[2 * mi + 1][4 * ni + 2], s_mat[2 * mi + 1][4 * ni + 3], dst.w);
      }
    }

#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ii = 0; ii < 2; ii++) {
#pragma unroll
        for (int ni = 0; ni < N; ni++) {
#pragma unroll
          for (int jj = 0; jj < 4; jj++) {
            float& s_dmask = s_mat[2 * mi + ii][4 * ni + jj];
            const bool drop = reinterpret_cast<const uint32_t&>(s_dmask) & 0x80000000;
            const float d_s = drop ? 0.f : softmax.elt_[2 * mi + ii][4 * ni + jj] * params.rp_dropout;
            s_dmask = fabsf(s_dmask);
            softmax.elt_[2 * mi + ii][4 * ni + jj] = d_s * fabsf(s_dmask);
          }
        }
      }
    }

    float p_sum[2 * M];
    softmax.reduce_sum(p_sum);

    const float scalef = reinterpret_cast<const float&>(params.scale_softmax);
#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ii = 0; ii < 2; ii++) {
#pragma unroll
        for (int ni = 0; ni < N; ni++) {
#pragma unroll
          for (int jj = 0; jj < 4; jj++) {
            softmax.elt_[2 * mi + ii][4 * ni + jj] -= p_sum[2 * mi + ii] * (s_mat[2 * mi + ii][4 * ni + jj]);
            softmax.elt_[2 * mi + ii][4 * ni + jj] *= scalef;
          }
        }
      }
    }
    typename Smem_tile_st::Fragment frag_s[Mma_tile_dv::MMAS_K][Mma_tile_dv::MMAS_M];
    smem_s.load(frag_s);
    for (int ki = 0; ki < Mma_tile_dv::MMAS_K; ki++) {
      for (int mi = 0; mi < Mma_tile_dv::MMAS_M; mi++) {
        for (int ii = 0; ii < Smem_tile_st::Fragment::NUM_REGS; ii++) {
          frag_s[ki][mi].reg(ii) = fmha::hmul2(frag_s[ki][mi].reg(ii), params.scale_dropout);
          frag_s[ki][mi].reg(ii) = fmha::hrelu2(frag_s[ki][mi].reg(ii));
        }
      }
    }

    gmem_s.store(softmax.elt_, mask);
    gmem_s.move();

#pragma unroll
    for (int ki = 1; ki < Mma_tile_dv::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      smem_qt.load(frag_qt[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_dv, frag_s[(ki - 1)], frag_qt[(ki - 1) & 1]);
    }

    // Do the final stage of math.
    {
      int ki = Mma_tile_dv::MMAS_K;
      fmha::gemm(acc_dv, frag_s[(ki - 1)], frag_qt[(ki - 1) & 1]);
    }
    // Commit the values for Q into shared memory.
    if (l < STEPS - 1) {
      gmem_q.commit(smem_q);
    }

    // Make sure we are reading from the correct buffer.
    smem_q.move_to_next_read_buffer();
    smem_qt.move_to_next_read_buffer();

    // Make sure the data is in shared memory.
    __syncthreads();

    // Trigger the loads for the values of Q for the next iteration.
    smem_q.load(frag_q[0], 0);
    smem_k.load(frag_k[0], 0);
    smem_qt.load(frag_qt[0], 0);

  }  // Outer loop over the sequence length.

  // Epilogue swizzle for dV
  Smem_tile_dv smem_dv(&smem_[Kernel_traits::Smem_tile_q::BYTES_PER_TILE], tidx);
  smem_dv.store(acc_dv);

  __syncthreads();
  uint4 dv_out[Smem_tile_dv::NUM_LDS];
  smem_dv.load(dv_out);
  Qkv_params dv_params;
  dv_params.qkv_ptr = params.dqkv_ptr;
  dv_params.qkv_stride_in_bytes = params.qkv_stride_in_bytes;
  dv_params.h = params.h;
  Gmem_tile_dv gmem_dv(dv_params, 2, binfo, tidx);
  gmem_dv.store(dv_out);
}

template <typename Kernel_traits, typename Params>
inline __device__ void compute_dq_dk_1xN(const Params& params) {
  // The description of the CTA tile for the 1st batched GEMM.
  using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
  using Cta_tile_o = typename Kernel_traits::Cta_tile_o;
  // The description of the CTA tile for the 2nd batched GEMM.
  using Cta_tile_dk =
      fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;
  static_assert(Cta_tile_dk::M == 512 || Cta_tile_dk::M == 384 || Cta_tile_dk::M == 256 || Cta_tile_dk::M == 128);
  static_assert(Cta_tile_dk::N == 64);
  static_assert(Cta_tile_dk::K == 16);

  // The MMA tile for the 1st GEMM.
  using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
  using Mma_tile_o = fmha::Hmma_tile<Cta_tile_o>;
  // The MMA tile for the 2nd GEMM.
  using Mma_tile_dk = fmha::Hmma_tile<Cta_tile_dk>;

  // The global memory tile to load Q.
  using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
  // The shared memory tile to swizzle Q.
  using Smem_tile_q = typename Kernel_traits::Smem_tile_q;

  // The global memory tile to load K.
  using Gmem_tile_k = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle K.
  using Smem_tile_k = typename Kernel_traits::Smem_tile_v;  // K is used like V in fprop

  // The global memory tile to load V.
  using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle V.
  using Smem_tile_v = typename Kernel_traits::Smem_tile_v;

  // The global memory tile to store O.
  // using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o;
  using Gmem_tile_o = fmha::Gmem_tile_dq<Cta_tile_o>;
  // The shared memory tile to swizzle O.
  using Smem_tile_o = typename Kernel_traits::Smem_tile_o;

  // The global memory tile to store dK.
  using Gmem_tile_dk = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle dK.
  using Smem_tile_dk = fmha::Smem_tile_mma_epilogue<Cta_tile_dk>;
  static_assert(Smem_tile_dk::NUM_LDS == Gmem_tile_dk::LDGS);
  static_assert(Smem_tile_dk::THREADS_PER_ROW == Gmem_tile_dk::THREADS_PER_ROW);

  // The shared memory tile to reload Q transposed.
  using Smem_tile_qt = fmha::Smem_tile_b<Cta_tile_dk, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 1>;

  using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;

  using Smem_tile_st = typename Kernel_traits::Smem_tile_st;

  enum { M = Mma_tile_p::MMAS_M };
  enum { N = Mma_tile_p::MMAS_N };
  static_assert(M == Mma_tile_o::MMAS_M);
  static_assert(N == Mma_tile_o::MMAS_K);
  // Shared memory.
  extern __shared__ char smem_[];

  // The block index for the batch.
  const int bidb = blockIdx.y;
  // The block index for the head.
  const int bidh = blockIdx.x;
  // The thread index.
  const int tidx = threadIdx.x;

  const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
  if (binfo.stop_early()) return;

  Mask<Cta_tile_p> mask(params, binfo, tidx);

  // Allocate the global memory tile loader for Q.
  Gmem_tile_q gmem_q(params, 0, binfo, tidx);
  // Allocate the shared memory tile loader for Q.
  Smem_tile_q smem_q(&smem_[0], tidx);
  Smem_tile_qt smem_qt(&smem_[0], tidx);
  Smem_tile_st smem_s(&smem_[Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE],
                      tidx);

  // Allocate the global memory tile loader for K.
  Gmem_tile_k gmem_k(params, 1, binfo, tidx);
  // Allocate the shared memory tile loader for K.
  Smem_tile_k smem_k(&smem_[Smem_tile_q::BYTES_PER_TILE], tidx);

  // Allocate the global memory tile loader for O.
  Gmem_tile_o gmem_o(params, binfo, tidx);
  // Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
  Smem_tile_o smem_o(&smem_[Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE], tidx);

  // Trigger the loads for Q.
  gmem_q.load(smem_q);
  // Trigger the loads for K.
  gmem_k.load(smem_k);

  Gmem_tile_s gmem_s(params, binfo, tidx);
  // Load dP
  uint4 s_regs[M][N];
  gmem_s.load(s_regs, mask);
  gmem_s.move();

  // Commit the data for Q and K to shared memory.
  gmem_q.commit(smem_q);
  gmem_k.commit(smem_k);

  // Make sure the data is in shared memory.
  __syncthreads();

  typename Smem_tile_qt::Fragment frag_qt[2][Mma_tile_dk::MMAS_N];
  smem_qt.load(frag_qt[0], 0);
  typename Smem_tile_k::Fragment frag_k[2][Mma_tile_o::MMAS_N];
  smem_k.load(frag_k[0], 0);

  enum { BITS_PER_ELT_S = sizeof(fmha::A_type) * 8 };

  enum { THREADS_PER_ROW = 32 };
  enum { STEPS = Cta_tile_p::N / Cta_tile_p::M };

  // Declare the accumulators for the 2nd gemm.
  fmha::Fragment_accumulator acc_dk[Mma_tile_dk::MMAS_M][Mma_tile_dk::MMAS_N];
  fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dk::WARPS_K>::apply(acc_dk);

  // Load over the entire sequence length.
  for (int l = 0; l < STEPS; l++) {
    const int loop = l * Cta_tile_p::M;
    if (loop >= binfo.actual_seqlen) break;

    // Pack dP as Fragment_a
    fmha::Fragment_a<fmha::Row> frag_p[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M];
#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ni = 0; ni < N; ni++) {
        uint4& dst = s_regs[mi][ni];
        frag_p[ni][mi].reg(0) = dst.x;  // row 0, cols 0,1
        frag_p[ni][mi].reg(1) = dst.z;  // row 8, cols 0,1
        frag_p[ni][mi].reg(2) = dst.y;  // row 0, cols 8,9
        frag_p[ni][mi].reg(3) = dst.w;  // row 8, cols 8,9
      }
    }

    // Declare the accumulators for the 1st gemm.
    fmha::Fragment_accumulator acc_o[Mma_tile_o::MMAS_M][Mma_tile_o::MMAS_N];
    fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_o::WARPS_K>::apply(acc_o);

// Do this part of O = P^T * V^T. dQ = dP x dK
#pragma unroll
    for (int ki = 1; ki < Mma_tile_o::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      smem_k.load(frag_k[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_o, frag_p[ki - 1], frag_k[(ki - 1) & 1]);
    }

    // Do the final stage of math.
    {
      int ki = Mma_tile_o::MMAS_K;
      fmha::gemm(acc_o, frag_p[ki - 1], frag_k[(ki - 1) & 1]);
    }

    // Store dP to smem for transpose
    smem_s.store(s_regs);
    if (l < STEPS - 1) {
      // Load next part of S
      gmem_s.load(s_regs, mask);
      gmem_s.move();
      smem_q.move_to_next_write_buffer();
      gmem_q.move();
      gmem_q.load(smem_q);
    }
// Loop over MMAS_M.
#pragma unroll
    for (int ii = 0; ii < Gmem_tile_o::LOOPS; ++ii) {
      // Swizzle the elements and do the final reduction.
      smem_o.store(acc_o, ii);

      // Make sure the data is in shared memory.
      __syncthreads();

      // Load from shared memory.
      uint4 out[Gmem_tile_o::STGS_PER_LOOP];
      smem_o.load(out);

      // Make sure the data was read from shared memory.
      if (ii < Gmem_tile_o::LOOPS - 1) {
        __syncthreads();
      }

      // Output the values.
      gmem_o.store(out, ii);
    }

    // Move to the next part of the output.
    gmem_o.move();

    typename Smem_tile_st::Fragment frag_s[Mma_tile_dk::MMAS_K][Mma_tile_dk::MMAS_M];
    smem_s.load(frag_s);

#pragma unroll
    for (int ki = 1; ki < Mma_tile_dk::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      smem_qt.load(frag_qt[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_dk, frag_s[(ki - 1)], frag_qt[(ki - 1) & 1]);
    }

    // Do the final stage of math.
    {
      int ki = Mma_tile_dk::MMAS_K;
      fmha::gemm(acc_dk, frag_s[(ki - 1)], frag_qt[(ki - 1) & 1]);
    }

    // Commit the values for Q into shared memory.
    if (l < STEPS - 1) {
      gmem_q.commit(smem_q);
    }

    // Make sure the data is in shared memory.
    __syncthreads();

    // Trigger the loads for the values of Q for the next iteration.
    smem_qt.load(frag_qt[0], 0);
    smem_k.load(frag_k[0], 0);

  }  // Outer loop over the sequence length.

  // Epilogue swizzle for dK
  Smem_tile_dk smem_dk(&smem_[0], tidx);
  smem_dk.store(acc_dk);
  __syncthreads();
  uint4 dk_out[Smem_tile_dk::NUM_LDS];
  smem_dk.load(dk_out);
  Qkv_params dk_params;
  dk_params.qkv_ptr = params.dqkv_ptr;
  dk_params.qkv_stride_in_bytes = params.qkv_stride_in_bytes;
  dk_params.h = params.h;
  Gmem_tile_dk gmem_dk(dk_params, 1, binfo, tidx);
  gmem_dk.store(dk_out);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload_nl.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <fmha/gemm.h>
#include <fmha/kernel_traits.h>

#include "fmha_kernel.h"

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int CHUNKS, typename Kernel_traits, typename Params>
inline __device__ void compute_dv_1xN_nl(const Params& params) {
  // The description of the CTA tile for the 1st batched GEMM.
  using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
  // The description of the CTA tile for the 2nd batched GEMM.
  using Cta_tile_dv =
      fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;

  static_assert(Cta_tile_dv::M == 512 || Cta_tile_dv::M == 384 || Cta_tile_dv::M == 256 || Cta_tile_dv::M == 128);
  static_assert(Cta_tile_dv::N == 64);
  static_assert(Cta_tile_dv::K == 16);

  // The MMA tile for the 1st GEMM.
  using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
  // The MMA tile for the 2nd GEMM.
  using Mma_tile_dv = fmha::Hmma_tile<Cta_tile_dv>;

  // The global memory tile to load Q.
  using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
  // The shared memory tile to swizzle Q.
  using Smem_tile_q = fmha::Smem_tile_a<Cta_tile_p, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
  // The shared memory tile to reload Q as fragment b.
  using Smem_tile_qt = fmha::Smem_tile_b<Cta_tile_dv, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;

  // The global memory tile to load K.
  using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;
  // The shared memory tile to swizzle K.
  using Smem_tile_k = typename Kernel_traits::Smem_tile_k;

  // The global memory tile to load V.
  using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle V.
  using Smem_tile_v = typename Kernel_traits::Smem_tile_v;

  // The global memory tile to store dV.
  using Gmem_tile_dv = fmha::Gmem_tile_qkv<typename Kernel_traits::Cta_tile_o, fmha::BITS_PER_ELEMENT_B,
                                           Cta_tile_p::N,  // S,
                                           Cta_tile_p::K,  // D,
                                           2 * CHUNKS>;

  // The shared memory tile to swizzle dV.
  using Smem_tile_dv = fmha::Smem_tile_mma_epilogue<Cta_tile_dv>;
  static_assert(Smem_tile_dv::NUM_LDS == Gmem_tile_dv::LDGS);
  static_assert(Smem_tile_dv::THREADS_PER_ROW == Gmem_tile_dv::THREADS_PER_ROW);

  using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;
  using Smem_tile_st = typename Kernel_traits::Smem_tile_st;
  using Gmem_tile_do = typename Kernel_traits::Gmem_tile_do;

  // Shared memory.
  extern __shared__ char smem_[];

  // The block index for the chunk.
  const int bidc = blockIdx.z;
  // The block index for the batch.
  const int bidb = blockIdx.y;
  // The block index for the head.
  const int bidh = blockIdx.x;
  // The thread index.
  const int tidx = threadIdx.x;

  const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
  if (binfo.stop_early()) return;
  fmha::Mask<Cta_tile_p> mask(params, binfo, tidx);

  // Allocate the global memory tile loader for Q.
  Gmem_tile_do gmem_q(params, binfo, tidx);  // treating dout as Q
  // Allocate the shared memory tile loader for Q.
  Smem_tile_q smem_q(&smem_[0], tidx);
  Smem_tile_qt smem_qt(&smem_[0], tidx);
  Smem_tile_st smem_s(&smem_[Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE], tidx);

  // Allocate the global memory tile loader for K.
  Gmem_tile_k gmem_k(params, 2, binfo, tidx);  // treating V as K
  // Allocate the shared memory tile loader for K.
  Smem_tile_k smem_k(&smem_[Smem_tile_q::BYTES_PER_TILE], tidx);

  Gmem_tile_s gmem_s(params, binfo, tidx);

  using Noloop = Noloop_traits<CHUNKS, Cta_tile_p>;

  Noloop nl_traits(bidc, binfo);
  nl_traits.move_all(gmem_q, gmem_s);

  // Trigger the loads for Q.
  gmem_q.load(smem_q);
  // Trigger the loads for K.
  gmem_k.load(smem_k);

  // Commit the data for Q and K to shared memory.
  gmem_q.commit(smem_q);
  gmem_k.commit(smem_k);

  // Make sure the data is in shared memory.
  __syncthreads();

  // Load the fragments for Q.
  typename Smem_tile_q::Fragment frag_q[2][Mma_tile_p::MMAS_M];
  smem_q.load(frag_q[0], 0);

  typename Smem_tile_qt::Fragment frag_qt[2][Mma_tile_dv::MMAS_N];
  static_assert(Smem_tile_qt::Fragment::NUM_REGS == 4);
  static_assert(Mma_tile_dv::MMAS_K == 1);
  smem_qt.load(frag_qt[0], 0);

  // Load the fragments for K. We keep the data in registers during the entire kernel.
  typename Smem_tile_k::Fragment frag_k[2][Mma_tile_p::MMAS_N];
  smem_k.load(frag_k[0], 0);

  enum { BITS_PER_ELT_S = sizeof(fmha::A_type) * 8 };

  // Create the object to do the softmax.
  using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;
  Softmax softmax(params,
                  &smem_[Smem_tile_q::BYTES_PER_TILE + Smem_tile_st::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE],
                  bidb, tidx);

  enum { THREADS_PER_ROW = 32 };
  enum { M = Mma_tile_p::MMAS_M };
  enum { N = Mma_tile_p::MMAS_N };

  // Declare the accumulators for the 2nd gemm.
  fmha::Fragment_accumulator acc_dv[Mma_tile_dv::MMAS_M][Mma_tile_dv::MMAS_N];
  fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dv::WARPS_K>::apply(acc_dv);

  // Load over the entire sequence length.
  for (int l = 0; l < nl_traits.num_steps_; l++) {
    uint4 s_regs[M][N];
    gmem_s.load(s_regs, mask);
    fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
    fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);
// Do this part of P^T = (Q * K^T)^T.
#pragma unroll
    for (int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      smem_q.load(frag_q[ki & 1], ki);
      smem_k.load(frag_k[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_p, frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
    }

    smem_s.store(s_regs);

    // Declare the accumulators for the 1st gemm.
    // Do the final stage of math.
    {
      int ki = Mma_tile_p::MMAS_K;
      fmha::gemm(acc_p, frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
    }
    // Trigger the load for the next Q values. We're using double buffering, so reading qt is safe
    if (l < nl_traits.num_steps_ - 1) {
      smem_q.move_to_next_write_buffer();
      gmem_q.move();
      gmem_q.load(smem_q);
    }
    // Convert from the accumulator type to FP32 for Softmax.
    softmax.unpack(acc_p);

    float s_mat[2 * M][4 * N];

#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ni = 0; ni < N; ni++) {
        uint4& dst = s_regs[mi][ni];
        fmha::half2_to_float2(s_mat[2 * mi + 0][4 * ni + 0], s_mat[2 * mi + 0][4 * ni + 1], dst.x);
        fmha::half2_to_float2(s_mat[2 * mi + 0][4 * ni + 2], s_mat[2 * mi + 0][4 * ni + 3], dst.y);
        fmha::half2_to_float2(s_mat[2 * mi + 1][4 * ni + 0], s_mat[2 * mi + 1][4 * ni + 1], dst.z);
        fmha::half2_to_float2(s_mat[2 * mi + 1][4 * ni + 2], s_mat[2 * mi + 1][4 * ni + 3], dst.w);
      }
    }

#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ii = 0; ii < 2; ii++) {
#pragma unroll
        for (int ni = 0; ni < N; ni++) {
#pragma unroll
          for (int jj = 0; jj < 4; jj++) {
            float& s_dmask = s_mat[2 * mi + ii][4 * ni + jj];
            const bool drop = reinterpret_cast<const uint32_t&>(s_dmask) & 0x80000000;
            const float d_s = drop ? 0.f : softmax.elt_[2 * mi + ii][4 * ni + jj] * params.rp_dropout;
            s_dmask = fabsf(s_dmask);
            softmax.elt_[2 * mi + ii][4 * ni + jj] = d_s * (s_dmask);
          }
        }
      }
    }

    float p_sum[2 * M];
    softmax.reduce_sum(p_sum);

    const float scalef = reinterpret_cast<const float&>(params.scale_softmax);
#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ii = 0; ii < 2; ii++) {
#pragma unroll
        for (int ni = 0; ni < N; ni++) {
#pragma unroll
          for (int jj = 0; jj < 4; jj++) {
            softmax.elt_[2 * mi + ii][4 * ni + jj] -= p_sum[2 * mi + ii] * (s_mat[2 * mi + ii][4 * ni + jj]);
            softmax.elt_[2 * mi + ii][4 * ni + jj] *= scalef;
          }
        }
      }
    }

    typename Smem_tile_st::Fragment frag_s[Mma_tile_dv::MMAS_K][Mma_tile_dv::MMAS_M];
    smem_s.load(frag_s);
    for (int ki = 0; ki < Mma_tile_dv::MMAS_K; ki++) {
      for (int mi = 0; mi < Mma_tile_dv::MMAS_M; mi++) {
        for (int ii = 0; ii < Smem_tile_st::Fragment::NUM_REGS; ii++) {
          frag_s[ki][mi].reg(ii) = fmha::hmul2(frag_s[ki][mi].reg(ii), params.scale_dropout);
          frag_s[ki][mi].reg(ii) = fmha::hrelu2(frag_s[ki][mi].reg(ii));
        }
      }
    }

    gmem_s.store(softmax.elt_, mask);
    gmem_s.move();

    static_assert(Mma_tile_dv::MMAS_K == 1);  // DEBUG
#pragma unroll
    for (int ki = 1; ki < Mma_tile_dv::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      smem_qt.load(frag_qt[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_dv, frag_s[(ki - 1)], frag_qt[(ki - 1) & 1]);
    }

    // Do the final stage of math.
    {
      int ki = Mma_tile_dv::MMAS_K;
      fmha::gemm(acc_dv, frag_s[(ki - 1)], frag_qt[(ki - 1) & 1]);
    }
    // Commit the values for Q into shared memory.
    if (l < nl_traits.num_steps_ - 1) {
      gmem_q.commit(smem_q);
    }

    // Make sure we are reading from the correct buffer.
    smem_q.move_to_next_read_buffer();
    smem_qt.move_to_next_read_buffer();

    // Make sure the data is in shared memory.
    __syncthreads();

    // Trigger the loads for the values of Q for the next iteration.
    smem_q.load(frag_q[0], 0);
    smem_k.load(frag_k[0], 0);
    smem_qt.load(frag_qt[0], 0);

  }  // Outer loop over the sequence length.

  // Epilogue for dV = (S * D)' * dout'. We're fully exposed to this!

  // Epilogue swizzle for dV
  Smem_tile_dv smem_dv(&smem_[Kernel_traits::Smem_tile_q::BYTES_PER_TILE], tidx);
  smem_dv.store(acc_dv);

  __syncthreads();

  uint4 dv_out[Smem_tile_dv::NUM_LDS];
  smem_dv.load(dv_out);
  Qkv_params dv_params;
  dv_params.qkv_ptr = params.dkv_ptr;
  dv_params.qkv_stride_in_bytes = params.h * 2 * CHUNKS * params.d * sizeof(half);
  dv_params.h = params.h;
  Gmem_tile_dv gmem_dv(dv_params, nl_traits.get_idx_dv(), binfo, tidx);
  gmem_dv.store(dv_out);
}

template <int CHUNKS, typename Kernel_traits, typename Params>
inline __device__ void compute_dq_dk_1xN_nl(const Params& params) {
  // The description of the CTA tile for the 1st batched GEMM.
  using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
  using Cta_tile_o = typename Kernel_traits::Cta_tile_o;
  // The description of the CTA tile for the 2nd batched GEMM.
  using Cta_tile_dk =
      fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;

  static_assert(Cta_tile_dk::M == 512 || Cta_tile_dk::M == 384 || Cta_tile_dk::M == 256 || Cta_tile_dk::M == 128);
  static_assert(Cta_tile_dk::N == 64);
  static_assert(Cta_tile_dk::K == 16);

  // The MMA tile for the 1st GEMM.
  using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
  using Mma_tile_o = fmha::Hmma_tile<Cta_tile_o>;
  // The MMA tile for the 2nd GEMM.
  using Mma_tile_dk = fmha::Hmma_tile<Cta_tile_dk>;

  // The global memory tile to load Q.
  using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
  // The shared memory tile to swizzle Q.
  using Smem_tile_q = typename Kernel_traits::Smem_tile_q;

  // The global memory tile to load K.
  using Gmem_tile_k = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle K.
  using Smem_tile_k = typename Kernel_traits::Smem_tile_v;  // K is used like V in fprop

  // The global memory tile to load V.
  using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle V.
  using Smem_tile_v = typename Kernel_traits::Smem_tile_v;

  // The global memory tile to store O.
  using Gmem_tile_o = Gmem_tile_dq<Cta_tile_o>;
  // The shared memory tile to swizzle O.
  using Smem_tile_o = typename Kernel_traits::Smem_tile_o;

  // The global memory tile to store dK.
  using Gmem_tile_dk = fmha::Gmem_tile_qkv<typename Kernel_traits::Cta_tile_o, fmha::BITS_PER_ELEMENT_B,
                                           Cta_tile_p::N,  // S,
                                           Cta_tile_p::K,  // D,
                                           2 * CHUNKS>;

  // The shared memory tile to swizzle dK.
  using Smem_tile_dk = fmha::Smem_tile_mma_epilogue<Cta_tile_dk>;
  static_assert(Smem_tile_dk::NUM_LDS == Gmem_tile_dk::LDGS);
  static_assert(Smem_tile_dk::THREADS_PER_ROW == Gmem_tile_dk::THREADS_PER_ROW);

  // The shared memory tile to reload Q transposed.
  using Smem_tile_qt = fmha::Smem_tile_b<Cta_tile_dk, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 1>;

  // The global memory tile to load dP, stored in S
  using Gmem_tile_s = Gmem_tile_mma_s<Cta_tile_p>;
  // The shared memory tile to transpose dP.
  using Smem_tile_st = Smem_tile_mma_transposed<Cta_tile_p>;

  using Noloop = Noloop_traits<CHUNKS, Cta_tile_p>;

  enum { M = Mma_tile_p::MMAS_M };
  enum { N = Mma_tile_p::MMAS_N };
  static_assert(M == Mma_tile_o::MMAS_M);
  static_assert(N == Mma_tile_o::MMAS_K);
  // Shared memory.
  extern __shared__ char smem_[];

  const int bidc = blockIdx.z;
  // The block index for the batch.
  const int bidb = blockIdx.y;
  // The block index for the head.
  const int bidh = blockIdx.x;
  // The thread index.
  const int tidx = threadIdx.x;

  const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
  if (binfo.stop_early()) return;

  fmha::Mask<Cta_tile_p> mask(params, binfo, tidx);

  // Allocate the global memory tile loader for Q.
  Gmem_tile_q gmem_q(params, 0, binfo, tidx);
  // Allocate the shared memory tile loader for Q (as B).
  Smem_tile_qt smem_qt(&smem_[0], tidx);
  // Allocate the global memory tile loader for dP.
  Gmem_tile_s gmem_s(params, binfo, tidx);
  // Allocate the shared memory tile loader for dP.
  Smem_tile_st smem_s(&smem_[Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE],
                      tidx);

  // Allocate the global memory tile loader for K.
  Gmem_tile_k gmem_k(params, 1, binfo, tidx);
  // Allocate the shared memory tile loader for K.
  Smem_tile_k smem_k(&smem_[Smem_tile_q::BYTES_PER_TILE], tidx);

  // Allocate the global memory tile loader for O.
  Gmem_tile_o gmem_o(params, binfo, tidx);
  // Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
  Smem_tile_o smem_o(&smem_[Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE], tidx);

  Noloop nl_traits(bidc, binfo);

  nl_traits.move_all(gmem_q, gmem_o, gmem_s);

  // Trigger the loads for Q.
  gmem_q.load(smem_qt);
  // Trigger the loads for K.
  gmem_k.load(smem_k);

  uint4 s_regs[M][N];
  gmem_s.load(s_regs, mask);

  // Commit the data for Q and K to shared memory.
  gmem_q.commit(smem_qt);
  gmem_k.commit(smem_k);

  // Make sure the data is in shared memory.
  __syncthreads();

  typename Smem_tile_qt::Fragment frag_qt[2][Mma_tile_dk::MMAS_N];
  smem_qt.load(frag_qt[0], 0);
  typename Smem_tile_k::Fragment frag_k[2][Mma_tile_o::MMAS_N];
  smem_k.load(frag_k[0], 0);

  enum { BITS_PER_ELT_S = sizeof(fmha::A_type) * 8 };

  enum { THREADS_PER_ROW = 32 };

  // Declare the accumulators for the 2nd gemm.
  fmha::Fragment_accumulator acc_dk[Mma_tile_dk::MMAS_M][Mma_tile_dk::MMAS_N];
  fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dk::WARPS_K>::apply(acc_dk);

  // Load over the entire sequence length.
  for (int l = 0; l < nl_traits.num_steps_; l++) {
    // Pack dP as Fragment_a
    fmha::Fragment_a<fmha::Row> frag_p[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M];
#pragma unroll
    for (int mi = 0; mi < M; mi++) {
#pragma unroll
      for (int ni = 0; ni < N; ni++) {
        uint4& dst = s_regs[mi][ni];
        frag_p[ni][mi].reg(0) = dst.x;
        frag_p[ni][mi].reg(1) = dst.z;
        frag_p[ni][mi].reg(2) = dst.y;
        frag_p[ni][mi].reg(3) = dst.w;
      }
    }
    smem_s.store(s_regs);
    if (l < nl_traits.num_steps_ - 1) {
      // Load next part of S
      gmem_s.move();
      gmem_s.load(s_regs, mask);
      // Trigger the load for the next Q values.
      smem_qt.move_to_next_write_buffer();
      gmem_q.move();
      gmem_q.load(smem_qt);
    }
    // Declare the accumulators for the 1st gemm.
    fmha::Fragment_accumulator acc_o[Mma_tile_o::MMAS_M][Mma_tile_o::MMAS_N];
    fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_o::WARPS_K>::apply(acc_o);

// Do this part of O = P^T * V^T. dQ = dP x dK
#pragma unroll
    for (int ki = 1; ki < Mma_tile_o::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      smem_k.load(frag_k[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_o, frag_p[ki - 1], frag_k[(ki - 1) & 1]);
    }

    // Do the final stage of math.
    {
      int ki = Mma_tile_o::MMAS_K;
      fmha::gemm(acc_o, frag_p[ki - 1], frag_k[(ki - 1) & 1]);
    }

    static_assert(Gmem_tile_o::LOOPS == 1);  // DEBUG
// Loop over MMAS_M.
#pragma unroll
    for (int ii = 0; ii < Gmem_tile_o::LOOPS; ++ii) {
      // Swizzle the elements and do the final reduction.
      smem_o.store(acc_o, ii);

      // Make sure the data is in shared memory.
      __syncthreads();

      // Load from shared memory.
      uint4 out[Gmem_tile_o::STGS_PER_LOOP];
      smem_o.load(out);

      // Make sure the data was read from shared memory.
      if (ii < Gmem_tile_o::LOOPS - 1) {
        __syncthreads();
      }

      // Output the values.
      gmem_o.store(out, ii);
    }

    // Move to the next part of the output.
    gmem_o.move();

    typename Smem_tile_st::Fragment frag_s[Mma_tile_dk::MMAS_K][Mma_tile_dk::MMAS_M];
    smem_s.load(frag_s);

    static_assert(Mma_tile_dk::MMAS_K == 1);  // DEBUG

#pragma unroll
    for (int ki = 1; ki < Mma_tile_dk::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      smem_qt.load(frag_qt[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_dk, frag_s[(ki - 1)], frag_qt[(ki - 1) & 1]);
    }

    // Do the final stage of math.
    {
      int ki = Mma_tile_dk::MMAS_K;
      fmha::gemm(acc_dk, frag_s[(ki - 1)], frag_qt[(ki - 1) & 1]);
    }

    // Commit the values for Q into shared memory.
    if (l < nl_traits.num_steps_ - 1) {
      gmem_q.commit(smem_qt);
      __syncthreads();
      // Trigger the loads for the values of Q for the next iteration.
      smem_qt.load(frag_qt[0], 0);
      smem_k.load(frag_k[0], 0);
    }

  }  // Outer loop over the sequence length.

  // Epilogue for dK = dP' * dq. We're fully exposed to this!

  // Epilogue swizzle for dK
  Smem_tile_dk smem_dk(&smem_[0], tidx);
  smem_dk.store(acc_dk);

  __syncthreads();

  uint4 dk_out[Smem_tile_dk::NUM_LDS];
  smem_dk.load(dk_out);
  Qkv_params dk_params;
  dk_params.qkv_ptr = params.dkv_ptr;
  dk_params.qkv_stride_in_bytes = params.h * 2 * CHUNKS * params.d * sizeof(half);
  dk_params.h = params.h;
  Gmem_tile_dk gmem_dk(dk_params, nl_traits.get_idx_dk(), binfo, tidx);
  gmem_dk.store(dk_out);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_fill.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>

constexpr int block_size = 512;
constexpr int ctas_per_sm = 4;

template <typename scalar_t>
__global__ void __launch_bounds__(block_size)
    mha_fill_kernel(scalar_t* out_tensor, const int32_t* const start_row, const size_t num_rows) {
  size_t row_stride = gridDim.y * blockDim.x;
  size_t row_index = blockIdx.x + (size_t)start_row[0];
  size_t col_index = blockIdx.y * blockDim.x + threadIdx.x;
  while (row_index < num_rows) {
    out_tensor[row_index * row_stride + col_index] = 0;
    row_index += gridDim.x;
  }
}

at::Tensor& mha_fill(at::Tensor& self, const at::Tensor& start_index) {
  auto max_tokens = self.size(0);
  auto self_2d = self.view({max_tokens, -1});
  auto fcd_size = self_2d.size(1);
  TORCH_CHECK(self.is_contiguous(), "input not contiguous");
  TORCH_CHECK(fcd_size % block_size == 0, "input size not aligned to block size");
  const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
  uint64_t num_blk_y = (uint64_t)(fcd_size / block_size);
  uint64_t num_blk_x = (uint64_t)std::ceil(num_mp * ctas_per_sm / num_blk_y);
  dim3 dim_grid(num_blk_x, num_blk_y);
  dim3 dim_block(block_size);

  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, self_2d.scalar_type(), "mha_padding_fill_", [&]() {
        mha_fill_kernel<<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
            self_2d.data_ptr<scalar_t>(), start_index.data_ptr<int32_t>(), max_tokens);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });
  return self;
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"
#include "fmha_fprop_kernel_1xN.h"

using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u>;

template <bool Is_training>
__global__ void fmha_fprop_fp16_128_64_sm80_kernel(Fused_multihead_attention_fprop_params params,
                                                   const int total_heads) {
  fmha::device_1xN<Kernel_traits, Is_training>(params, total_heads);
}

void run_fmha_fp16_128_64_sm80(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                               const bool configure) {
  auto kernel = launch_params.is_training ? &fmha_fprop_fp16_128_64_sm80_kernel<true>
                                          : &fmha_fprop_fp16_128_64_sm80_kernel<false>;

  constexpr int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>();

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }

  const int sm_count = launch_params.props->multiProcessorCount;
  int ctas_per_sm;
  FMHA_CHECK_CUDA(
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size));
  int total_ctas = sm_count * ctas_per_sm;

  const int heads_total = launch_params.params.b * launch_params.params.h;
  if (configure) {
    using Mma_tile_p = fmha::Hmma_tile<typename Kernel_traits::Cta_tile_p>;
    constexpr size_t STEPS = Kernel_traits::Cta_tile_p::N / Kernel_traits::Cta_tile_p::M;
    constexpr size_t MMAS_M = Mma_tile_p::MMAS_M;
    constexpr size_t MMAS_N = Mma_tile_p::MMAS_N;

    size_t heads_per_cta = ((heads_total + total_ctas - 1) / total_ctas);
    size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8;
    launch_params.elts_per_thread = heads_per_cta * elts_per_head;
    return;
  }

  dim3 grid(total_ctas);
  kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(launch_params.params, heads_total);

  FMHA_CHECK_CUDA(cudaPeekAtLastError());
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"
#include "fmha_fprop_kernel_1xN.h"

using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u>;

template <bool Is_training>
__global__ void fmha_fprop_fp16_256_64_sm80_kernel(Fused_multihead_attention_fprop_params params,
                                                   const int total_heads) {
  fmha::device_1xN<Kernel_traits, Is_training>(params, total_heads);
}

void run_fmha_fp16_256_64_sm80(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                               const bool configure) {
  auto kernel = launch_params.is_training ? &fmha_fprop_fp16_256_64_sm80_kernel<true>
                                          : &fmha_fprop_fp16_256_64_sm80_kernel<false>;

  constexpr int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>();

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }

  const int sm_count = launch_params.props->multiProcessorCount;
  int ctas_per_sm;
  FMHA_CHECK_CUDA(
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size));
  int total_ctas = sm_count * ctas_per_sm;

  const int heads_total = launch_params.params.b * launch_params.params.h;
  if (configure) {
    using Mma_tile_p = fmha::Hmma_tile<typename Kernel_traits::Cta_tile_p>;
    constexpr size_t STEPS = Kernel_traits::Cta_tile_p::N / Kernel_traits::Cta_tile_p::M;
    constexpr size_t MMAS_M = Mma_tile_p::MMAS_M;
    constexpr size_t MMAS_N = Mma_tile_p::MMAS_N;

    size_t heads_per_cta = ((heads_total + total_ctas - 1) / total_ctas);
    size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8;
    launch_params.elts_per_thread = heads_per_cta * elts_per_head;
    return;
  }

  dim3 grid(total_ctas);
  kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(launch_params.params, heads_total);

  FMHA_CHECK_CUDA(cudaPeekAtLastError());
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"
#include "fmha_fprop_kernel_1xN.h"

using Kernel_traits = FMHA_kernel_traits<384, 64, 16, 1, 4, 0x18u>;

template <bool Is_training>
__global__ void fmha_fprop_fp16_384_64_sm80_kernel(Fused_multihead_attention_fprop_params params,
                                                   const int total_heads) {
  fmha::device_1xN<Kernel_traits, Is_training>(params, total_heads);
}

void run_fmha_fp16_384_64_sm80(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                               const bool configure) {
  auto kernel = launch_params.is_training ? &fmha_fprop_fp16_384_64_sm80_kernel<true>
                                          : &fmha_fprop_fp16_384_64_sm80_kernel<false>;

  constexpr int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>();

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }

  const int sm_count = launch_params.props->multiProcessorCount;
  int ctas_per_sm;
  FMHA_CHECK_CUDA(
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size));
  int total_ctas = sm_count * ctas_per_sm;

  const int heads_total = launch_params.params.b * launch_params.params.h;
  if (configure) {
    using Mma_tile_p = fmha::Hmma_tile<typename Kernel_traits::Cta_tile_p>;
    constexpr size_t STEPS = Kernel_traits::Cta_tile_p::N / Kernel_traits::Cta_tile_p::M;
    constexpr size_t MMAS_M = Mma_tile_p::MMAS_M;
    constexpr size_t MMAS_N = Mma_tile_p::MMAS_N;

    size_t heads_per_cta = ((heads_total + total_ctas - 1) / total_ctas);
    size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8;
    launch_params.elts_per_thread = heads_per_cta * elts_per_head;
    return;
  }

  dim3 grid(total_ctas);
  kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(launch_params.params, heads_total);

  FMHA_CHECK_CUDA(cudaPeekAtLastError());
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_fprop_fp16_512_64_kernel.sm80.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"
#include "fmha_fprop_kernel_1xN.h"

using Kernel_traits = FMHA_kernel_traits<512, 64, 16, 1, 8, 0x00u>;

template <bool Is_training>
__global__ void fmha_fprop_fp16_512_64_sm80_kernel(Fused_multihead_attention_fprop_params params,
                                                   const int total_heads) {
  fmha::device_1xN<Kernel_traits, Is_training>(params, total_heads);
}

template <bool Is_training>
__global__ void fmha_fprop_fp16_512_64_sm80_kernel_nl(Fused_multihead_attention_fprop_params params,
                                                      const int num_full_heads, const int num_main_groups,
                                                      const int main_group_size, const int main_steps,
                                                      const int rest_steps) {
  fmha::device_1xN<Kernel_traits, Is_training>(params, num_full_heads, num_main_groups, main_group_size, main_steps,
                                               rest_steps);
}

void run_fmha_fp16_512_64_sm80_(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                                const bool configure) {
  auto kernel = launch_params.is_training ? &fmha_fprop_fp16_512_64_sm80_kernel<true>
                                          : &fmha_fprop_fp16_512_64_sm80_kernel<false>;

  constexpr int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>();

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }

  const int sm_count = launch_params.props->multiProcessorCount;
  int ctas_per_sm;
  FMHA_CHECK_CUDA(
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size));
  int total_ctas = sm_count * ctas_per_sm;

  const int heads_total = launch_params.params.b * launch_params.params.h;
  if (configure) {
    using Mma_tile_p = fmha::Hmma_tile<typename Kernel_traits::Cta_tile_p>;
    constexpr size_t STEPS = Kernel_traits::Cta_tile_p::N / Kernel_traits::Cta_tile_p::M;
    constexpr size_t MMAS_M = Mma_tile_p::MMAS_M;
    constexpr size_t MMAS_N = Mma_tile_p::MMAS_N;

    size_t heads_per_cta = ((heads_total + total_ctas - 1) / total_ctas);
    size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8;
    launch_params.elts_per_thread = heads_per_cta * elts_per_head;
    return;
  }

  dim3 grid(total_ctas);
  kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(launch_params.params, heads_total);

  FMHA_CHECK_CUDA(cudaPeekAtLastError());
}

void run_fmha_fp16_512_64_sm80_nl_(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                                   const bool configure) {
  auto kernel = launch_params.is_training ? &fmha_fprop_fp16_512_64_sm80_kernel_nl<true>
                                          : &fmha_fprop_fp16_512_64_sm80_kernel_nl<false>;

  constexpr int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>();

  if (smem_size >= 48 * 1024) {
    FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
  }

  const int sm_count = launch_params.props->multiProcessorCount;
  int ctas_per_sm;
  FMHA_CHECK_CUDA(
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size));
  int total_ctas = sm_count * ctas_per_sm;

  if (configure) {
    const int heads_total = launch_params.params.b * launch_params.params.h;
    std::tie(launch_params.num_full_heads, launch_params.num_main_groups, launch_params.heads_last_wave,
             launch_params.main_steps, launch_params.rest_steps, launch_params.elts_per_thread) =
        fmha::work_dist<Kernel_traits>(total_ctas, heads_total);
    return;
  }

  dim3 grid(total_ctas);
  kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(
      launch_params.params, launch_params.num_full_heads, launch_params.num_main_groups, launch_params.heads_last_wave,
      launch_params.main_steps, launch_params.rest_steps);

  FMHA_CHECK_CUDA(cudaPeekAtLastError());
}

void run_fmha_fp16_512_64_sm80(Launch_params<Fused_multihead_attention_fprop_params>& launch_params,
                               const bool configure) {
  if (launch_params.is_nl) {
    run_fmha_fp16_512_64_sm80_nl_(launch_params, configure);
  } else {
    run_fmha_fp16_512_64_sm80_(launch_params, configure);
  }
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN.h
================================================
/***************************************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <fmha/gemm.h>
#include <fmha/kernel_traits.h>

#include "fmha_kernel.h"

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Kernel_traits>
struct Gemm_Q_K_base {
  using Smem_tile_o = typename Kernel_traits::Smem_tile_o;
  using Smem_tile_q = typename Kernel_traits::Smem_tile_q;
  using Smem_tile_k = typename Kernel_traits::Smem_tile_k;
  using Fragment_q = typename Smem_tile_q::Fragment;
  using Fragment_k = typename Smem_tile_k::Fragment;

  // The description of the CTA tile for the 1st batched GEMM.
  using Cta_tile_p = typename Kernel_traits::Cta_tile_p;

  // The MMA tile for the 1st GEMM.
  using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;

  static constexpr int SMEM_BYTES_SOFTMAX = Cta_tile_p::M * Cta_tile_p::WARPS_N * sizeof(float) * 2;

  __device__ inline Gemm_Q_K_base(char* smem_ptr_q, char* smem_ptr_k, const int tidx)
      : smem_q(smem_ptr_q, tidx), smem_k(smem_ptr_k, tidx) {}

  __device__ inline void load_q() { smem_q.load(frag_q[0], 0); }

  __device__ inline void reload_q() { smem_q.load(frag_q[0], 0); }

  Fragment_q frag_q[2][Mma_tile_p::MMAS_M];
  Smem_tile_q smem_q;
  Smem_tile_k smem_k;
};

template <typename Kernel_traits, bool K_in_regs>
struct Gemm_Q_K : public Gemm_Q_K_base<Kernel_traits> {
  using Base = Gemm_Q_K_base<Kernel_traits>;
  using Smem_tile_o = typename Base::Smem_tile_o;
  using Smem_tile_q = typename Base::Smem_tile_q;
  using Smem_tile_k = typename Base::Smem_tile_k;
  using Fragment_k = typename Base::Fragment_k;
  using Mma_tile_p = typename Base::Mma_tile_p;

  enum { SHARE_SMEM_FOR_K_AND_V = Kernel_traits::SHARE_SMEM_FOR_K_AND_V };

  enum { SMEM_OFFSET_O = Smem_tile_q::BYTES_PER_TILE };
  enum { SMEM_OFFSET_V = Smem_tile_q::BYTES_PER_TILE + (SHARE_SMEM_FOR_K_AND_V ? 0 : Smem_tile_k::BYTES_PER_TILE) };

  // Q | K / V
  //   | O | SOFTMAX
  static constexpr int SMEM_BYTES =
      Smem_tile_q::BYTES_PER_TILE + std::max((SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Smem_tile_k::BYTES_PER_TILE,
                                             Smem_tile_o::BYTES_PER_TILE + Base::SMEM_BYTES_SOFTMAX);

  __device__ inline Gemm_Q_K(char* smem_, const int tidx) : Base(smem_, smem_ + Smem_tile_q::BYTES_PER_TILE, tidx) {}

  __device__ inline void load_k() {
#pragma unroll
    for (int ki = 0; ki < Mma_tile_p::MMAS_K; ++ki) {
      Base::smem_k.load(frag_k[ki], ki);
    }
  }

  template <typename Acc, int M, int N>
  __device__ inline void operator()(Acc (&acc_p)[M][N]) {
// Do this part of P^T = (Q * K^T)^T.
#pragma unroll
    for (int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      Base::smem_q.load(Base::frag_q[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1)]);
    }
    // Do the final stage of math.
    {
      int ki = Mma_tile_p::MMAS_K;
      fmha::gemm(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1)]);
    }
  }

  __device__ inline void reload_k() {
    // Noop.
  }

  Fragment_k frag_k[Mma_tile_p::MMAS_K][Mma_tile_p::MMAS_N];
};

template <typename Kernel_traits>
struct Gemm_Q_K<Kernel_traits, false> : public Gemm_Q_K_base<Kernel_traits> {
  using Base = Gemm_Q_K_base<Kernel_traits>;
  using Smem_tile_o = typename Base::Smem_tile_o;
  using Smem_tile_q = typename Base::Smem_tile_q;
  using Smem_tile_k = typename Base::Smem_tile_k;
  using Smem_tile_v = typename Kernel_traits::Smem_tile_v;
  using Fragment_k = typename Base::Fragment_k;
  using Mma_tile_p = typename Base::Mma_tile_p;
  Fragment_k frag_k[2][Mma_tile_p::MMAS_N];

  enum { SHARE_SMEM_FOR_K_AND_V = Kernel_traits::SHARE_SMEM_FOR_K_AND_V };

  enum { SMEM_OFFSET_V = Smem_tile_q::BYTES_PER_TILE + (SHARE_SMEM_FOR_K_AND_V ? 0 : Smem_tile_k::BYTES_PER_TILE) };
  static_assert(Smem_tile_v::BYTES_PER_TILE == (int)Smem_tile_k::BYTES_PER_TILE);
  enum { SMEM_OFFSET_O = SMEM_OFFSET_V + Smem_tile_v::BYTES_PER_TILE };

  // Q | K/V + O + SOFTMAX
  static constexpr int SMEM_BYTES = Smem_tile_q::BYTES_PER_TILE +
                                    (SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Smem_tile_k::BYTES_PER_TILE +
                                    Smem_tile_o::BYTES_PER_TILE + Base::SMEM_BYTES_SOFTMAX;

  __device__ inline Gemm_Q_K(char* smem_, const int tidx) : Base(smem_, smem_ + Smem_tile_q::BYTES_PER_TILE, tidx) {}

  __device__ inline void load_k() { Base::smem_k.load(frag_k[0], 0); }

  template <typename Acc, int M, int N>
  __device__ inline void operator()(Acc (&acc_p)[M][N]) {
// Do this part of P^T = (Q * K^T)^T.
#pragma unroll
    for (int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki) {
      // Trigger the load from shared memory for the next series of Q values.
      Base::smem_q.load(Base::frag_q[ki & 1], ki);
      Base::smem_k.load(frag_k[ki & 1], ki);
      // Do the math for the values already in registers.
      fmha::gemm(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
    }
    // Do the final stage of math.
    {
      int ki = Mma_tile_p::MMAS_K;
      fmha::gemm(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
    }
  }

  __device__ inline void reload_k() { Base::smem_k.load(frag_k[0], 0); }
};

template <typename Kernel_traits>
constexpr size_t get_dynamic_smem_size() {
  return Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS>::SMEM_BYTES;
}

template <typename Kernel_traits, bool Is_training, typename Params, typename Prng>
inline __device__ void device_1xN_(const Params& params, const int bidb, const int bidh, const int begin,
                                   const int steps, Prng& ph) {
  // The description of the CTA tile for the 1st batched GEMM.
  using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
  // The description of the CTA tile for the 2nd batched GEMM.
  using Cta_tile_o = typename Kernel_traits::Cta_tile_o;

  // The MMA tile for the 1st GEMM.
  using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
  // The MMA tile for the 2nd GEMM.
  using Mma_tile_o = fmha::Hmma_tile<Cta_tile_o>;

  // The global memory tile to load Q.
  using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;

  // The global memory tile to load K.
  using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;

  // The global memory tile to load V.
  using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v;
  // The shared memory tile to swizzle V.
  using Smem_tile_v = typename Kernel_traits::Smem_tile_v;

  // The global memory tile to store O.
  using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o;
  // The shared memory tile to swizzle O.
  using Smem_tile_o = typename Kernel_traits::Smem_tile_o;

  using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;

  using Gemm1 = Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS>;

  using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;

  // The number of threads per row.
  enum { THREADS_PER_ROW = 32 };

  enum { BITS_PER_ELT_S = sizeof(fmha::A_type) * 8 };

  // Shared memory.
  extern __shared__ char smem_[];

  // The thread index.
  const int tidx = threadIdx.x;

  const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
  if (binfo.stop_early()) return;

  Gemm1 gemm_q_k(smem_, tidx);
  // Allocate the global memory tile loader for Q.
  Gmem_tile_q gmem_q(params, 0, binfo, tidx);
  // Allocate the global memory tile loader for O.
  Gmem_tile_o gmem_o(params, binfo, tidx);
  // Allocate the global memory tile loader for S.
  Gmem_tile_s gmem_s(params, binfo, tidx);
  // Wind gmem tiles to the correct position.
  for (int it = 0; it < begin; it++) {
    gmem_q.move();
    gmem_s.move();
    gmem_o.move();
  }

  fmha::Mask<Cta_tile_p> mask(params, binfo, tidx);

  // Allocate the global memory tile loader for K.
  Gmem_tile_k gmem_k(params, 1, binfo, tidx);
  // Allocate the global memory tile loader for V.
  Gmem_tile_v gmem_v(params, 2, binfo, tidx);
  // The base pointer of smem_v;
  char* smem_v_ = &smem_[Gemm1::SMEM_OFFSET_V];

  // Allocate the shared memory tile loader for V. We use the same as K so be careful!!!
  Smem_tile_v smem_v(smem_v_, tidx);

  // Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
  Smem_tile_o smem_o(&smem_[Gemm1::SMEM_OFFSET_O], tidx);

  // Trigger the loads for K.
  gmem_k.load(gemm_q_k.smem_k);
  // Trigger the loads for Q.
  gmem_q.load(gemm_q_k.smem_q);
  // Trigger the loads for V.
  gmem_v.load(smem_v);

  const uint32_t scale_bmm1 = reinterpret_cast<const uint32_t&>(params.scale_bmm1);
#pragma unroll
  for (int it = 0; it < Gmem_tile_k::LDGS; it++) {
    gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]);
  }

  // Commit the data for Q and V to shared memory.
  gmem_q.commit(gemm_q_k.smem_q);
  gmem_v.commit(smem_v);

  // Commit the data for K to shared memory.
  if (!Kernel_traits::SHARE_SMEM_FOR_K_AND_V) {
    gmem_k.commit(gemm_q_k.smem_k);
  }

  __syncthreads();

  // Load the fragments for Q.
  gemm_q_k.load_q();

  // Load the fragments for V. We keep the data in registers during the entire kernel.
  typename Smem_tile_v::Fragment frag_v[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_N];
#pragma unroll
  for (int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki) {
    smem_v.load(frag_v[ki], ki);
  }

  // Commit the data for V to shared memory if it has not been done already.
  if (Kernel_traits::SHARE_SMEM_FOR_K_AND_V) {
    // Make sure we are done loading the fragments for K.
    __syncthreads();

    // Commit the data to shared memory for V.
    gmem_k.commit(gemm_q_k.smem_k);

    // Make sure the data is in shared memory.
    __syncthreads();
  }

  // Load the fragments for K.
  gemm_q_k.load_k();
  uint32_t p_scaled = (uint32_t)256.0 * params.p_dropout;

  // Create the object to do the softmax.
  Softmax softmax(params, &smem_[Gemm1::SMEM_OFFSET_O + Smem_tile_o::BYTES_PER_TILE], bidb, tidx);

  // Load over the entire sequence length.
  for (int l = 0; l < steps; l++) {
    if (begin + l * Cta_tile_p::M >= binfo.actual_seqlen) break;

    // Declare the accumulators for the 1st gemm.
    fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
    fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);

    // Do this part of P^T = (Q * K^T)^T.
    gemm_q_k(acc_p);

    // Trigger the load for the next Q values.
    if (l < steps - 1) {
      gemm_q_k.smem_q.move_to_next_write_buffer();
      gmem_q.move();
      gmem_q.load(gemm_q_k.smem_q);
    }

    // Load the mask for that iteration.
    mask.load(begin + l);

    // Convert from the accumulator type to FP32 for Softmax.
    softmax.unpack_noscale(acc_p);

    // Apply the mask.
    softmax.apply_mask(mask);

    if (Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l == 0) {
      // if we share K and V, it could be that V was not fully read yet but we write into smem for reduction
      __syncthreads();
    }
    // Compute the max.
    float p_max[Mma_tile_p::MMAS_M * 2];
    // softmax.template reduce<fmha::Max_>(p_max);
    softmax.reduce_max(p_max);

    // Compute the exponential value.
    softmax.apply_exp(p_max);

    // Compute the sum.
    float p_sum[Mma_tile_p::MMAS_M * 2];
    softmax.reduce_sum(p_sum);

    // Finalize softmax on the accumulators of P^T.
    softmax.scale(p_sum);

    using Frag_p = fmha::Fragment_a<fmha::Row>;
    Frag_p frag_p[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M];
    if (Is_training) {
      auto encode_dropout = [](bool keep, float val) { return keep ? val : -val; };
#pragma unroll
      for (int mi = 0; mi < Mma_tile_p::MMAS_M; mi++) {
#pragma unroll
        for (int ii = 0; ii < 2; ii++) {
#pragma unroll
          for (int ni = 0; ni < Mma_tile_p::MMAS_N / 4; ni++) {
            uint8_t* rand_arr = (uint8_t*)&ph();
            // We encode the dropout pattern in the sign bit of the non-negative softmax to distinguish from
            // pre-existing zeros
            for (int ind = 0; ind < 16; ind++) {
              softmax.elt_[2 * mi + ii][16 * ni + ind] =
                  encode_dropout(rand_arr[ind] <= p_scaled, softmax.elt_[2 * mi + ii][16 * ni + ind]);
            }
          }
        }
      }
      softmax.pack(frag_p);
      gmem_s.store(frag_p, mask);
      gmem_s.move();
    } else {
      softmax.pack(frag_p);
    }

    // Commit the values for Q into shared memory.
    if (l < steps - 1) {
      gmem_q.commit(gemm_q_k.smem_q);
    }

    if (Is_training) {
#pragma unroll
      for (int ki = 0; ki < Mma_tile_o::MMAS_K; ki++) {
#pragma unroll
        for (int mi = 0; mi < Mma_tile_o::MMAS_M; mi++) {
#pragma unroll
          for (int ii = 0; ii < Frag_p::NUM_REGS; ii++) {
            //"Apply" the dropout.
            frag_p[ki][mi].reg(ii) = fmha::hmul2(frag_p[ki][mi].reg(ii), params.scale_dropout);
            frag_p[ki][mi].reg(ii) = fmha::hrelu2(frag_p[ki][mi].reg(ii));
          }
        }
      }
    }

    // Declare the accumulators for the 1st gemm.
    fmha::Fragment_accumulator acc_o[Mma_tile_o::MMAS_M][Mma_tile_o::MMAS_N];
    fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_o::WARPS_K>::apply(acc_o);

// Do this part of O = P^T * V^T.
#pragma unroll
    for (int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki) {
      fmha::gemm(acc_o, frag_p[ki], frag_v[ki]);
    }

// Loop over MMAS_M.
#pragma unroll
    for (int ii = 0; ii < Gmem_tile_o::LOOPS; ++ii) {
      // Swizzle the elements and do the final reduction.
      smem_o.store(acc_o, ii);

      // Make sure the data is in shared memory.
      __syncthreads();

      // Load from shared memory.
      uint4 out[Gmem_tile_o::STGS_PER_LOOP];
      smem_o.load(out);

      // Make sure the data was read from shared memory.
      if (ii < Gmem_tile_o::LOOPS - 1) {
        __syncthreads();
      }

      // Output the values.
      gmem_o.store(out, ii);
    }

    // Move to the next part of the output.
    gmem_o.move();
    gemm_q_k.reload_k();

    // Commit the values for Q into shared memory.
    if (l < steps - 1) {
      gemm_q_k.reload_q();
    }

  }  // Outer loop over the sequence length.
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Kernel_traits, bool Is_training, typename Params>
inline __device__ void device_1xN(const Params& params, const int num_full_heads, const int num_main_groups,
                                  const int main_group_size, const int main_steps, const int rest_steps) {
  constexpr int STEPS = Kernel_traits::Cta_tile_p::N / Kernel_traits::Cta_tile_p::M;
  const int tidx_global = blockIdx.x * gridDim.x + threadIdx.x;
  auto seeds = at::cuda::philox::unpack(params.philox_args);
  Philox ph(std::get<0>(seeds), tidx_global, std::get<1>(seeds));
  for (int it = 0; it < num_full_heads; it++) {
    const int bidx = it * gridDim.x + blockIdx.x;
    const int bidh = bidx % params.h;
    const int bidb = bidx / params.h;
    fmha::device_1xN_<Kernel_traits, Is_training>(params, bidb, bidh, 0, STEPS, ph);
    __syncthreads();
  }
  if (main_group_size == 0) return;
  const int head_offset = num_full_heads * gridDim.x;

  if (blockIdx.x < main_group_size * num_main_groups) {
    // process within heads
    const int group = blockIdx.x % num_main_groups;
    const int bidx = blockIdx.x / num_main_groups;
    const int bidh = (head_offset + bidx) % params.h;
    const int bidb = (head_offset + bidx) / params.h;
    const int offset = group * main_steps;
    fmha::device_1xN_<Kernel_traits, Is_training>(params, bidb, bidh, offset, main_steps, ph);
  } else {
    if (rest_steps == 0) return;
    // process across heads
    const int bidx = blockIdx.x - main_group_size * num_main_groups;
    const int offset = num_main_groups * main_steps;
    const int total_heads = params.b * params.h;
    const int rest_ctas = gridDim.x - main_group_size * num_main_groups;
    for (int it = head_offset + bidx; it < total_heads; it += rest_ctas) {
      const int bidh = it % params.h;
      const int bidb = it / params.h;
      fmha::device_1xN_<Kernel_traits, Is_training>(params, bidb, bidh, offset, rest_steps, ph);
      __syncthreads();
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Kernel_traits, bool Is_training, typename Params>
inline __device__ void device_1xN(const Params& params, const int total_heads) {
  const int tidx_global = blockIdx.x * gridDim.x + threadIdx.x;
  auto seeds = at::cuda::philox::unpack(params.philox_args);
  Philox ph(std::get<0>(seeds), tidx_global, std::get<1>(seeds));
  constexpr int STEPS = Kernel_traits::Cta_tile_p::N / Kernel_traits::Cta_tile_p::M;

  for (int bidx = blockIdx.x; bidx < total_heads; bidx += gridDim.x) {
    const int bidh = bidx % params.h;
    const int bidb = bidx / params.h;
    fmha::device_1xN_<Kernel_traits, Is_training>(params, bidb, bidh, 0, STEPS, ph);
    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_kernel.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <fmha.h>
#include <fmha/gmem_tile.h>
#include <fmha/mask.h>
#include <fmha/smem_tile.h>
#include <fmha/softmax.h>
#include <fmha/utils.h>

#include <multihead_attn/philox.cuh>

namespace fmha {

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int THREADS_PER_CTA>
struct BlockInfoPadded {
  template <typename Params>
  __device__ BlockInfoPadded(const Params& params, const int bidb, const int bidh, const int tidx)
      : bidb(bidb), bidh(bidh), h(params.h) {
    // The block index.
    sum_s = params.cu_seqlens[bidb];
    actual_seqlen = params.cu_seqlens[bidb + 1] - sum_s;
    bidx = sum_s * params.h + bidh;

    tidx_global = (bidb * params.h + bidh) * THREADS_PER_CTA + tidx;
  }

  __device__ bool stop_early() const { return actual_seqlen == 0; }

  int actual_seqlen;
  int bidx;
  int sum_s;
  int bidh;
  int bidb;
  int tidx_global;
  int h;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int CHUNKS, typename Cta_tile>
struct Noloop_traits {
  // Interpretation of Cta_tile dims, i.e. Cta_tile_p:
  enum { STEP = Cta_tile::M };
  enum { SEQLEN = Cta_tile::N };

  template <typename Block_info>
  inline __device__ Noloop_traits(const int bidc, const Block_info& binfo) : bidc_(bidc) {
    const int seqlen = binfo.actual_seqlen;
    const int steps = (seqlen + STEP - 1) / STEP;
    const int steps_per_chunk = (steps + CHUNKS - 1) / CHUNKS;

    const int step_begin = bidc_ * steps_per_chunk;
    const int step_end = min(steps, (bidc_ + 1) * steps_per_chunk);
    const int actual_steps = max(0, step_end - step_begin);
    loop_offset_ = step_begin;
    num_steps_ = actual_steps;
  }

  template <typename... Tiles>
  inline __device__ void move_all(Tiles&... tiles) const {
    using expand_type = int[];
    for (int s = 0; s < loop_offset_; s++) {
      expand_type{(tiles.move(), 0)...};
    }
  }

  inline __device__ int get_idx_dk() const {
    // return bidc_;
    return bidc_ * 2 + 0;
  }

  inline __device__ int get_idx_dv() const {
    // return CHUNKS + bidc_;
    return bidc_ * 2 + 1;
  }

  inline __device__ int offset_loop_count(const int l) {
    // convert loop counter to position in the outer sequence
    return (loop_offset_ + l) * STEP;
  }

  const uint32_t bidc_;
  int loop_offset_;
  int num_steps_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Kernel_traits>
std::tuple<int, int, int, int, int, int> work_dist(const int total_ctas, const int heads_total) {
  constexpr int STEPS_PER_HEAD = Kernel_traits::Cta_tile_p::N / Kernel_traits::Cta_tile_p::M;

  const int num_full_heads = heads_total / total_ctas;
  const int heads_last_wave = heads_total % total_ctas;

  int num_main_groups = 0;
  int main_steps = 0;
  int rest_steps = 0;
  if (heads_last_wave > 0) {
    // Number of CTA groups that process within heads.
    num_main_groups = total_ctas / heads_last_wave;
    // Remaining CTAs that process between heads.
    const int rest_ctas = total_ctas - (heads_last_wave * num_main_groups);
    if (rest_ctas == 0) {
      // We have exactly "num_main_groups" CTAs to process each of the remaining heads.
      main_steps = (STEPS_PER_HEAD + num_main_groups - 1) / num_main_groups;
      num_main_groups = STEPS_PER_HEAD / main_steps;  // Here: main_step > 0
      rest_steps = STEPS_PER_HEAD % main_steps;

    } else {
      // Ideal number of steps if we could load-balance as evenly as possible.
      const int steps_ideal = (heads_last_wave * STEPS_PER_HEAD + total_ctas - 1) / total_ctas;
      // Iterations that a "rest" CTA has to do at most.
      const int max_rest_iters = (heads_last_wave + rest_ctas - 1) / rest_ctas;
      // Find the first step distribution, s.t. the maximum work of the "rest" CTAs is less than the work of the main
      // CTAs.
      main_steps = steps_ideal;
      rest_steps = STEPS_PER_HEAD - main_steps * num_main_groups;
      for (; main_steps * num_main_groups < STEPS_PER_HEAD; main_steps++) {
        rest_steps = STEPS_PER_HEAD - main_steps * num_main_groups;
        const int max_rest_total_steps = rest_steps * max_rest_iters;
        if (max_rest_total_steps < main_steps) break;
      }
      rest_steps = STEPS_PER_HEAD - main_steps * num_main_groups;
    }
  }

  using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
  using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;

  const int max_steps = STEPS_PER_HEAD * num_full_heads + std::max(main_steps, rest_steps);
  const int elts_per_thread_per_step = Mma_tile_p::MMAS_M * Mma_tile_p::MMAS_N * 8;
  const int elts_per_thread = max_steps * elts_per_thread_per_step;

  return {num_full_heads, num_main_groups, heads_last_wave, main_steps, rest_steps, elts_per_thread};
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace fmha


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_noloop_reduce.cu
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include "fmha.h"

inline __device__ float4 ldg128(const void* ptr) { return *static_cast<const float4*>(ptr); }

inline __device__ void stg128(void* ptr, const float4& data) { *static_cast<float4*>(ptr) = data; }

template <typename T, int THREADS, int HIDDEN_SIZE, int CHUNKS>
__global__ __launch_bounds__(THREADS) void fmha_noloop_reduce_kernel(void* __restrict__ out,
                                                                     const void* __restrict__ in,
                                                                     const int* __restrict__ cu_seqlens,
                                                                     const int batch_size) {
  enum { BYTES_PER_LDG = 16 };
  enum { NUM_ELTS = BYTES_PER_LDG / sizeof(T) };

  // One CTA hidden vector for K and V
  enum { BYTES_PER_ROW = HIDDEN_SIZE * sizeof(T) * 2 };
  // The stride in bytes in dQKV
  enum { OUT_STRIDE_BYTES = 3 * HIDDEN_SIZE * sizeof(T) };
  // The offset in bytes in dQKV to the dKV part for non-interleaved heads
  enum { OUT_OFFSET_KV_BYTES = HIDDEN_SIZE * sizeof(T) };

  static_assert(BYTES_PER_ROW == HIDDEN_SIZE * 2 * sizeof(T));

  // Size in bytes of the input tile
  enum { BYTES_PER_TILE = CHUNKS * BYTES_PER_ROW };

  enum { BYTES_PER_CTA = THREADS * BYTES_PER_LDG };

  enum { LDGS = BYTES_PER_ROW / BYTES_PER_CTA };
  static_assert(BYTES_PER_CTA * LDGS == BYTES_PER_ROW);

  union Vec_t {
    float4 raw;
    T elt[NUM_ELTS];
  };

  // ZERO-OUT invalid positions in dQKV
  const int total = cu_seqlens[batch_size];
  if (blockIdx.x >= total) {
    enum { BYTES_PER_QKV_ROW = 3 * HIDDEN_SIZE * sizeof(T) };
    enum { STGS = BYTES_PER_QKV_ROW / BYTES_PER_LDG };

    const float4 zeros = make_float4(0.f, 0.f, 0.f, 0.f);

    char* base_ptr = static_cast<char*>(out) + blockIdx.x * OUT_STRIDE_BYTES;

    for (int tidx = threadIdx.x; tidx < STGS; tidx += THREADS) {
      stg128(base_ptr + tidx * BYTES_PER_LDG, zeros);
    }

    return;
  }

  // SETUP
  const int offset_in = blockIdx.x * BYTES_PER_TILE + threadIdx.x * BYTES_PER_LDG;
  const char* ptr_in = static_cast<const char*>(in) + offset_in;

  const int offset_out = blockIdx.x * OUT_STRIDE_BYTES + threadIdx.x * BYTES_PER_LDG;
  char* ptr_out = static_cast<char*>(out) + OUT_OFFSET_KV_BYTES + offset_out;

  // LOAD

  Vec_t local_in[CHUNKS][LDGS];

#pragma unroll
  for (int c = 0; c < CHUNKS; c++) {
#pragma unroll
    for (int l = 0; l < LDGS; l++) {
      int offset = c * BYTES_PER_ROW + l * BYTES_PER_CTA;
      local_in[c][l].raw = ldg128(ptr_in + offset);
    }
  }

  // UNPACK
  float acc[LDGS][NUM_ELTS];

#pragma unroll
  for (int l = 0; l < LDGS; l++) {
#pragma unroll
    for (int e = 0; e < NUM_ELTS; e++) {
      acc[l][e] = float(local_in[0][l].elt[e]);
    }
  }

// COMPUTE
#pragma unroll
  for (int c = 1; c < CHUNKS; c++) {
#pragma unroll
    for (int l = 0; l < LDGS; l++) {
#pragma unroll
      for (int e = 0; e < NUM_ELTS; e++) {
        acc[l][e] += float(local_in[c][l].elt[e]);
      }
    }
  }

  // PACK
  Vec_t local_out[LDGS];

#pragma unroll
  for (int l = 0; l < LDGS; l++) {
#pragma unroll
    for (int e = 0; e < NUM_ELTS; e++) {
      local_out[l].elt[e] = T(acc[l][e]);
    }
  }

// STORE
#pragma unroll
  for (int l = 0; l < LDGS; l++) {
    const int offset = l * BYTES_PER_CTA;
    stg128(ptr_out + offset, local_out[l].raw);
  }
}

void fmha_run_noloop_reduce(void* out, const void* in, const int* cu_seqlens, const int hidden_size,
                            const int batch_size, const int total, const int num_chunks, cudaStream_t stream) {
  const int blocks = total;

  if (hidden_size == 1024) {
    constexpr int HIDDEN_SIZE = 1024;
    constexpr int THREADS = 256;

    if (num_chunks == 2) {
      fmha_noloop_reduce_kernel<half, THREADS, HIDDEN_SIZE, 2>
          <<<blocks, THREADS, 0, stream>>>(out, in, cu_seqlens, batch_size);
    } else if (num_chunks == 3) {
      fmha_noloop_reduce_kernel<half, THREADS, HIDDEN_SIZE, 3>
          <<<blocks, THREADS, 0, stream>>>(out, in, cu_seqlens, batch_size);
    } else {
      assert(false && "Unsupported num_chunks");
    }

  } else {
    assert(false && "Unsupported hidden_size");
  }

  FMHA_CHECK_CUDA(cudaPeekAtLastError());
}


================================================
FILE: apex/contrib/csrc/fmha/src/fmha_utils.h
================================================
/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <assert.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>

////////////////////////////////////////////////////////////////////////////////////////////////////

#define FMHA_CHECK_CUDA(call)                                                                       \
  do {                                                                                              \
    cudaError_t status_ = call;                                                                     \
    if (status_ != cudaSuccess) {                                                                   \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
      exit(1);                                                                                      \
    }                                                                                               \
  } while (0)

////////////////////////////////////////////////////////////////////////////////////////////////////

enum Data_type { DATA_TYPE_FP16, DATA_TYPE_FP32, DATA_TYPE_INT32, DATA_TYPE_INT8 };

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline void set_alpha(uint32_t& alpha, float norm, Data_type dtype) {
  if (dtype == DATA_TYPE_FP16) {
    half x = __float2half_rn(norm);
    uint16_t h = reinterpret_cast<const uint16_t&>(x);
    ushort2 h2 = {h, h};
    alpha = reinterpret_cast<const uint32_t&>(h2);
  } else if (dtype == DATA_TYPE_FP32) {
    alpha = reinterpret_cast<const uint32_t&>(norm);
  } else if (dtype == DATA_TYPE_INT32) {
    int32_t inorm = static_cast<int32_t>(norm);
    alpha = reinterpret_cast<const uint32_t&>(inorm);
  } else {
    assert(false);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline size_t get_size_in_bytes(size_t n, Data_type dtype) {
  switch (dtype) {
    case DATA_TYPE_FP32:
      return n * 4;
    case DATA_TYPE_FP16:
      return n * 2;
    case DATA_TYPE_INT32:
      return n * 4;
    case DATA_TYPE_INT8:
      return n;
    default:
      assert(false);
      return 0;
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/focal_loss/focal_loss_cuda.cpp
================================================
#include <torch/torch.h>

#include <cstdint>
#include <vector>

// CUDA forward declarations

std::vector<at::Tensor> focal_loss_forward_cuda(const at::Tensor& cls_output, const at::Tensor& cls_targets_at_level,
                                                const at::Tensor& num_positives_sum, const int64_t num_real_classes,
                                                const float alpha, const float gamma, const float smoothing_factor);

at::Tensor focal_loss_backward_cuda(const at::Tensor& grad_output, const at::Tensor& partial_grad,
                                    const at::Tensor& num_positives_sum);

// C++ interface

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<at::Tensor> focal_loss_forward(const at::Tensor& cls_output, const at::Tensor& cls_targets_at_level,
                                           const at::Tensor& num_positives_sum, const int64_t num_real_classes,
                                           const float alpha, const float gamma, const float smoothing_factor) {
  CHECK_INPUT(cls_output);
  CHECK_INPUT(cls_targets_at_level);
  CHECK_INPUT(num_positives_sum);

  return focal_loss_forward_cuda(cls_output, cls_targets_at_level, num_positives_sum, num_real_classes, alpha, gamma,
                                 smoothing_factor);
}

at::Tensor focal_loss_backward(const at::Tensor& grad_output, const at::Tensor& partial_grad,
                               const at::Tensor& num_positives_sum) {
  CHECK_INPUT(grad_output);
  CHECK_INPUT(partial_grad);

  return focal_loss_backward_cuda(grad_output, partial_grad, num_positives_sum);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &focal_loss_forward, "Focal loss calculation forward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward", &focal_loss_backward, "Focal loss calculation backward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/focal_loss/focal_loss_cuda_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>

// Use 128-bit vectorization
typedef uint4 vector_t;

#define ASSERT_ALIGNED(DTYPE, PTR) \
  TORCH_INTERNAL_ASSERT(is_aligned<DTYPE>(PTR), "Tensor " #PTR " is not " #DTYPE " aligned")

template <class T>
bool is_aligned(const void* ptr) noexcept {
  auto iptr = reinterpret_cast<std::uintptr_t>(ptr);
  return !(iptr % alignof(T));
}

template <bool SMOOTHING, int ILP, typename scalar_t, typename labelscalar_t, typename accscalar_t,
          typename outscalar_t>
__global__ void focal_loss_forward_cuda_kernel(outscalar_t* loss, scalar_t* partial_grad,
                                               const scalar_t* __restrict__ cls_output,
                                               const labelscalar_t* __restrict__ cls_targets_at_level,
                                               const float* __restrict__ num_positives_sum, const int64_t num_examples,
                                               const int64_t num_classes, const int64_t num_real_classes,
                                               const float alpha, const float gamma, const float smoothing_factor) {
  extern __shared__ unsigned char shm[];
  accscalar_t* loss_shm = reinterpret_cast<accscalar_t*>(shm);
  loss_shm[threadIdx.x] = 0;
  accscalar_t loss_acc = 0;

  accscalar_t one = accscalar_t(1.0);
  accscalar_t K = accscalar_t(2.0);
  accscalar_t normalizer = one / static_cast<accscalar_t>(num_positives_sum[0]);
  accscalar_t nn_norm, np_norm, pn_norm, pp_norm;

  // *_norm is used for label smoothing only
  if (SMOOTHING) {
    nn_norm = one - smoothing_factor / K;
    np_norm = smoothing_factor / K;
    pn_norm = smoothing_factor - smoothing_factor / K;
    pp_norm = one - smoothing_factor + smoothing_factor / K;
  }

  vector_t p_vec, grad_vec;

  // Accumulate loss on each thread
  for (int64_t i = (blockIdx.x * blockDim.x + threadIdx.x) * ILP; i < num_examples * num_classes;
       i += gridDim.x * blockDim.x * ILP) {
    int64_t idy = i / num_classes;
    labelscalar_t y = cls_targets_at_level[idy];
    int64_t base_yid = i % num_classes;

    int64_t pos_idx = idy * num_classes + y;
    p_vec = *(vector_t*)&cls_output[i];  // Vectorized load

    // Skip ignored matches
    if (y == -2) {
#pragma unroll
      for (int j = 0; j < ILP; j++) {
        *((scalar_t*)(&grad_vec) + j) = 0;
      }
      *(vector_t*)&partial_grad[i] = grad_vec;
      continue;
    }

#pragma unroll
    for (int j = 0; j < ILP; j++) {
      // Skip the pad classes
      if (base_yid + j >= num_real_classes) {
        *((scalar_t*)(&grad_vec) + j) = 0;
        continue;
      }

      accscalar_t p = static_cast<accscalar_t>(*((scalar_t*)(&p_vec) + j));
      accscalar_t exp_np = ::exp(-p);
      accscalar_t exp_pp = ::exp(p);
      accscalar_t sigma = one / (one + exp_np);
      accscalar_t logee = (p >= 0) ? exp_np : exp_pp;
      accscalar_t addee = (p >= 0) ? 0 : -p;
      accscalar_t off_a = addee + ::log(one + logee);

      // Negative matches
      accscalar_t base = SMOOTHING ? nn_norm * p : p;
      accscalar_t off_b = (SMOOTHING ? np_norm : 0) - sigma;
      accscalar_t coeff_f1 = one - alpha;
      accscalar_t coeff_f2 = sigma;
      accscalar_t coeff_b1 = gamma;
      accscalar_t coeff_b2 = one - sigma;

      // Positive matches
      if (y >= 0 && (i + j == pos_idx)) {
        base = SMOOTHING ? pn_norm * p : 0;
        off_b = (SMOOTHING ? pp_norm : one) - sigma;
        coeff_f1 = alpha;
        coeff_f2 = one - sigma;
        coeff_b1 = -gamma;
        coeff_b2 = sigma;
      }

      accscalar_t coeff_f = coeff_f1 * ::pow(coeff_f2, gamma);
      accscalar_t coeff_b = coeff_b1 * coeff_b2;

      accscalar_t loss_t = coeff_f * (base + off_a);
      accscalar_t grad = coeff_f * (coeff_b * (base + off_a) - off_b);

      // Delay the normalize of partial gradient by num_positives_sum to back
      // propagation because scalar_t reduces precision. Focal loss is very
      // sensitive to the small gradient. No worry on overflow here since
      // gradient has relative smaller range than input.
      loss_acc += loss_t;
      *((scalar_t*)(&grad_vec) + j) = static_cast<scalar_t>(grad);
    }

    // This may generate two vectorized stores instead of one
    *(vector_t*)&partial_grad[i] = grad_vec;
  }
  loss_shm[threadIdx.x] = loss_acc;

  // Intra-CTA reduction
  __syncthreads();
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) {
      loss_shm[threadIdx.x] += loss_shm[threadIdx.x + s];
    }
    __syncthreads();
  }

  // Inter-CTA reduction
  if (threadIdx.x == 0) {
    loss_acc = loss_shm[0] * normalizer;
    atomicAdd(loss, loss_acc);
  }
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
__global__ void focal_loss_backward_cuda_kernel(scalar_t* partial_grad, const outscalar_t* __restrict__ grad_output,
                                                const float* __restrict__ num_positives_sum, const uint64_t numel) {
  int64_t idx = (blockIdx.x * blockDim.x + threadIdx.x) * ILP;

  accscalar_t normalizer = static_cast<accscalar_t>(grad_output[0]) / static_cast<accscalar_t>(num_positives_sum[0]);

  // The input is enforced to pad to use vector load, thus there's no need to
  // check whether the last element of ILP can out of bound.
  if (idx >= numel) return;

  vector_t grad_vec;
  grad_vec = *(vector_t*)&partial_grad[idx];
#pragma unroll(ILP)
  for (int i = 0; i < ILP; i++) {
    auto grad = static_cast<accscalar_t>(*((scalar_t*)(&grad_vec) + i));
    grad *= normalizer;
    *((scalar_t*)(&grad_vec) + i) = static_cast<scalar_t>(grad);
  }
  *(vector_t*)&partial_grad[idx] = grad_vec;
}

std::vector<at::Tensor> focal_loss_forward_cuda(const at::Tensor& cls_output, const at::Tensor& cls_targets_at_level,
                                                const at::Tensor& num_positives_sum, const int64_t num_real_classes,
                                                const float alpha, const float gamma, const float smoothing_factor) {
  // Checks required for correctness
  TORCH_INTERNAL_ASSERT(cls_output.size(-1) >= num_real_classes, "Incorrect number of real classes.");
  TORCH_INTERNAL_ASSERT(cls_targets_at_level.scalar_type() == at::kLong, "Invalid label type.");
  TORCH_INTERNAL_ASSERT((num_positives_sum.numel() == 1) && (num_positives_sum.scalar_type() == at::kFloat),
                        "Expect num_positives_sum to be a float32 tensor with only one element.");
  TORCH_INTERNAL_ASSERT(cls_output.dim() == cls_targets_at_level.dim() + 1,
                        "Mis-matched dimensions between class output and label.");
  for (int64_t i = 0; i < cls_targets_at_level.dim(); i++)
    TORCH_INTERNAL_ASSERT(cls_output.size(i) == cls_targets_at_level.size(i),
                          "Mis-matched shape between class output and label.");

  // Checks required for better performance
  const int ILP = sizeof(vector_t) / cls_output.element_size();
  ASSERT_ALIGNED(vector_t, cls_output.data_ptr());
  TORCH_INTERNAL_ASSERT(cls_output.size(-1) % ILP == 0,
                        "Pad number of classes first to take advantage of vectorized load.");
  TORCH_INTERNAL_ASSERT(num_real_classes >= ILP, "Too few classes.");

  int64_t num_classes = cls_output.size(-1);
  int64_t num_examples = cls_output.numel() / num_classes;
  at::Tensor loss = at::zeros({}, cls_output.options().dtype(at::kFloat));

  // Compute the incompelete gradient during fprop since most of the heavy
  // functions of bprop are the same as fprop, thus trade memory for compute
  // helps with focal loss.
  at::Tensor partial_grad = at::empty_like(cls_output);

  // Set the number of CTAs per SM according to the compute capability.
  // Each CTA loops on input with stride till the last item.
  cudaDeviceProp props;
  cudaGetDeviceProperties(&props, at::cuda::current_device());
  int cta_per_sm = 2;
  if (props.major >= 10) {
    cta_per_sm = 8;
  }
  dim3 block(512);
  dim3 grid(cta_per_sm * props.multiProcessorCount);

  // Specialize on label smoothing or not to reduce redundant operations
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  if (smoothing_factor == 0.0f) {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(cls_output.scalar_type(), "focal_loss_fprop", [&] {
      using accscalar_t = at::acc_type<scalar_t, true>;
      using labelscalar_t = int64_t;
      using outscalar_t = float;
      const int ILP = sizeof(vector_t) / sizeof(scalar_t);
      focal_loss_forward_cuda_kernel<false, ILP, scalar_t, labelscalar_t, accscalar_t, outscalar_t>
          <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
              loss.data_ptr<outscalar_t>(), partial_grad.data_ptr<scalar_t>(), cls_output.data_ptr<scalar_t>(),
              cls_targets_at_level.data_ptr<labelscalar_t>(), num_positives_sum.data_ptr<float>(), num_examples,
              num_classes, num_real_classes, alpha, gamma, smoothing_factor);
    });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(cls_output.scalar_type(), "focal_loss_fprop", [&] {
      using accscalar_t = at::acc_type<scalar_t, true>;
      using labelscalar_t = int64_t;
      using outscalar_t = float;
      const int ILP = sizeof(vector_t) / sizeof(scalar_t);
      focal_loss_forward_cuda_kernel<true, ILP, scalar_t, labelscalar_t, accscalar_t, outscalar_t>
          <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
              loss.data_ptr<outscalar_t>(), partial_grad.data_ptr<scalar_t>(), cls_output.data_ptr<scalar_t>(),
              cls_targets_at_level.data_ptr<labelscalar_t>(), num_positives_sum.data_ptr<float>(), num_examples,
              num_classes, num_real_classes, alpha, gamma, smoothing_factor);
    });
  }

  AT_CUDA_CHECK(cudaGetLastError());
  return {loss, partial_grad};
}

at::Tensor focal_loss_backward_cuda(const at::Tensor& grad_output, const at::Tensor& partial_grad,
                                    const at::Tensor& num_positives_sum) {
  // Each thread process ILP elements
  const int ILP = sizeof(vector_t) / partial_grad.element_size();
  dim3 block(512);
  dim3 grid((partial_grad.numel() + block.x * ILP - 1) / (block.x * ILP));

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(partial_grad.scalar_type(), "focal_loss_bprop", [&] {
    using accscalar_t = at::acc_type<scalar_t, true>;
    using outscalar_t = float;
    const int ILP = sizeof(vector_t) / sizeof(scalar_t);
    focal_loss_backward_cuda_kernel<ILP, scalar_t, accscalar_t, outscalar_t>
        <<<grid, block, 0, stream>>>(partial_grad.data_ptr<scalar_t>(), grad_output.data_ptr<outscalar_t>(),
                                     num_positives_sum.data_ptr<float>(), partial_grad.numel());
  });

  AT_CUDA_CHECK(cudaGetLastError());
  return partial_grad;
}


================================================
FILE: apex/contrib/csrc/gpu_direct_storage/gds.cpp
================================================
// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

#include <gds.h>

// torch
#include <c10/cuda/CUDAGuard.h>
#include <torch/torch.h>

// cuda
#include <cuda_runtime.h>
#include <cufile.h>

// file io
#include <fcntl.h>

namespace apex::contrib::gds {

// POSIX
template <class T, typename std::enable_if<std::is_integral<T>::value, std::nullptr_t>::type = nullptr>
std::string cuFileGetErrorString(T status) {
  status = std::abs(status);
  return IS_CUFILE_ERR(status) ? std::string(CUFILE_ERRSTR(status)) : std::string(std::strerror(errno));
}

// CUfileError_t
template <class T, typename std::enable_if<!std::is_integral<T>::value, std::nullptr_t>::type = nullptr>
std::string cuFileGetErrorString(T status) {
  std::string errStr = cuFileGetErrorString(static_cast<int>(status.err));
  if (IS_CUDA_ERR(status)) errStr.append(".").append(cudaGetErrorString(static_cast<cudaError_t>(status.cu_err)));
  return errStr;
}

File::File() : is_open(false) {};

File::File(const std::string& filename, const std::string& mode) : filename(filename), mode(mode), is_open(false) {
  open(filename, mode);
}

File::~File() {
  if (is_open) {
    close();
  }
}

void File::open(const std::string& other_filename, const std::string& other_mode) {
  TORCH_CHECK(is_open == false, "file", filename, "is already open");
  if (!filename.empty()) {
    TORCH_CHECK(other_filename == filename, "file", filename, "is already open with mode", mode);
  }
  if (!mode.empty()) {
    TORCH_CHECK(other_mode == mode, "file", filename, "is already open with mode", mode);
  }

  maybe_register = true;
  // Open the binary file
  if (mode == "r") {
    // for reading
    fd = ::open(filename.c_str(), O_RDONLY | O_DIRECT);
  } else if (mode == "w") {
    // for writing
    fd = ::open(filename.c_str(), O_CREAT | O_WRONLY | O_DIRECT, 0664);
  } else if (mode == "rn") {
    // for reading
    fd = ::open(filename.c_str(), O_RDONLY);
    maybe_register = false;
  } else if (mode == "wn") {
    // for writing
    fd = ::open(filename.c_str(), O_CREAT | O_WRONLY, 0664);
    maybe_register = false;
  } else {
    TORCH_CHECK(false, "only r and w modes are currently supported, but got:", mode);
  }
  TORCH_CHECK(fd >= 0, "fcntl cannot open file: ", filename);

  // Register cuFile handle
  if (maybe_register) {
    memset((void*)&cf_descr, 0, sizeof(CUfileDescr_t));
    cf_descr.handle.fd = fd;
    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    status = cuFileHandleRegister(&cf_handle, &cf_descr);
    if (status.err != CU_FILE_SUCCESS) {
      TORCH_CHECK(false, "cuFileHandleRegister failed: ", cuFileGetErrorString(status));
    }
  }
  is_open = true;
}

void File::close() {
  // Deregister cuFile handle and close the file
  if (is_open) {
    if (maybe_register) {
      cuFileHandleDeregister(cf_handle);
    }
    ::close(fd);
    fd = -1;
  }
  is_open = false;
}

void File::load_data(const torch::Tensor& tensor) {
  TORCH_CHECK(mode == "r", filename, " was opened for read only");
  c10::cuda::CUDAGuard gpuGuard(tensor.device());

  void* dataPtr = tensor.data_ptr();
  const size_t nbytes = tensor.nbytes();

  // Read the binary file
  ssize_t ret = cuFileRead(cf_handle, (void*)dataPtr, nbytes, 0, 0);
  TORCH_CHECK(ret >= 0, "cuFileWrite failed: ", cuFileGetErrorString(ret));
}

void File::save_data(const torch::Tensor& tensor) {
  TORCH_CHECK(mode == "w", filename, " was opened for write only");
  c10::cuda::CUDAGuard gpuGuard(tensor.device());

  void* dataPtr = tensor.data_ptr();
  const size_t nbytes = tensor.nbytes();

  // Register device memory
  status = cuFileBufRegister(dataPtr, nbytes, 0);
  TORCH_CHECK(status.err == CU_FILE_SUCCESS, "cuFileBufRegister failed: ", cuFileGetErrorString(status));

  // Write device memory contents to the file
  ssize_t ret = cuFileWrite(cf_handle, dataPtr, nbytes, 0, 0);
  status = cuFileBufDeregister(dataPtr);

  TORCH_CHECK(ret >= 0, "cuFileWrite failed: ", cuFileGetErrorString(ret));
  TORCH_CHECK(status.err == CU_FILE_SUCCESS, "cuFileBufDeregister failed:", cuFileGetErrorString(status));
}

// Just for benchmarking purposes

void File::load_data_no_gds(const torch::Tensor& tensor) {
  TORCH_CHECK(mode == "rn", filename, " was opened for read only");
  c10::cuda::CUDAGuard gpuGuard(tensor.device());

  void* dataPtrCPU = nullptr;
  void* dataPtr = tensor.data_ptr();
  const size_t nbytes = tensor.nbytes();
  dataPtrCPU = malloc(nbytes);
  TORCH_CHECK(dataPtrCPU != nullptr, "malloc failed");

  const ssize_t nbytes_read = pread(fd, dataPtrCPU, nbytes, 0);
  TORCH_CHECK(nbytes_read == nbytes || nbytes_read == 0, "fcntl pread failed");
  C10_CUDA_CHECK(cudaMemcpy(dataPtr, dataPtrCPU, nbytes, cudaMemcpyHostToDevice));
  free(dataPtrCPU);
}

void File::save_data_no_gds(const torch::Tensor& tensor) {
  TORCH_CHECK(mode == "wn", filename, " was opened for write only");
  c10::cuda::CUDAGuard gpuGuard(tensor.device());

  void* dataPtrCPU = nullptr;
  void* dataPtr = tensor.data_ptr();
  const size_t nbytes = tensor.nbytes();
  dataPtrCPU = malloc(nbytes);
  TORCH_CHECK(dataPtrCPU != nullptr, "malloc failed");
  C10_CUDA_CHECK(cudaMemcpy(dataPtrCPU, dataPtr, nbytes, cudaMemcpyDeviceToHost));

  const ssize_t nbytes_written = pwrite(fd, dataPtrCPU, nbytes, 0);
  TORCH_CHECK(nbytes_written == nbytes, "fcntl pwrite failed");
  free(dataPtrCPU);
}

}  // namespace apex::contrib::gds


================================================
FILE: apex/contrib/csrc/gpu_direct_storage/gds.h
================================================
// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

#pragma once

#include <cufile.h>
#include <torch/torch.h>

#include <string>

namespace apex::contrib::gds {
class File {
 public:
  File();
  File(const std::string& filename, const std::string& mode);
  ~File();

  void open(const std::string& filename, const std::string& mode);
  void close();

  void load_data(const torch::Tensor& tensor);
  void save_data(const torch::Tensor& tensor);
  void load_data_no_gds(const torch::Tensor& tensor);
  void save_data_no_gds(const torch::Tensor& tensor);

 private:
  std::string filename;
  std::string mode;

  CUfileDescr_t cf_descr;
  CUfileHandle_t cf_handle;
  CUfileError_t status;

  int fd = -1;
  bool is_open = false;
  bool maybe_register = true;
};
}  // namespace apex::contrib::gds


================================================
FILE: apex/contrib/csrc/gpu_direct_storage/gds_pybind.cpp
================================================
// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

#include <gds.h>
#include <torch/extension.h>
#include <torch/torch.h>

#include <string>

// python bindings
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  py::class_<apex::contrib::gds::File, std::shared_ptr<apex::contrib::gds::File>>(m, "_GDSFile")
      .def(py::init<>())
      .def(py::init<const std::string&, const std::string&>())
      .def("open", &apex::contrib::gds::File::open)
      .def("close", &apex::contrib::gds::File::close)
      .def("load_data", &apex::contrib::gds::File::load_data)
      .def("save_data", &apex::contrib::gds::File::save_data)
      .def("load_data_no_gds", &apex::contrib::gds::File::load_data_no_gds)
      .def("save_data_no_gds", &apex::contrib::gds::File::save_data_no_gds);
}


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc.cpp
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#include <assert.h>
#include <float.h>
#include <group_norm_nhwc.h>
#include <group_norm_nhwc_bwd_one_pass.h>
#include <group_norm_nhwc_fwd_one_pass.h>
#include <string.h>
#include <traits.h>

#include <type_traits>

template <typename T>
float inline unpack(const T& x) {
  return {};
}

template <>
float inline unpack(const __half& x) {
  return __half2float(x);
}

template <>
float inline unpack(const __nv_bfloat16& x) {
  return __bfloat162float(x);
}

template <>
float inline unpack(const float& x) {
  return x;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
void check_results(const char* name, const T* out, const T* ref, size_t elts, float tol) {
  // The number of errors.
  int failed = 0;
  // The number of infinite value.
  int infs = 0;
  // The min/max values.
  float min_val = FLT_MAX, max_val = -FLT_MAX, max_err = 0.f;
  // The total sum of error.
  double sum_err = 0.0;

  // The case we are checking.
  printf("\e[1;34mchecking.....................: %s\e[0m\n", name);
  fflush(stdout);

  // Iterate over the different values.
  for (size_t ii = 0; ii < elts; ++ii) {
    float a = unpack(out[ii]);
    float b = unpack(ref[ii]);

    // Compute the absolute norms.
    float abs_a = fabsf(a);
    float abs_b = fabsf(b);

    // Compute the error.
    float den = abs_a + abs_b;
    // Is one of the quantities very small?
    bool is_small = abs_a <= tol || abs_b <= tol || den <= tol;
    // The error.
    float err = is_small ? fabsf(a - b) : fabsf(a - b) / den;
    // Is the result ok?
    bool ok = !isnan(a) && !isnan(b) && err <= tol;

    // Print the error.
    if (!ok && (failed < 10 || err > max_err)) {
      fprintf(stderr, ">> invalid result for ii=%lu:\n", ii);
      if (std::is_same<T, __half>::value || std::is_same<T, __nv_bfloat16>::value) {
        // The data.
        fprintf(stderr, ">>   found...: 0x%04x (%10.6f)\n", reinterpret_cast<const uint16_t&>(out[ii]), a);
        fprintf(stderr, ">>   expected: 0x%04x (%10.6f)\n", reinterpret_cast<const uint16_t&>(ref[ii]), b);
      } else if (std::is_same<T, float>::value) {
        fprintf(stderr, ">>   found...: 0x%08x (%10.6f)\n", reinterpret_cast<const uint32_t&>(a), a);
        fprintf(stderr, ">>   expected: 0x%08x (%10.6f)\n", reinterpret_cast<const uint32_t&>(b), b);
      } else {
        fprintf(stderr, "\e[1;34mUnknown type of check_results\e[0m\n");
        exit(1);
      }
      fprintf(stderr, ">>   error...: %.6f\n", err);
    }

    // Update the number of failures.
    failed += ok ? 0 : 1;

    // Measure min/max errors.
    min_val = fminf(min_val, a);
    max_val = fmaxf(max_val, a);
    max_err = fmaxf(max_err, err);

    // Accumulate the sum.
    sum_err = sum_err + (double)err;

    infs += !isfinite(a);
    infs += !isfinite(b);
  }

  if (!failed && infs < 10) {
    printf("\e[1;32mcheck........................: OK\e[0m\n");
  } else {
    printf("\e[1;31mcheck........................: FAILED\e[0m\n");
  }

  printf("tested.......................: %lu\n", elts);
  printf("failures.....................: %d\n", failed);
  printf("failure rate.................: %.2lf%%\n", (double)failed * 100.0 / (double)elts);
  printf("infs.........................: %d\n", infs);
  printf("tolerance....................: %.8f\n", tol);
  printf("\n");

  printf("min. value...................: %.6f\n", min_val);
  printf("max. value...................: %.6f\n", max_val);
  printf("max. error...................: %.6f\n", max_err);
  printf("sum. error...................: %.6lf\n", sum_err);
  printf("avg. error...................: %.6lf\n", sum_err / (double)elts);
  printf("\n");
}

template void check_results(const char* name, const __half* out, const __half* ref, size_t elts, float tol);

template void check_results(const char* name, const __nv_bfloat16* out, const __nv_bfloat16* ref, size_t elts,
                            float tol);

template void check_results(const char* name, const float* out, const float* ref, size_t elts, float tol);

////////////////////////////////////////////////////////////////////////////////////////////////////

static void group_norm_nhwc_bwd_(void* dx_h, float* dgamma_h, float* dbeta_h, const void* dy_h, const void* x_h,
                                 const float* gamma_h, const float* beta_h, const float2* sums_h, float epsilon, int n,
                                 int h, int w, int c, int groups, bool with_swish, bool use_fp32, bool use_bf16) {
  // The number of channels in each group.
  int channels_per_group = c / groups;
  // The normalization term to compute the means.
  float rcp_hwc_per_group = 1.f / (float)(h * w * channels_per_group);

  // The array to compute gamma.
  float* dgamma = (float*)malloc(c * sizeof(float));
  // The array to compute beta.
  float* dbeta = (float*)malloc(c * sizeof(float));

  // Set gamma/beta to 0.
  memset(dgamma, 0, c * sizeof(float));
  memset(dbeta, 0, c * sizeof(float));

  // Normalize the activations.
  for (int ni = 0; ni < n; ++ni) {
    for (int gi = 0; gi < groups; ++gi) {
      // The sums from the fwd pass.
      float2 sums = sums_h[ni * groups + gi];
      // The mean of X (computed during the fwd pass -- one value per batch*group).
      float x_mean = sums.x;
      // The mean of squares of X (computed during the fwd pass -- one value per batch*group).
      float x_sq_mean = sums.y;
      // The variance.
      float x_var = x_sq_mean - x_mean * x_mean;
      // The reciprocal of the standard deviation (i.e. 1.f / sqrt(var + epsilon)).
      float rcp_x_stddev = x_var <= 0.f ? 1.f : 1.f / sqrtf(x_var + epsilon);

      // TODO: We should store rcp_x_stddev instead of the sums of squares.

      // The following nested loops compute 2 means.
      float mean_1 = 0.f, mean_2 = 0.f;

      // Iterate over the activations in the group.
      for (int hi = 0; hi < h; ++hi) {
        for (int wi = 0; wi < w; ++wi) {
          for (int ii = 0; ii < channels_per_group; ++ii) {
            // The channel.
            int ci = gi * channels_per_group + ii;
            // Compute the src/dst offset.
            size_t offset = (size_t)ni * h * w * c + (size_t)hi * w * c + (size_t)wi * c + (size_t)ci;
            // Convert the element at that position to float.
            float x;
            if (use_fp32) {
              x = reinterpret_cast<const float*>(x_h)[offset];
            } else if (use_bf16) {
              x = __bfloat162float(reinterpret_cast<const __nv_bfloat16*>(x_h)[offset]);
            } else {
              x = __half2float(reinterpret_cast<const __half*>(x_h)[offset]);
            }
            // The output.
            float dy;
            if (use_fp32) {
              dy = reinterpret_cast<const float*>(dy_h)[offset];
            } else if (use_bf16) {
              dy = __bfloat162float(reinterpret_cast<const __nv_bfloat16*>(dy_h)[offset]);
            } else {
              dy = __half2float(reinterpret_cast<const __half*>(dy_h)[offset]);
            }

            // Gamma.
            float gamma = gamma_h[ci];

            // X - X_mean.
            float x_minus_x_mean = x - x_mean;
            // Normalize X.
            float x_norm = x_minus_x_mean * rcp_x_stddev;

            if (with_swish) {
              // Beta
              float beta = beta_h[ci];

              float x_gn = x_norm * gamma + beta;
              float s = sigmoid(x_gn);
              dy = dy * s * (1.f + x_gn * (1.f - s));
            }

            // Compute the gradient for beta.
            dbeta[ci] += dy;

            // Compute the gradient for gamma.
            dgamma[ci] += dy * x_norm;

            // The gradient that enters the x_norm node.
            float dx_norm = dy * gamma;

            // Accumulators over 2 means
            mean_1 += x_norm * dx_norm;
            mean_2 += dx_norm;

          }  // ii
        }  // wi
      }  // hi

      mean_1 *= rcp_hwc_per_group;
      mean_2 *= rcp_hwc_per_group;

      // Iterate over the activations in the group.
      for (int hi = 0; hi < h; ++hi) {
        for (int wi = 0; wi < w; ++wi) {
          for (int ii = 0; ii < channels_per_group; ++ii) {
            // The channel.
            int ci = gi * channels_per_group + ii;
            // Compute the src/dst offset.
            size_t offset = (size_t)ni * h * w * c + (size_t)hi * w * c + (size_t)wi * c + (size_t)ci;
            float x;
            if (use_fp32) {
              x = reinterpret_cast<const float*>(x_h)[offset];
            } else if (use_bf16) {
              x = __bfloat162float(reinterpret_cast<const __nv_bfloat16*>(x_h)[offset]);
            } else {
              x = __half2float(reinterpret_cast<const __half*>(x_h)[offset]);
            }
            // The output.
            float dy;
            if (use_fp32) {
              dy = reinterpret_cast<const float*>(dy_h)[offset];
            } else if (use_bf16) {
              dy = __bfloat162float(reinterpret_cast<const __nv_bfloat16*>(dy_h)[offset]);
            } else {
              dy = __half2float(reinterpret_cast<const __half*>(dy_h)[offset]);
            }

            // Gamma.
            float gamma = gamma_h[ci];

            // X - X_mean.
            float x_minus_x_mean = x - x_mean;
            // Normalize X.
            float x_norm = x_minus_x_mean * rcp_x_stddev;

            if (with_swish) {
              // Beta
              float beta = beta_h[ci];

              float x_gn = x_norm * gamma + beta;
              float s = sigmoid(x_gn);
              dy = dy * s * (1.f + x_gn * (1.f - s));
            }

            // The gradient that enters the x_norm node.
            float dx_norm = dy * gamma;

            // Input gradient
            float dx = (dx_norm - (x_norm * mean_1 + mean_2)) * rcp_x_stddev;

            // Set the output gradient.
            if (use_fp32) {
              reinterpret_cast<float*>(dx_h)[offset] = dx;
            } else if (use_bf16) {
              reinterpret_cast<__nv_bfloat16*>(dx_h)[offset] = __float2bfloat16_rn(dx);
            } else {
              reinterpret_cast<__half*>(dx_h)[offset] = __float2half_rn(dx);
            }

          }  // ii
        }  // wi
      }  // hi

    }  // gi
  }  // ni

  // Store gamma/beta.
  for (int ci = 0; ci < c; ++ci) {
    dgamma_h[ci] = dgamma[ci];
    dbeta_h[ci] = dbeta[ci];
  }

  // Release temporary memory.
  free(dgamma);
  free(dbeta);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

static void group_norm_nhwc_fwd_(void* y_h, const void* x_h, const float* gamma_h, const float* beta_h, float epsilon,
                                 int n, int h, int w, int c, int groups, bool with_swish, bool use_fp32,
                                 bool use_bf16) {
  // The number of channels in each group.
  int channels_per_group = c / groups;

  // The normalization term to compute the means.
  float inv_hwcg = 1.f / (float)(h * w * channels_per_group);

  // Normalize the activations.
  for (int ni = 0; ni < n; ++ni) {
    for (int gi = 0; gi < groups; ++gi) {
      // The sums to compute the mean/variance for that group.
      float sum = 0.f, sum_sq = 0.f;

      // Iterate over the activations in the group.
      for (int hi = 0; hi < h; ++hi) {
        for (int wi = 0; wi < w; ++wi) {
          for (int ii = 0; ii < channels_per_group; ++ii) {
            // The channel.
            int ci = gi * channels_per_group + ii;
            // Compute the src/dst offset.
            size_t offset = (size_t)ni * h * w * c + (size_t)hi * w * c + (size_t)wi * c + (size_t)ci;
            // Convert the element at that position to float.
            float x;
            if (use_fp32) {
              x = reinterpret_cast<const float*>(x_h)[offset];
            } else if (use_bf16) {
              x = __bfloat162float(reinterpret_cast<const __nv_bfloat16*>(x_h)[offset]);
            } else {
              x = __half2float(reinterpret_cast<const __half*>(x_h)[offset]);
            }

            // Update the sums.
            sum += x;
            sum_sq += x * x;

          }  // ii
        }  // wi
      }  // hi

      // Compute the mean.
      float mean = sum * inv_hwcg;
      // Compute the average value for the squares.
      float mean_sq = sum_sq * inv_hwcg;
      // Compute the variance.
      float var = mean_sq - (mean * mean);
      // Invert the variance.
      float inv_stddev = var <= 0.f ? 1.f : (1.f / sqrtf(var + epsilon));

      // Iterate over the data to normalize the output.
      for (int hi = 0; hi < h; ++hi) {
        for (int wi = 0; wi < w; ++wi) {
          for (int ii = 0; ii < channels_per_group; ++ii) {
            // The channel.
            int ci = gi * channels_per_group + ii;
            // Compute the src/dst offset.
            size_t offset = (size_t)ni * h * w * c + (size_t)hi * w * c + (size_t)wi * c + (size_t)ci;
            // Normalize.
            float x;
            if (use_fp32) {
              x = reinterpret_cast<const float*>(x_h)[offset];
            } else if (use_bf16) {
              x = __bfloat162float(reinterpret_cast<const __nv_bfloat16*>(x_h)[offset]);
            } else {
              x = __half2float(reinterpret_cast<const __half*>(x_h)[offset]);
            }
            float y = (x - mean) * inv_stddev;
            // Scale with gamma and add beta.
            y = y * gamma_h[ci] + beta_h[ci];
            // Apply swish (if needed).
            if (with_swish) {
              y = y * sigmoid(y);
            }
            // Store the result.
            if (use_fp32) {
              reinterpret_cast<float*>(y_h)[offset] = y;
            } else if (use_bf16) {
              reinterpret_cast<__nv_bfloat16*>(y_h)[offset] = __float2bfloat16_rn(y);
            } else {
              reinterpret_cast<__half*>(y_h)[offset] = __float2half_rn(y);
            }

          }  // ii
        }  // wi
      }  // hi
    }  // gi
  }  // ni
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
void random_data(T* dst_h, size_t n, bool use_1s, int range = 3) {
  for (size_t ii = 0; ii < n; ++ii) {
    float x = 1.f;
    if (!use_1s) {
      x = (float)(rand() % range - (range / 2));
    }
    if (std::is_same<T, __half>::value) {
      dst_h[ii] = __float2half_rn(x);
    } else if (std::is_same<T, float>::value) {
      dst_h[ii] = x;
    } else if (std::is_same<T, __nv_bfloat16>::value) {
      dst_h[ii] = __float2bfloat16_rn(x);
    } else {
      fprintf(stderr, "\e[1;34mUnknown type of random_data\e[0m\n");
      exit(1);
    }
  }
}

template void random_data(float* dst_h, size_t n, bool use_1s, int range);

template void random_data(__half* dst_h, size_t n, bool use_1s, int range);

template void random_data(__nv_bfloat16* dst_h, size_t n, bool use_1s, int range);

////////////////////////////////////////////////////////////////////////////////////////////////////

enum class Mode { FWD_INFERENCE, FWD_TRAINING, BWD };

////////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char** argv) {
  // The tensor size.
  int n = 2, h = 64, w = 64, c = 320, groups = 32;
  // The default mode is inference.
  Mode mode = Mode::FWD_INFERENCE;
  // The constant epsilon for sqrt(var + epsilon).
  float epsilon = 1.e-5f;
  // Do we fuse with the Swish activation function?
  bool with_swish = false;
  // Do we use the one-pass kernel?
  bool use_one_pass = false;
  // The number of runs to time the code.
  int runs = 1;
  // Do we use 1s for the input data.
  bool use_1s = false;
  // The tolerance to check the results.
  float tol = 1.e-3f;
  // Do we skip the checks?
  bool skip_checks = false;
  // Do we output csv format only
  bool csv_output = false;
  // Use fp32 IO
  bool use_fp32 = false;
  // Use bf16 IO
  bool use_bf16 = false;

  // Parse the parameters.
  for (int ii = 1; ii < argc; ++ii) {
    if (!strcmp(argv[ii], "-1s")) {
      use_1s = true;
    } else if (!strcmp(argv[ii], "-bwd")) {
      mode = Mode::BWD;
    } else if (!strcmp(argv[ii], "-c") && ++ii < argc) {
      c = strtol(argv[ii], nullptr, 10);
    } else if (!strcmp(argv[ii], "-epsilon") && ++ii < argc) {
      epsilon = (float)strtod(argv[ii], nullptr);
    } else if (!strcmp(argv[ii], "-fwd")) {
      mode = Mode::FWD_INFERENCE;
    } else if (!strcmp(argv[ii], "-fwd-tr")) {
      mode = Mode::FWD_TRAINING;
    } else if (!strcmp(argv[ii], "-groups") && ++ii < argc) {
      groups = strtol(argv[ii], nullptr, 10);
    } else if (!strcmp(argv[ii], "-h") && ++ii < argc) {
      h = strtol(argv[ii], nullptr, 10);
    } else if (!strcmp(argv[ii], "-n") && ++ii < argc) {
      n = strtol(argv[ii], nullptr, 10);
    } else if (!strcmp(argv[ii], "-one-pass")) {
      use_one_pass = true;
    } else if (!strcmp(argv[ii], "-runs") && ++ii < argc) {
      runs = strtol(argv[ii], nullptr, 10);
    } else if (!strcmp(argv[ii], "-skip-checks")) {
      skip_checks = true;
    } else if (!strcmp(argv[ii], "-tol") && ++ii < argc) {
      tol = (float)strtod(argv[ii], nullptr);
    } else if (!strcmp(argv[ii], "-w") && ++ii < argc) {
      w = strtol(argv[ii], nullptr, 10);
    } else if (!strcmp(argv[ii], "-with-swish")) {
      with_swish = true;
    } else if (!strcmp(argv[ii], "-csv")) {
      csv_output = true;
    } else if (!strcmp(argv[ii], "-fp32")) {
      use_fp32 = true;
    } else if (!strcmp(argv[ii], "-bf16")) {
      use_bf16 = true;
    } else if (ii < argc) {
      fprintf(stderr, "Unknown argument: %s\n", argv[ii]);
      return 1;
    } else {
      fprintf(stderr, "Argument %s requires a value\n", argv[ii - 1]);
      return 1;
    }
  }

  if (use_bf16 && use_fp32) {
    fprintf(stderr, "Can't use fp32 and bf16 IO at the same time\n");
    return 1;
  }

  // Header.
  if (!csv_output) {
    printf("\n");
    printf("#######################################################################\n");
    printf("# Group Norm NHWC + Swish kernel\n");
    printf("# --\n");
    printf("# Compiled on %s\n", __DATE__);
    printf("#######################################################################\n");
    printf("\n");
  }

  // GPU info.
  cudaDeviceProp props;
  CHECK_CUDA(cudaGetDeviceProperties(&props, 0));
  if (!csv_output) {
    printf("device.......................: %s\n", props.name);
    printf("cc...........................: %d.%d\n", props.major, props.minor);
    printf("# of sms.....................: %d\n", props.multiProcessorCount);
  }

  // Dram peak bandwidth.
  float dram_clock = props.memoryClockRate / 1.e6f;
  float dram_peak = 2.f * dram_clock * props.memoryBusWidth / 8.f;
  if (!csv_output) {
    printf("dram clock...................: %.3f GHz\n", dram_clock);
    printf("dram peak....................: %.3f TB/s\n", dram_peak * 1.e-3f);
    printf("\n");
  }

  // Output the problem size.
  if (!csv_output) {
    printf("n............................: %d\n", n);
    printf("h............................: %d\n", h);
    printf("w............................: %d\n", w);
    printf("c............................: %d\n", c);
    printf("groups.......................: %d\n", groups);
    printf("epsilon......................: %f\n", epsilon);
    printf("with swish...................: %s\n", with_swish ? "true" : "false");
    printf("channels per group...........: %d\n", c / groups);
    if (mode == Mode::BWD) {
      printf("mode.........................: bwd\n");
    } else if (mode == Mode::FWD_INFERENCE) {
      printf("mode.........................: fwd inference\n");
    } else if (mode == Mode::FWD_TRAINING) {
      printf("mode.........................: fwd training\n");
    } else {
      assert(false);
    }
    printf("\n");
  }

  // Compute the SOL.
  double bytes = 0;
  int32_t io_bytes = use_fp32 ? sizeof(float) : sizeof(__half);
  if (mode != Mode::BWD) {
    bytes = (double)n * h * w * c * io_bytes +  // src
            (double)c * 4 +                     // gamma
            (double)c * 4 +                     // beta
            (double)n * h * w * c * io_bytes;   // out
  } else {
    bytes = (double)n * h * w * c * io_bytes * 2 +  // src, dsrc
            (double)c * 4 * 2 +                     // gamma, dgamma
            (double)c * 4 * 2 +                     // beta, dbeta
            (double)n * h * w * c * io_bytes * 1;   // dout
  }
  double gbytes = bytes * 1.e-9;
  double dram_sol = gbytes / dram_peak * 1.e3;
  if (!csv_output) {
    printf("bytes........................: %.3lfGB\n", gbytes);
    printf("dram sol.....................: %.6lfms\n", dram_sol);

    // The number of runs to measure performance.
    printf("runs.........................: %d\n", runs);
    printf("\n");
  }

  // The number of elements in the x tensor. The layout is N x H x W x C.
  size_t x_elts = (size_t)n * h * w * c;
  // The size of the src in bytes.
  size_t x_sz = x_elts * io_bytes;

  // Allocate the src/dst on the host.
  void* x_h = malloc(x_sz);
  void* y_h = malloc(x_sz);

  // Allocate src/dst on the device.
  void *x_d, *y_d;
  CHECK_CUDA(cudaMalloc((void**)&x_d, x_sz));
  CHECK_CUDA(cudaMalloc((void**)&y_d, x_sz));

  // The number of elements in the gamma/beta array.
  size_t gamma_elts = (size_t)c;
  // The size of the gamma/beta array in bytes.
  size_t gamma_sz = gamma_elts * sizeof(float);
  // Allocate gamma/beta on the host.
  float* gamma_h = (float*)malloc(gamma_sz);
  // Allocate gamma/beta on the device.
  float* gamma_d;
  CHECK_CUDA(cudaMalloc((void**)&gamma_d, gamma_sz));

  // Allocate gamma/beta on the host.
  float* beta_h = (float*)malloc(gamma_sz);
  // Allocate gamma/beta on the device.
  float* beta_d;
  CHECK_CUDA(cudaMalloc((void**)&beta_d, gamma_sz));

  // Allocate the reference on the host (to be computed on the host).
  void* y_ref_h = nullptr;
  if (!skip_checks) {
    y_ref_h = malloc(x_sz);
  }

  // Allocate the src/dst on the host for the gradients (bwd).
  void *dx_h = nullptr, *dy_h = nullptr;
  if (mode == Mode::BWD) {
    dx_h = malloc(x_sz);
    dy_h = malloc(x_sz);
  }

  // Allocate src/dst on the device.
  void *dx_d = nullptr, *dy_d = nullptr;
  if (mode == Mode::BWD) {
    CHECK_CUDA(cudaMalloc((void**)&dx_d, x_sz));
    CHECK_CUDA(cudaMalloc((void**)&dy_d, x_sz));
  }

  // The gradients for gamma and beta on the host.
  float *dgamma_h = nullptr, *dbeta_h = nullptr;
  if (mode == Mode::BWD) {
    dgamma_h = (float*)malloc(gamma_sz);
    dbeta_h = (float*)malloc(gamma_sz);
  }

  // The gradients for gamma and beta on the device.
  float *dgamma_d = nullptr, *dbeta_d = nullptr;
  if (mode == Mode::BWD) {
    CHECK_CUDA(cudaMalloc((void**)&dgamma_d, gamma_sz));
    CHECK_CUDA(cudaMalloc((void**)&dbeta_d, gamma_sz));
  }

  // The number of sums for the bwd pass.
  size_t sums_elts = mode == Mode::FWD_INFERENCE ? 0 : n * groups;
  // The size needed to store that array.
  size_t sums_sz = sums_elts * sizeof(float2);

  // The sums for the bwd pass on the host.
  float2* sums_h = nullptr;
  if (sums_sz > 0) {
    sums_h = (float2*)malloc(sums_sz);
  }

  // The sums for the bwd pass on the device.
  float2* sums_d = nullptr;
  if (sums_sz > 0) {
    CHECK_CUDA(cudaMalloc((void**)&sums_d, sums_sz));
  }

  // Allocate the reference on the host (to be computed on the host).
  void* dx_ref_h = nullptr;
  if (mode == Mode::BWD && !skip_checks) {
    dx_ref_h = malloc(x_sz);
  }

  // Allocate the reference on the host (to be computed on the host).
  float *dgamma_ref_h = nullptr, *dbeta_ref_h = nullptr;
  if (mode == Mode::BWD && !skip_checks) {
    dgamma_ref_h = (float*)malloc(gamma_sz);
    dbeta_ref_h = (float*)malloc(gamma_sz);
  }

  // Generate random input data for the forward pass.
  if (use_fp32) {
    random_data<float>(reinterpret_cast<float*>(x_h), x_elts, use_1s);
  } else if (use_bf16) {
    random_data<__nv_bfloat16>(reinterpret_cast<__nv_bfloat16*>(x_h), x_elts, use_1s);
  } else {
    random_data<__half>(reinterpret_cast<__half*>(x_h), x_elts, use_1s);
  }
  random_data<float>(gamma_h, gamma_elts, use_1s);
  random_data<float>(beta_h, gamma_elts, use_1s);

  // Generate the gradients for the bwd pass.
  if (mode == Mode::BWD) {
    if (use_fp32) {
      random_data<float>(reinterpret_cast<float*>(dy_h), x_elts, use_1s);
    } else if (use_bf16) {
      random_data<__nv_bfloat16>(reinterpret_cast<__nv_bfloat16*>(dy_h), x_elts, use_1s);
    } else {
      random_data<__half>(reinterpret_cast<__half*>(dy_h), x_elts, use_1s);
    }
  }

  // Precompute the sums (from the fwd pass) for bwd.
  if (mode == Mode::BWD) {
    // Clear the array of sums (all the elements are set to 0.f).
    memset(sums_h, 0, sums_sz);

    // The number of channels in each group.
    int channels_per_group = c / groups;
    // Iterate over the different groups.
    for (int ni = 0; ni < n; ++ni) {
      for (int gi = 0; gi < groups; ++gi) {
        for (int hi = 0; hi < h; ++hi) {
          for (int wi = 0; wi < w; ++wi) {
            for (int ii = 0; ii < channels_per_group; ++ii) {
              // The position of the channel.
              int ci = gi * channels_per_group + ii;
              // The offset to the element.
              int64_t offset = (int64_t)ni * h * w * c + hi * w * c + wi * c + ci;
              // The element in float.
              float x;
              if (use_fp32) {
                x = reinterpret_cast<float*>(x_h)[offset];
              } else if (use_bf16) {
                x = __bfloat162float(reinterpret_cast<__nv_bfloat16*>(x_h)[offset]);
              } else {
                x = __half2float(reinterpret_cast<__half*>(x_h)[offset]);
              }

              // Update the sums (sum of X and sum of squares).
              sums_h[ni * groups + gi].x += x;
              sums_h[ni * groups + gi].y += x * x;
            }
          }
        }
      }
    }

    // The normalization term to compute the means.
    float rcp_hwc_per_group = 1.f / (float)(h * w * channels_per_group);
    // Normalize the sums.
    for (int ngi = 0; ngi < n * groups; ++ngi) {
      sums_h[ngi].x *= rcp_hwc_per_group;
      sums_h[ngi].y *= rcp_hwc_per_group;
    }
  }

  // Compute the golden reference on the host.
  if (!skip_checks) {
    if (mode == Mode::BWD) {
      group_norm_nhwc_bwd_(dx_ref_h, dgamma_ref_h, dbeta_ref_h, dy_h, x_h, gamma_h, beta_h, sums_h, epsilon, n, h, w, c,
                           groups, with_swish, use_fp32, use_bf16);
    } else {
      group_norm_nhwc_fwd_(y_ref_h, x_h, gamma_h, beta_h, epsilon, n, h, w, c, groups, with_swish, use_fp32, use_bf16);
    }
  }

  // Copy to the device.
  CHECK_CUDA(cudaMemcpyAsync(x_d, x_h, x_sz, cudaMemcpyHostToDevice, cudaStreamDefault));
  CHECK_CUDA(cudaMemcpyAsync(gamma_d, gamma_h, gamma_sz, cudaMemcpyHostToDevice, cudaStreamDefault));
  CHECK_CUDA(cudaMemcpyAsync(beta_d, beta_h, gamma_sz, cudaMemcpyHostToDevice, cudaStreamDefault));

  if (mode == Mode::BWD) {
    CHECK_CUDA(cudaMemcpyAsync(dy_d, dy_h, x_sz, cudaMemcpyHostToDevice, cudaStreamDefault));

    // // DEBUG.
    // printf("sums_h[0] = %8.3f, %8.3f\n", sums_h[0].x, sums_h[0].y);
    // // END OF DEBUG.

    CHECK_CUDA(cudaMemcpyAsync(sums_d, sums_h, sums_sz, cudaMemcpyHostToDevice, cudaStreamDefault));
  }

  // Reset the output buffer with garbage to detect invalid results.
  if (mode == Mode::BWD) {
    CHECK_CUDA(cudaMemsetAsync(dx_d, 0xdc, x_sz, cudaStreamDefault));
    CHECK_CUDA(cudaMemsetAsync(dgamma_d, 0xdc, gamma_sz, cudaStreamDefault));
    CHECK_CUDA(cudaMemsetAsync(dbeta_d, 0xdc, gamma_sz, cudaStreamDefault));
  } else {
    CHECK_CUDA(cudaMemsetAsync(y_d, 0xdc, x_sz, cudaStreamDefault));
  }

  // Declare the parameters.
  Group_norm_nhwc_fwd_params params_fwd;
  memset(&params_fwd, 0, sizeof(params_fwd));
  Group_norm_nhwc_bwd_params params_bwd;
  memset(&params_bwd, 0, sizeof(params_bwd));

  const auto precision = [&]() -> PrecisionMode {
    if (use_fp32) {
      return PrecisionMode::FP32IOFP32W;
    } else if (use_bf16) {
      return PrecisionMode::BF16IOFP32W;
    } else {
      return PrecisionMode::FP16IOFP32W;
    }
  }();

  // Initialize the parameters.
  if (mode == Mode::BWD) {
    params_bwd.dx = dx_d;
    params_bwd.dgamma = dgamma_d;
    params_bwd.dbeta = dbeta_d;
    params_bwd.sums = sums_d;
    params_bwd.dy = dy_d;
    params_bwd.x = x_d;
    params_bwd.gamma = gamma_d;
    params_bwd.beta = beta_d;
    params_bwd.epsilon = epsilon;
    params_bwd.n = n;
    params_bwd.h = h;
    params_bwd.w = w;
    params_bwd.c = c;
    params_bwd.groups = groups;
    params_bwd.with_swish = with_swish;
    params_bwd.precision = precision;
  } else {
    params_fwd.y = y_d;
    params_fwd.sums = sums_d;
    params_fwd.x = x_d;
    params_fwd.gamma = gamma_d;
    params_fwd.beta = beta_d;
    params_fwd.epsilon = epsilon;
    params_fwd.n = n;
    params_fwd.h = h;
    params_fwd.w = w;
    params_fwd.c = c;
    params_fwd.groups = groups;
    params_fwd.with_swish = with_swish;
    params_fwd.precision = precision;
  }

  // The number of barriers.
  size_t barriers_elts = 0;
  // The number of elements in the reduction buffer.
  size_t red_buffer_elts = 0;
  // The number of elements in the reduction buffer that must be zeroed.
  size_t zeroed_red_buffer_elts = 0;

  // Finalize the parameters.
  dim3 grid;
  if (mode == Mode::BWD && use_one_pass) {
    group_norm_nhwc_bwd_one_pass_setup(params_bwd, barriers_elts, red_buffer_elts, zeroed_red_buffer_elts, grid, props);
  } else if (mode == Mode::BWD) {
    group_norm_nhwc_bwd_two_passes_setup(params_bwd, zeroed_red_buffer_elts);
  } else if (use_one_pass) {
    group_norm_nhwc_fwd_one_pass_setup(params_fwd, barriers_elts, red_buffer_elts, grid, props);
  } else {
    group_norm_nhwc_fwd_two_passes_setup(params_fwd, zeroed_red_buffer_elts);
  }

  // The size in bytes for the reduction buffer.
  size_t red_buffer_sz = red_buffer_elts * sizeof(float);
  // Allocate on the device.
  if (red_buffer_sz > 0) {
    float** ptr = mode == Mode::BWD ? &params_bwd.red_buffer : &params_fwd.red_buffer;
    CHECK_CUDA(cudaMalloc((void**)ptr, red_buffer_sz));
  }

  // The size of the array of barriers.
  size_t barriers_sz = barriers_elts * sizeof(int);
  // The size in bytes for the reduction buffer that must be zeroed.
  size_t zeroed_red_buffer_sz = barriers_sz + zeroed_red_buffer_elts * sizeof(float);

  // Allocate the buffer if needed.
  void* zeroed_red_buffer_d_ = nullptr;
  if (zeroed_red_buffer_sz > 0) {
    CHECK_CUDA(cudaMalloc((void**)&zeroed_red_buffer_d_, zeroed_red_buffer_sz));
  }

  // The buffer of barriers. DO NOT CALL cudaFree on it!!!
  int* barriers_d = reinterpret_cast<int*>(zeroed_red_buffer_d_);
  // The zeroed red buffer. DO NOT CALL cudaFree on it!!!
  float* zeroed_red_buffer_d = reinterpret_cast<float*>(&barriers_d[barriers_elts]);
  // Must be aligned on 4B for floats. It obviously is (unless someone changes the code ;)).
  assert(reinterpret_cast<const int64_t&>(zeroed_red_buffer_d) % sizeof(float) == 0);

  // Set the barriers if needed.
  if (mode == Mode::BWD) {
    params_bwd.barriers = barriers_d;
    params_bwd.zeroed_red_buffer = zeroed_red_buffer_d;
  } else {
    params_fwd.barriers = barriers_d;
    params_fwd.zeroed_red_buffer = zeroed_red_buffer_d;
  }

  // Create events to time the reference code.
  cudaEvent_t start, stop;
  CHECK_CUDA(cudaEventCreate(&start));
  CHECK_CUDA(cudaEventCreate(&stop));

  // Time the reference code.
  CHECK_CUDA(cudaEventRecord(start));
  for (int ii = 0; ii < runs; ++ii) {
    // Clear the zeroed buffer if needed.
    if (zeroed_red_buffer_sz > 0) {
      CHECK_CUDA(cudaMemsetAsync(zeroed_red_buffer_d_, 0, zeroed_red_buffer_sz, cudaStreamDefault));
    }
    if (use_one_pass && mode == Mode::BWD) {
      group_norm_nhwc_bwd_one_pass_run(params_bwd, grid, cudaStreamDefault);
    } else if (use_one_pass) {
      group_norm_nhwc_fwd_one_pass_run(params_fwd, grid, cudaStreamDefault);
    } else if (mode == Mode::BWD) {
      group_norm_nhwc_bwd_two_passes_sum(params_bwd, cudaStreamDefault);
      group_norm_nhwc_bwd_two_passes_scale(params_bwd, cudaStreamDefault);
    } else {
      group_norm_nhwc_fwd_two_passes_sum(params_fwd, cudaStreamDefault);
      group_norm_nhwc_fwd_two_passes_scale(params_fwd, cudaStreamDefault);
    }
  }
  CHECK_CUDA(cudaEventRecord(stop));
  CHECK_CUDA(cudaDeviceSynchronize());

  // Print the runtime.
  float elapsed = 0.f;
  CHECK_CUDA(cudaEventElapsedTime(&elapsed, start, stop));
  if (!csv_output) {
    printf("elapsed......................: %.3fms\n", elapsed);
    printf("elapsed per run..............: %.3fms\n", elapsed / (float)runs);
    printf("efficiency...................: %.3lf%%\n", dram_sol * runs / elapsed * 100.0);
    printf("\n");
  }

  // Copy the results to the host.
  if (mode == Mode::BWD) {
    CHECK_CUDA(cudaMemcpyAsync(dx_h, dx_d, x_sz, cudaMemcpyDeviceToHost, cudaStreamDefault));
    CHECK_CUDA(cudaMemcpyAsync(dgamma_h, dgamma_d, gamma_sz, cudaMemcpyDeviceToHost, cudaStreamDefault));
    CHECK_CUDA(cudaMemcpyAsync(dbeta_h, dbeta_d, gamma_sz, cudaMemcpyDeviceToHost, cudaStreamDefault));
  } else {
    CHECK_CUDA(cudaMemcpyAsync(y_h, y_d, x_sz, cudaMemcpyDeviceToHost, cudaStreamDefault));
  }

  // Make sure the data has been transferred.
  CHECK_CUDA(cudaStreamSynchronize(cudaStreamDefault));

  // Check the results.
  if (!csv_output) {
    if (mode == Mode::BWD && !skip_checks) {
      if (use_fp32) {
        check_results<float>("dx", reinterpret_cast<float*>(dx_h), reinterpret_cast<float*>(dx_ref_h), x_elts, tol);
      } else if (use_bf16) {
        check_results<__nv_bfloat16>("dx", reinterpret_cast<__nv_bfloat16*>(dx_h),
                                     reinterpret_cast<__nv_bfloat16*>(dx_ref_h), x_elts, tol);
      } else {
        check_results<__half>("dx", reinterpret_cast<__half*>(dx_h), reinterpret_cast<__half*>(dx_ref_h), x_elts, tol);
      }
      check_results<float>("dgamma", dgamma_h, dgamma_ref_h, gamma_elts, tol);
      check_results<float>("dbeta", dbeta_h, dbeta_ref_h, gamma_elts, tol);
    } else if (!skip_checks) {
      if (use_fp32) {
        check_results<float>("y", reinterpret_cast<float*>(y_h), reinterpret_cast<float*>(y_ref_h), x_elts, tol);
      } else if (use_bf16) {
        check_results<__nv_bfloat16>("y", reinterpret_cast<__nv_bfloat16*>(y_h),
                                     reinterpret_cast<__nv_bfloat16*>(y_ref_h), x_elts, tol);
      } else {
        check_results<__half>("y", reinterpret_cast<__half*>(y_h), reinterpret_cast<__half*>(y_ref_h), x_elts, tol);
      }
    }
  } else {
    printf("%d,%d,%d,%d,%d,%d,%d,%f\n", n, h, w, c, groups, (uint32_t)use_one_pass, (uint32_t)mode,
           elapsed / (float)runs);
  }

  // Destroy the cuda events.
  CHECK_CUDA(cudaEventDestroy(start));
  CHECK_CUDA(cudaEventDestroy(stop));

  // Release device memory.
  CHECK_CUDA(cudaFree(x_d));
  CHECK_CUDA(cudaFree(y_d));
  CHECK_CUDA(cudaFree(gamma_d));
  CHECK_CUDA(cudaFree(beta_d));
  CHECK_CUDA(cudaFree(dx_d));
  CHECK_CUDA(cudaFree(dy_d));
  CHECK_CUDA(cudaFree(dgamma_d));
  CHECK_CUDA(cudaFree(dbeta_d));
  CHECK_CUDA(cudaFree(sums_d));
  CHECK_CUDA(cudaFree(zeroed_red_buffer_d_));
  CHECK_CUDA(cudaFree(params_bwd.red_buffer));
  CHECK_CUDA(cudaFree(params_fwd.red_buffer));

  // Release host memory.
  free(x_h);
  free(y_h);
  free(gamma_h);
  free(beta_h);
  free(dx_h);
  free(dy_h);
  free(dgamma_h);
  free(dbeta_h);
  free(sums_h);
  free(y_ref_h);
  free(dx_ref_h);
  free(dgamma_ref_h);
  free(dbeta_ref_h);

  // Release the GPU.
  CHECK_CUDA(cudaDeviceReset());
  return 0;
}

////////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc.h
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#pragma once

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

////////////////////////////////////////////////////////////////////////////////////////////////////

#define CHECK_CUDA(call)                                                                            \
  do {                                                                                              \
    cudaError_t status_ = call;                                                                     \
    if (status_ != cudaSuccess) {                                                                   \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
      exit(1);                                                                                      \
    }                                                                                               \
  } while (0)

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ __host__ int div_up(int m, int n) { return (m + n - 1) / n; }

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ __host__ float sigmoid(float x) { return 1.f / (1.f + expf(-x)); }

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline __device__ void spin_wait_(int* barrier, int step, int expected) {
  // THE FOLLOWING CODE MUST BE EXECUTED BY A SINGLE THREAD IN THE CTA.

  // Update the global counter. Make sure prior writes are visible.
  asm volatile("red.release.gpu.global.add.s32 [%0], %1;" ::"l"(barrier), "r"(step));

  // Busy wait. We could use found = old + step with old = atomicAdd(...) but it's not faster.
  for (volatile int found = -1; found != expected;) {
    asm volatile("ld.global.acquire.gpu.b32 %0, [%1];" : "=r"(found) : "l"(barrier));
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Input type followed by parameter type
enum PrecisionMode {
  FP32IOFP16W,
  FP32IOBF16W,
  FP32IOFP32W,
  FP16IOFP16W,
  FP16IOBF16W,
  FP16IOFP32W,
  BF16IOFP16W,
  BF16IOBF16W,
  BF16IOFP32W,
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Group_sums {
  // Is it the 1st element of the group?
  int flag;
  // The sum.
  float sum;
  // The sum of squares.
  float sum_sq;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Group_sums_op {
  inline __device__ Group_sums operator()(const Group_sums& a, const Group_sums& b) {
    Group_sums dst;
    dst.sum = b.flag ? b.sum : (a.sum + b.sum);
    dst.sum_sq = b.flag ? b.sum_sq : (a.sum_sq + b.sum_sq);
    dst.flag = a.flag + b.flag;
    return dst;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Group_norm_nhwc_fwd_params {
  // The output buffer. Layout NHWC.
  void* y;
  // The sums for the bwd pass. Not written if it is a nullptr.
  float2* sums;
  // The input buffer. Layout NHWC.
  const void* x;
  // The gamma scaling factor.
  const void* gamma;
  // The beta term to add in GN.
  const void* beta;
  // The constant epsilon for sqrt(var + epsilon).
  float epsilon;
  // The barriers for the persistent kernel.
  int* barriers;
  // The extra storage for multi-CTA reductions as well as to pass data to the bwd.
  float *red_buffer, *zeroed_red_buffer;

  // The number of instances in the batch.
  int n;
  // The height and width of each activation map. The number of channels.
  int64_t h, w, c, hw, hwc;
  // The number of groups.
  int groups;
  // Do we apply the Swish activation function?
  bool with_swish;

  // Precomputed values and parameters to control the execution of the kernels.

  // The number of batch instances per block.
  int instances_per_block;
  // The number of activations computed per block.
  int acts_per_block;
  // The number of groups in each block.
  int groups_per_block;
  // The number of channels per group = c / groups.
  int channels_per_group;
  // The number of channels per block = groups_per_block * channels_per_group.
  int channels_per_block;
  // The inverse of hwc in floats (to compute mean/var).
  float inv_hwc_per_group;
  // IO precision
  PrecisionMode precision;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_fwd_two_passes_setup(Group_norm_nhwc_fwd_params&, size_t& red_buffer_elts);

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_fwd_two_passes_sum(const Group_norm_nhwc_fwd_params&, cudaStream_t);

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_fwd_two_passes_scale(const Group_norm_nhwc_fwd_params&, cudaStream_t);

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Group_norm_nhwc_bwd_params {
  // The output buffer. Layout NHWC.
  void* dx;
  // The output buffer. Layout NHWC.
  void* dgamma;
  // The output buffer. Layout NHWC.
  void* dbeta;
  // The input buffer. Layout NHWC.
  const void* dy;
  // The input buffer. Layout NHWC.
  const void* x;
  // The gamma scaling factor.
  const void* gamma;
  // The beta term to add in GN.
  const void* beta;
  // The sums from the fwd pass.
  const float2* sums;
  // The constant epsilon for sqrt(var + epsilon).
  float epsilon;
  // The barriers for the persistent kernel.
  int* barriers;
  // The extra storage for multi-CTA reductions as well as to pass data to the bwd.
  float *red_buffer, *zeroed_red_buffer;

  // The number of instances in the batch.
  int n;
  // The height and width of each activation map. The number of channels.
  int64_t h, w, c, hw, hwc;
  // The number of groups.
  int groups;
  // Do we apply the Swish activation function?
  bool with_swish;

  // Precomputed values and parameters to control the execution of the kernels.

  // The number of batch instances per block.
  int instances_per_block;
  // The number of activations computed per block.
  int acts_per_block;
  // The number of groups in each block.
  int groups_per_block;
  // The number of channels per group = c / groups.
  int channels_per_group;
  // The number of channels per block = groups_per_block * channels_per_group.
  int channels_per_block;
  // The inverse of hwc in floats (to compute mean/var).
  float inv_hwc_per_group;
  // IO precision
  PrecisionMode precision;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_bwd_two_passes_setup(Group_norm_nhwc_bwd_params&, size_t& red_buffer_elts);

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_bwd_two_passes_sum(const Group_norm_nhwc_bwd_params&, cudaStream_t);

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_bwd_two_passes_scale(const Group_norm_nhwc_bwd_params&, cudaStream_t);

////////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_bwd_one_pass.h
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#include <assert.h>

#include <algorithm>

#include "group_norm_nhwc.h"
#include "macros.h"
#include "traits.h"

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// B A C K W A R D
//
////////////////////////////////////////////////////////////////////////////////////////////////////

#define GN_BWD_SELECT(FUNC_POSTFIX, function)                                                    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(4, FUNC_POSTFIX, function)     \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(8, FUNC_POSTFIX, function)     \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(10, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(12, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(14, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(16, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(20, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(26, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(24, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(28, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(30, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(32, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(40, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(42, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(48, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(56, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(60, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(64, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(70, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(80, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(84, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(96, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(98, FUNC_POSTFIX, function)    \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(112, FUNC_POSTFIX, function)   \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(120, FUNC_POSTFIX, function)   \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(128, FUNC_POSTFIX, function)   \
  GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(160, FUNC_POSTFIX, function) { \
    assert(false && "Not implemented");                                                          \
  }

////////////////////////////////////////////////////////////////////////////////////////////////////

#define GN_BWD_RUNNER_SELECT(function) GN_BWD_SELECT(_run, function)

#define GN_BWD_BLOCKS_PER_SM_SELECT(function) GN_BWD_SELECT(_blocks_per_sm, function)

////////////////////////////////////////////////////////////////////////////////////////////////////

GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 4)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 8)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 10)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 12)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 14)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 16)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 20)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 26)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 24)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 28)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 30)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 32)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 40)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 42)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 48)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 56)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 60)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 64)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 70)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 80)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 84)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 96)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 98)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 112)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 120)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 128)
GN_BWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 160)

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_bwd_one_pass_setup(Group_norm_nhwc_bwd_params& params, size_t& barriers_elts,
                                        size_t& red_buffer_elts, size_t& zeroed_red_buffer_elts, dim3& grid,
                                        const cudaDeviceProp& props) {
  // The pre-computed dimensions.
  params.hw = params.h * params.w;
  params.hwc = params.c * params.hw;

  // The number of channels per group.
  params.channels_per_group = params.c / params.groups;
  // The inverse to compute the mean/variance.
  params.inv_hwc_per_group = 1.f / (float)(params.hw * params.channels_per_group);

  // Define how many activations are computed per block.
  if ((params.hw >= 1024 && params.channels_per_group >= 80) ||
      (params.hw >= 256 && params.channels_per_group >= 160)) {
    params.acts_per_block = 8 * 16;
  } else if (params.hw >= 512) {
    params.acts_per_block = 32 * 16;
  } else if (params.hw >= 256) {
    params.acts_per_block = 16 * 16;
  } else if (params.hw >= 128) {
    params.acts_per_block = 8 * 16;
  } else if (params.hw > 0) {
    params.acts_per_block = 8 * 8;
  } else {
    // We should never be here if params are set correctly.
    assert(false);
  }

  // Define the number of blocks per activation map. TODO: Make sure it matches the kernel sizes.
  int blocks_per_slice = div_up(params.hw, params.acts_per_block);

  // Select the kernel.
  using Function_t = int (*)();

  Function_t blocks_per_sm_function;
  GN_BWD_BLOCKS_PER_SM_SELECT(blocks_per_sm_function);
  // The number of blocks that can be run per SM.
  int blocks_per_sm = blocks_per_sm_function();

  // The number of blocks per grid.
  int max_blocks_per_grid = blocks_per_sm * props.multiProcessorCount;

  // Make sure we are safe to run that many blocks
  assert(blocks_per_slice <= max_blocks_per_grid);

  // The number of blocks per slice is the X dimension of the grid.
  grid.x = blocks_per_slice;
  // The number of groups *  is the X dimension of the grid.
  grid.y = std::min(max_blocks_per_grid / blocks_per_slice, params.groups * params.n);

  // The number of barriers.
  barriers_elts = blocks_per_slice > 1 ? grid.y * 2 : 0;

  // Add 1 for the final conversion for dgamma/dbeta.
  barriers_elts += 1;

  // The number of elements in the reduction buffer (for the sums and sums of squared).
  if (blocks_per_slice == 1) {
    red_buffer_elts = 0;
  } else {
    // The first 2 is for double-buffering. The 2nd one is for the fact that we have two floats.
    red_buffer_elts = 2 * grid.x * grid.y * 2;
  }

  // The number of elements in the buffer that has to be zeroed.
  zeroed_red_buffer_elts = params.c * 2;

  // Make sure a group does not span multiple blocks.
  assert(params.channels_per_block % params.channels_per_group == 0);
}

inline void group_norm_nhwc_bwd_one_pass_run(const Group_norm_nhwc_bwd_params& params, const dim3& grid,
                                             cudaStream_t stream) {
  using Function_t = void (*)(const Group_norm_nhwc_bwd_params&, const dim3&, cudaStream_t);

  Function_t runner;
  GN_BWD_RUNNER_SELECT(runner);

  runner(params, grid, stream);
}


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_bwd_one_pass_kernel.cuh
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#include <assert.h>

#include <cub/cub.cuh>

#include "group_norm_nhwc.h"
#include "traits.h"

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// B A C K W A R D
//
////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Traits_, int ACTS_PER_BLOCK_, int CHANNELS_PER_GROUP_, int THREADS_PER_BLOCK_>
__global__ __launch_bounds__(THREADS_PER_BLOCK_) void group_norm_nhwc_bwd_one_pass_kernel(
    Group_norm_nhwc_bwd_params params) {
  // The IO traits.
  using Traits = Traits_;
  // The IO traits.
  using IOTraits = typename Traits::IOTraits;
  // The Weights traits.
  using WTraits = typename Traits::WTraits;

  // The IO type
  using IOType = typename IOTraits::Type;
  // The IO doubled type
  using IOType2 = typename IOTraits::Type2;

  // Weights type
  using WType = typename WTraits::Type;
  // Weights doubled type
  using WType2 = typename WTraits::Type2;

  // The number of activations per block.
  constexpr int ACTS_PER_BLOCK = ACTS_PER_BLOCK_;
  // The number of channels per group.
  constexpr int CHANNELS_PER_GROUP = CHANNELS_PER_GROUP_;
  // The number of threads per block.
  constexpr int THREADS_PER_BLOCK = THREADS_PER_BLOCK_;
  // The number of channels per thread (load fp16x2 numbers).
  constexpr int CHANNELS_PER_THREAD = 2;

  // The number of threads needed per activation.
  constexpr int THREADS_PER_ACT = CHANNELS_PER_GROUP / CHANNELS_PER_THREAD;
  // The number of activations that are loaded per loop.
  constexpr int ACTS_PER_LOOP = THREADS_PER_BLOCK / THREADS_PER_ACT;
  // The number of rows per thread.
  constexpr int ACTS_PER_THREAD = (ACTS_PER_BLOCK + ACTS_PER_LOOP - 1) / ACTS_PER_LOOP;

  // The number of active threads.
  constexpr int ACTIVE_THREADS = THREADS_PER_BLOCK / THREADS_PER_ACT * THREADS_PER_ACT;

  // The object in charge of doing the sums for the block.
  typedef cub::BlockReduce<float2, THREADS_PER_BLOCK> Block_reduce;
  // Allocate shared memory for Block_reduce.
  __shared__ typename Block_reduce::TempStorage temp_storage;
  // Allocate shared memory to store the sums.
  __shared__ float2 smem_sums;
  // Allocate shared memory to store the gamma/beta gradients.
  __shared__ float4 smem_dgamma_dbeta[THREADS_PER_BLOCK];

  // Shared memory to store the gradients for gamma and beta.

  // The first activation loaded by that thread.
  int hwi = blockIdx.x * params.acts_per_block + threadIdx.x / THREADS_PER_ACT;
  // The first channel loaded by that thread.
  int ci = threadIdx.x % THREADS_PER_ACT * CHANNELS_PER_THREAD;

  // Is it an active thread?
  const bool is_active = threadIdx.x < ACTIVE_THREADS;

  // Iterate over the iterms in the batch.
  for (int ngi = blockIdx.y, step = 0; ngi < params.n * params.groups; ngi += gridDim.y, ++step) {
    // The instance and the group. TODO: Use fast divmod?
    int ni = ngi / params.groups;
    int gi = ngi % params.groups;

    // The sums from the fwd pass.
    float2 fwd = params.sums[ngi];
    // The mean of X (computed during the fwd pass -- one value per batch*group).
    float x_mean = fwd.x;
    // The mean of squares of X (computed during the fwd pass -- one value per batch*group).
    float x_sq_mean = fwd.y;
    // The variance.
    float x_var = x_sq_mean - x_mean * x_mean;
    // The reciprocal of the standard deviation (i.e. 1.f / sqrt(var + epsilon)).
    float rcp_x_stddev = x_var <= 0.f ? 1.f : 1.f / sqrtf(x_var + params.epsilon);

    // The offset to the first activation loaded by that thread.
    const int64_t offset = (int64_t)ni * params.hwc + gi * CHANNELS_PER_GROUP + ci;
    // The pointer to the first activation loaded by that thread.
    const IOType* x_ptr = &reinterpret_cast<const IOType*>(params.x)[offset];
    // The pointer to the first gradient loaded by that thread.
    const IOType* dy_ptr = &reinterpret_cast<const IOType*>(params.dy)[offset];

    // Load the X and dY into registers.
    IOType2 x[ACTS_PER_THREAD], dy[ACTS_PER_THREAD];
#pragma unroll
    for (int ii = 0; ii < ACTS_PER_THREAD; ++ii) {
      int hwj = hwi + ii * ACTS_PER_LOOP;
      x[ii] = IOTraits::zero();
      dy[ii] = IOTraits::zero();
      if (is_active && hwj < params.hw) {
        x[ii] = *reinterpret_cast<const IOType2*>(&x_ptr[hwj * params.c]);
        dy[ii] = *reinterpret_cast<const IOType2*>(&dy_ptr[hwj * params.c]);
      }
    }

    // Load gamma as well.
    float2 gamma_f2 = make_float2(0.f, 0.f);
    float2 beta_f2 = make_float2(0.f, 0.f);
    if (is_active) {
      gamma_f2 = WTraits::unpack(*reinterpret_cast<const WType2*>(
          &reinterpret_cast<const WType*>(params.gamma)[gi * CHANNELS_PER_GROUP + ci]));
      if (params.with_swish) {
        beta_f2 = WTraits::unpack(*reinterpret_cast<const WType2*>(
            &reinterpret_cast<const WType*>(params.beta)[gi * CHANNELS_PER_GROUP + ci]));
      }
    }

    // Gradients for gamma and beta (for this particular group).
    float4 dgamma_dbeta = make_float4(0.f, 0.f, 0.f, 0.f);
    // Accumulated gradients for dgrad calculation.
    float mean_1 = 0.f, mean_2 = 0.f;

// Compute the sum and the sum of squares for each thread.
#pragma unroll
    for (int ii = 0; ii < ACTS_PER_THREAD; ++ii) {
      // Convert x to float.
      float2 x_f2 = IOTraits::unpack(x[ii]);
      // Convert dY to float.
      float2 dy_f2 = IOTraits::unpack(dy[ii]);

      // X - X_mean.
      float x_minus_x_mean_x = x_f2.x - x_mean;
      float x_minus_x_mean_y = x_f2.y - x_mean;

      // Normalize X.
      float x_norm_x = x_minus_x_mean_x * rcp_x_stddev;
      float x_norm_y = x_minus_x_mean_y * rcp_x_stddev;

      if (params.with_swish) {
        float x_gn_x = x_norm_x * gamma_f2.x + beta_f2.x;
        float x_gn_y = x_norm_y * gamma_f2.y + beta_f2.y;
        float s_x = sigmoid(x_gn_x);
        float s_y = sigmoid(x_gn_y);
        dy_f2.x = dy_f2.x * s_x * (1.f + x_gn_x * (1.f - s_x));
        dy_f2.y = dy_f2.y * s_y * (1.f + x_gn_y * (1.f - s_y));
      }

      // Update beta.
      dgamma_dbeta.z += dy_f2.x;
      dgamma_dbeta.w += dy_f2.y;

      // Update dgamma.
      dgamma_dbeta.x += dy_f2.x * x_norm_x;
      dgamma_dbeta.y += dy_f2.y * x_norm_y;

      // The gradient that enters the x_norm node.
      float dx_norm_x = dy_f2.x * gamma_f2.x;
      float dx_norm_y = dy_f2.y * gamma_f2.y;

      // Add to the 1st mean.
      mean_1 += dx_norm_x * x_norm_x;
      mean_1 += dx_norm_y * x_norm_y;

      // Add to the 2nd mean.
      mean_2 += dx_norm_x;
      mean_2 += dx_norm_y;
    }

    // Pack valid gradients.
    float2 sums = make_float2(0.f, 0.f);
    if (ACTIVE_THREADS == THREADS_PER_BLOCK || is_active) {
      sums = make_float2(mean_1, mean_2);
    }

    // Store dgamma and dbeta to shared memory.
    smem_dgamma_dbeta[threadIdx.x] = dgamma_dbeta;

    // Compute the sums for the block.
    sums = Block_reduce(temp_storage).Reduce(sums, [](const float2& a, const float2& b) {
      return make_float2(a.x + b.x, a.y + b.y);
    });

    // Make sure we can read gamma/beta from smemory. Block_reduce uses one syncthread already.
    __syncthreads();

    // Compute gamma/beta for the block.
    if (threadIdx.x < THREADS_PER_ACT) {
      for (int ii = 1; ii < ACTS_PER_LOOP; ++ii) {
        float4 other = smem_dgamma_dbeta[threadIdx.x + ii * THREADS_PER_ACT];
        dgamma_dbeta.x += other.x;
        dgamma_dbeta.y += other.y;
        dgamma_dbeta.z += other.z;
        dgamma_dbeta.w += other.w;
      }
    }

    // The position in the channel dimension - 2 channels per thread.
    int cj = gi * THREADS_PER_ACT + threadIdx.x;
    // The reduction buffer dfor gamma/dbeta.
    float* red_buffer_dgamma_dbeta = &params.zeroed_red_buffer[cj];

    // The first threads store their gradients for gamma/beta.
    if (threadIdx.x < THREADS_PER_ACT) {
      atomicAdd(&red_buffer_dgamma_dbeta[0 * params.c / 2], dgamma_dbeta.x);
      atomicAdd(&red_buffer_dgamma_dbeta[1 * params.c / 2], dgamma_dbeta.y);
      atomicAdd(&red_buffer_dgamma_dbeta[2 * params.c / 2], dgamma_dbeta.z);
      atomicAdd(&red_buffer_dgamma_dbeta[3 * params.c / 2], dgamma_dbeta.w);
    }

    // The block leader stores to global memory, if needed.
    if (gridDim.x > 1) {
      // The index of the buffer.
      int red_buffer_idx = step & 1;
      // The barrier.
      int* barrier = &params.barriers[red_buffer_idx * gridDim.y + blockIdx.y];
      // The offset to the reduction buffer.
      int red_buffer_offset = red_buffer_idx * gridDim.x * gridDim.y * 2;
      // The reduction buffer.
      float2* red_buffer = reinterpret_cast<float2*>(&params.red_buffer[red_buffer_offset]);

      // The offset to the reduction buffer for dgamma/dbeta.

      // The first thread stores its sums.
      if (threadIdx.x == 0) {
        red_buffer[blockIdx.x * gridDim.y + blockIdx.y] = sums;
      }

      // Make sure the data is in memory.
      if (threadIdx.x == 0) {
        spin_wait_(barrier, (step & 2) ? -1 : 1, (step & 2) ? 0 : gridDim.x);
      }
      __syncthreads();

      // Update the sums.
      for (int ii = 0; ii < gridDim.x; ++ii) {
        if (ii != blockIdx.x && threadIdx.x == 0) {
          float2 other_sums = red_buffer[ii * gridDim.y + blockIdx.y];
          sums.x += other_sums.x;
          sums.y += other_sums.y;
        }
      }
    }

    // Store the result for other threads.
    if (threadIdx.x == 0) {
      smem_sums = sums;
    }

    // Make sure the sums are in shared memory.
    __syncthreads();

    // Read the 1st mean from shared memory.
    mean_1 = smem_sums.x;
    // Read the 2nd mean from shared memory.
    mean_2 = smem_sums.y;

    mean_1 *= params.inv_hwc_per_group;
    mean_2 *= params.inv_hwc_per_group;

    // The pointer to the first activation stored by that thread.
    IOType* dx_ptr = &reinterpret_cast<IOType*>(params.dx)[offset];

    // Iterate over the activations to normalize the activations and store the results.
    for (int ii = 0; ii < ACTS_PER_THREAD; ++ii) {
      // Convert x to float.
      float2 x_f2 = IOTraits::unpack(x[ii]);
      // Convert dY to float.
      float2 dy_f2 = IOTraits::unpack(dy[ii]);

      // X - X_mean.
      float2 x_minus_x_mean_f2;
      x_minus_x_mean_f2.x = x_f2.x - x_mean;
      x_minus_x_mean_f2.y = x_f2.y - x_mean;
      // Normalize X.
      float2 x_norm_f2;
      x_norm_f2.x = x_minus_x_mean_f2.x * rcp_x_stddev;
      x_norm_f2.y = x_minus_x_mean_f2.y * rcp_x_stddev;

      if (params.with_swish) {
        float x_gn_x = x_norm_f2.x * gamma_f2.x + beta_f2.x;
        float x_gn_y = x_norm_f2.y * gamma_f2.y + beta_f2.y;
        float s_x = sigmoid(x_gn_x);
        float s_y = sigmoid(x_gn_y);
        dy_f2.x = dy_f2.x * s_x * (1.f + x_gn_x * (1.f - s_x));
        dy_f2.y = dy_f2.y * s_y * (1.f + x_gn_y * (1.f - s_y));
      }

      // The gradient that enters the x_norm node.
      float2 dx_norm;
      dx_norm.x = dy_f2.x * gamma_f2.x;
      dx_norm.y = dy_f2.y * gamma_f2.y;

      // The gradient along the input path.
      float2 dx;
      dx.x = (dx_norm.x - (x_norm_f2.x * mean_1 + mean_2)) * rcp_x_stddev;
      dx.y = (dx_norm.y - (x_norm_f2.y * mean_1 + mean_2)) * rcp_x_stddev;

      // Store the scaled values.
      int hwj = hwi + ii * ACTS_PER_LOOP;
      if (is_active && hwj < params.hw) {
        *reinterpret_cast<IOType2*>(&dx_ptr[hwj * params.c]) = IOTraits::pack(dx);
      }
    }
  }

  // The completion barrier.
  int* barrier = &params.barriers[gridDim.x == 1 ? 0 : gridDim.y * 2];

  // Mark the completion of the threadblock.
  if (threadIdx.x == 0) {
    asm volatile("red.release.gpu.global.add.s32 [%0], 1;" ::"l"(barrier));
  }

  // Exit if that's not the last thread block.
  if (blockIdx.x != gridDim.x - 1 || blockIdx.y != gridDim.y - 1) {
    return;
  }

  // Busy wait. We could use found = old + step with old = atomicAdd(...) but it's not faster.
  if (threadIdx.x == 0) {
    for (int found = -1; found != gridDim.x * gridDim.y;) {
      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];" : "=r"(found) : "l"(barrier));
    }
  }
  __syncthreads();

  // The last block converts dgamma and dbeta to half.
  for (int idx = threadIdx.x; idx < params.c / 2; idx += THREADS_PER_BLOCK) {
    // Load dgamma.
    float2 dgamma;
    dgamma.x = params.zeroed_red_buffer[idx + 0 * params.c / 2];
    dgamma.y = params.zeroed_red_buffer[idx + 1 * params.c / 2];

    // Load dbeta.
    float2 dbeta;
    dbeta.x = params.zeroed_red_buffer[idx + 2 * params.c / 2];
    dbeta.y = params.zeroed_red_buffer[idx + 3 * params.c / 2];

    // Store to global memory.
    *reinterpret_cast<WType2*>(&reinterpret_cast<WType*>(params.dgamma)[idx * 2]) = WTraits::pack(dgamma);
    *reinterpret_cast<WType2*>(&reinterpret_cast<WType*>(params.dbeta)[idx * 2]) = WTraits::pack(dbeta);
  }
}

//////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_bwd_two_pass.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#include <assert.h>

#include <cub/cub.cuh>

#include "group_norm_nhwc.h"
#include "macros.h"
#include "traits.h"

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// B A C K W A R D
//
////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Traits_, int THREADS_PER_BLOCK>
__global__ void group_norm_nhwc_bwd_sum_kernel(Group_norm_nhwc_bwd_params params) {
  // The IO traits.
  using Traits = Traits_;
  // The IO traits.
  using IOTraits = typename Traits::IOTraits;
  // The Weights traits.
  using WTraits = typename Traits::WTraits;

  // The IO type
  using IOType = typename IOTraits::Type;
  // The IO doubled type
  using IOType2 = typename IOTraits::Type2;

  // Weights type
  using WType = typename WTraits::Type;
  // Weights doubled type
  using WType2 = typename WTraits::Type2;

  // The object in charge of doing the sums for the different blocks.
  typedef cub::BlockScan<Group_sums, THREADS_PER_BLOCK> Block_scan;

  // Allocate shared memory for Block_scan.
  __shared__ typename Block_scan::TempStorage temp_storage;
  // Allocate shared memory for the groups. We could reduce the amount of shared memory reserved.
  __shared__ float2 smem[THREADS_PER_BLOCK];

  // The instance in the batch.
  int ni = blockIdx.z;
  // The channel loaded by that thread (2 channels per thread for F16x2).
  int ci = blockIdx.x * params.channels_per_block + threadIdx.x * 2;
  // The group that thread works on and the channel in the group (modulus).
  int gi = ci / params.channels_per_group;

  // The sums from the fwd pass.
  float2 fwd = params.sums[ni * params.groups + gi];
  // The mean of X (computed during the fwd pass -- one value per batch*group).
  float x_mean = fwd.x;
  // The mean of squares of X (computed during the fwd pass -- one value per batch*group).
  float x_sq_mean = fwd.y;
  // The variance.
  float x_var = x_sq_mean - x_mean * x_mean;
  // The reciprocal of the standard deviation (i.e. 1.f / sqrt(var + epsilon)).
  float rcp_x_stddev = x_var <= 0.f ? 1.f : 1.f / sqrtf(x_var + params.epsilon);

  // Load gamma.
  float2 gamma_f2 = make_float2(0.f, 0.f);
  float2 beta_f2 = make_float2(0.f, 0.f);
  if (ci < params.c) {
    gamma_f2 = WTraits::unpack(*reinterpret_cast<const WType2*>(&reinterpret_cast<const WType*>(params.gamma)[ci]));
    if (params.with_swish) {
      beta_f2 = WTraits::unpack(*reinterpret_cast<const WType2*>(&reinterpret_cast<const WType*>(params.beta)[ci]));
    }
  }

  // The group that thread works on and the channel in the group (modulus).
  int gj = threadIdx.x * 2 / params.channels_per_group;
  int cj = threadIdx.x * 2 - params.channels_per_group * gj;

  // The first activation loaded by that block.
  int hw_begin = blockIdx.y * params.acts_per_block;
  // The last activation loaded by that block.
  int hw_end = min((int64_t)hw_begin + params.acts_per_block, params.hw);

  // The gradients for gamma/beta.
  float2 dgamma = make_float2(0.f, 0.f), dbeta = make_float2(0.f, 0.f);
  // Accumulated gradients for dgrad calculation
  float mean_1 = 0.f, mean_2 = 0.f;

  // Iterate over the activations to compute the sums.
  for (int hwi = hw_begin; hwi < hw_end; ++hwi) {
    // The offset.
    int64_t offset = (int64_t)ni * params.hwc + hwi * params.c + ci;

    // Fetch two channels per thread.
    IOType2 x_v2 = IOTraits::zero();
    IOType2 dy_v2 = IOTraits::zero();
    if (ci < params.c) {
      x_v2 = *reinterpret_cast<const IOType2*>(&reinterpret_cast<const IOType*>(params.x)[offset]);
      dy_v2 = *reinterpret_cast<const IOType2*>(&reinterpret_cast<const IOType*>(params.dy)[offset]);
    }

    // Extract the two half values.
    float2 x_f2 = IOTraits::unpack(x_v2);
    float2 dy_f2 = IOTraits::unpack(dy_v2);

    // X - X_mean.
    float x_minus_x_mean_x = x_f2.x - x_mean;
    float x_minus_x_mean_y = x_f2.y - x_mean;

    // Normalize X.
    float x_norm_x = x_minus_x_mean_x * rcp_x_stddev;
    float x_norm_y = x_minus_x_mean_y * rcp_x_stddev;

    if (params.with_swish) {
      float x_gn_x = x_norm_x * gamma_f2.x + beta_f2.x;
      float x_gn_y = x_norm_y * gamma_f2.y + beta_f2.y;
      float s_x = sigmoid(x_gn_x);
      float s_y = sigmoid(x_gn_y);
      dy_f2.x = dy_f2.x * s_x * (1.f + x_gn_x * (1.f - s_x));
      dy_f2.y = dy_f2.y * s_y * (1.f + x_gn_y * (1.f - s_y));
    }

    // Update beta.
    dbeta.x += dy_f2.x;
    dbeta.y += dy_f2.y;

    // Update dgamma.
    dgamma.x += dy_f2.x * x_norm_x;
    dgamma.y += dy_f2.y * x_norm_y;

    // The gradient that enters the x_norm node.
    float dx_norm_x = dy_f2.x * gamma_f2.x;
    float dx_norm_y = dy_f2.y * gamma_f2.y;

    // Add to the 1st mean.
    mean_1 += dx_norm_x * x_norm_x;
    mean_1 += dx_norm_y * x_norm_y;

    // Add to the 2nd mean.
    mean_2 += dx_norm_x;
    mean_2 += dx_norm_y;
  }

  // The data for the summations.
  Group_sums inp{cj == 0 ? 1 : 0, mean_1, mean_2};

  // Do the segmented scan.
  Group_sums out;
  Block_scan(temp_storage).InclusiveScan(inp, out, Group_sums_op());

  // Store the results for the groups in shared memory (to produce coalesced stores later).
  if (cj == params.channels_per_group - 2 /* 2 channels per thread */) {
    smem[gj] = make_float2(out.sum, out.sum_sq);
  }

  // Make sure the data is in shared memory.
  __syncthreads();

  // The global group index.
  int gk = blockIdx.x * params.groups_per_block + threadIdx.x;

  // The first threads (those storing to global memory, load the values).
  float2 sums = smem[threadIdx.x];

  // Store to global memory.
  if (threadIdx.x < params.groups_per_block && gk < params.groups) {
    atomicAdd(&params.zeroed_red_buffer[(2 * ni + 0) * params.groups + gk], sums.x);
    atomicAdd(&params.zeroed_red_buffer[(2 * ni + 1) * params.groups + gk], sums.y);
  }

  // The base pointer for the gradients for gamma and beta.
  float* dgamma_beta_ptr = &params.zeroed_red_buffer[params.n * params.groups * 2];

  // The 1st channel in the output tensor. NOTE: Two channels per thread store interleaved.
  int ck = blockIdx.x * params.channels_per_block + threadIdx.x;

  // Store dgamma and dbeta as well.
  if (ck < params.c) {
    atomicAdd(&dgamma_beta_ptr[0 * params.c + 0 * blockDim.x + ck], dgamma.x);
    atomicAdd(&dgamma_beta_ptr[0 * params.c + 1 * blockDim.x + ck], dgamma.y);
    atomicAdd(&dgamma_beta_ptr[1 * params.c + 0 * blockDim.x + ck], dbeta.x);
    atomicAdd(&dgamma_beta_ptr[1 * params.c + 1 * blockDim.x + ck], dbeta.y);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_bwd_two_passes_setup(Group_norm_nhwc_bwd_params& params, size_t& zeroed_red_buffer_elts) {
  // The pre-computed dimensions.
  params.hw = params.h * params.w;
  params.hwc = params.c * params.hw;

  // The number of channels per group.
  params.channels_per_group = params.c / params.groups;
  // The inverse to compute the mean/variance.
  params.inv_hwc_per_group = 1.f / (float)(params.hw * params.channels_per_group);

  // Define the number of blocks per activation map. That's a simple heuristic.
  int blocks_per_act_slice = 0;
  if (params.c >= 1280) {
    blocks_per_act_slice = 128 / params.n;
  } else if (params.c >= 640) {
    blocks_per_act_slice = 256 / params.n;
  } else {
    blocks_per_act_slice = 512 / params.n;
  }

  // Clamp to at least 1 to avoid divide-by-zero when batch size is large.
  blocks_per_act_slice = max(blocks_per_act_slice, 1);

  // Make sure we launch blocks per activation is no less than activations
  blocks_per_act_slice = min(blocks_per_act_slice, div_up(params.hw, params.n));

  // Define how many activations are computed per block.
  params.acts_per_block = div_up(params.hw, blocks_per_act_slice);

  // The number of channels per block.
  params.channels_per_block = 320;
  // Special case to deal with 30 channels per group.
  if (params.channels_per_block % params.channels_per_group != 0) {
    params.channels_per_block = 240;
  }

  // Special case to deal with 70 channels per group.
  if (params.c == 2240) {
    params.channels_per_block = 280;
  } else if (params.c == 832) {
    params.channels_per_block = 208;
  }

  if (params.c % params.channels_per_block != 0) {
    if (params.c % 512 == 0 && params.c != 1536 && params.c != 3072 && params.c % 448 != 0) {
      params.channels_per_block = 512;
    } else if (params.c % 42 == 0) {
      params.channels_per_block = 336;
    } else if (params.c % 384 == 0) {
      params.channels_per_block = 384;
    } else if (params.c % 256 == 0 && params.c % 448 != 0 && params.c % 392 != 0) {
      params.channels_per_block = 256;
    } else if (params.c % 128 == 0 && params.c % 448 != 0 && params.c % 392 != 0) {
      params.channels_per_block = 128;
    } else if (params.c % 448 == 0 && params.c % 392 != 0) {
      params.channels_per_block = 448;
    } else if (params.c % 392 == 0) {
      params.channels_per_block = 392;
    }
  }

  // The number of groups per block.
  params.groups_per_block = params.channels_per_block / params.channels_per_group;

  // Make sure the number of channels is a multiple of the number of channels per block.
  assert(params.c % params.channels_per_block == 0);
  // Make sure a group does not span multiple blocks.
  assert(params.channels_per_block % params.channels_per_group == 0);

  // The number of elements in the reduction buffer (for the sums and sums of squared).
  zeroed_red_buffer_elts = params.n * params.groups * 2 + params.c * 2;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_bwd_two_passes_sum(const Group_norm_nhwc_bwd_params& params, cudaStream_t stream) {
  // The dimension of the grid.
  dim3 grid;

  // The number of blocks to compute all the channels.
  grid.x = params.c / params.channels_per_block;
  // The number of blocks to compute all the activations in a given instance.
  grid.y = div_up(params.hw, params.acts_per_block);
  // The number of instances.
  grid.z = params.n;

  if (params.precision == PrecisionMode::FP16IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Fp16IOFp16W)
  } else if (params.precision == PrecisionMode::FP16IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Fp16IOBf16W)
  } else if (params.precision == PrecisionMode::FP16IOFP32W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Fp16IOFp32W)
  } else if (params.precision == PrecisionMode::BF16IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Bf16IOFp16W)
  } else if (params.precision == PrecisionMode::BF16IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Bf16IOBf16W)
  } else if (params.precision == PrecisionMode::BF16IOFP32W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Bf16IOFp32W)
  } else if (params.precision == PrecisionMode::FP32IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Fp32IOFp16W)
  } else if (params.precision == PrecisionMode::FP32IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Fp32IOBf16W)
  } else {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_sum_kernel, Fp32IOFp32W)
  }

  // Make sure it launched ok.
  CHECK_CUDA(cudaGetLastError());
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Traits_, int THREADS_PER_BLOCK>
__global__ void group_norm_nhwc_bwd_scale_kernel(Group_norm_nhwc_bwd_params params) {
  // The IO traits.
  using Traits = Traits_;
  // The IO traits.
  using IOTraits = typename Traits::IOTraits;
  // The Weights traits.
  using WTraits = typename Traits::WTraits;

  // The IO type
  using IOType = typename IOTraits::Type;
  // The IO doubled type
  using IOType2 = typename IOTraits::Type2;

  // Weights type
  using WType = typename WTraits::Type;
  // Weights doubled type
  using WType2 = typename WTraits::Type2;

  // The instance in the batch.
  int ni = blockIdx.z;
  // The channel loaded by that thread (2 channels per thread for F16x2).
  int ci = blockIdx.x * params.channels_per_block + threadIdx.x * 2;
  // The group that thread works on and the channel in the group (modulus).
  int gi = ci / params.channels_per_group;

  // Load the gradients for the group.
  float mean_1 = 0.f, mean_2 = 0.f;
  if (gi < params.groups) {
    mean_1 = params.zeroed_red_buffer[(2 * ni + 0) * params.groups + gi];
    mean_2 = params.zeroed_red_buffer[(2 * ni + 1) * params.groups + gi];
  }

  // The sums from the fwd pass.
  float2 fwd = params.sums[ni * params.groups + gi];
  // The mean of X (computed during the fwd pass -- one value per batch*group).
  float x_mean = fwd.x;
  // The mean of squares of X (computed during the fwd pass -- one value per batch*group).
  float x_sq_mean = fwd.y;
  // The variance.
  float x_var = x_sq_mean - x_mean * x_mean;
  // The reciprocal of the standard deviation (i.e. 1.f / sqrt(var + epsilon)).
  float rcp_x_stddev = x_var <= 0.f ? 1.f : 1.f / sqrtf(x_var + params.epsilon);

  // Mutiply by 1/(HWC) to get real mean
  mean_1 *= params.inv_hwc_per_group;
  mean_2 *= params.inv_hwc_per_group;

  // Load gamma.
  float2 gamma_f2 = make_float2(0.f, 0.f);
  float2 beta_f2 = make_float2(0.f, 0.f);
  if (ci < params.c) {
    gamma_f2 = WTraits::unpack(*reinterpret_cast<const WType2*>(&reinterpret_cast<const WType*>(params.gamma)[ci]));
    if (params.with_swish) {
      beta_f2 = WTraits::unpack(*reinterpret_cast<const WType2*>(&reinterpret_cast<const WType*>(params.beta)[ci]));
    }
  }

  // The first activation loaded by that block.
  int hw_begin = blockIdx.y * params.acts_per_block;
  // The last activation loaded by that block.
  int hw_end = min((int64_t)hw_begin + params.acts_per_block, params.hw);

  // Iterate over the activations to compute the sums.
  for (int hwi = hw_begin; hwi < hw_end; ++hwi) {
    // The src/dst offset.
    int64_t offset = (int64_t)ni * params.hwc + hwi * params.c + ci;

    // Fetch two channels per thread.
    IOType2 x_v2 = IOTraits::zero();
    IOType2 dy_v2 = IOTraits::zero();
    if (ci < params.c) {
      x_v2 = *reinterpret_cast<const IOType2*>(&reinterpret_cast<const IOType*>(params.x)[offset]);
      dy_v2 = *reinterpret_cast<const IOType2*>(&reinterpret_cast<const IOType*>(params.dy)[offset]);
    }

    // Extract the two half values.
    float2 x_f2 = IOTraits::unpack(x_v2);
    float2 dy_f2 = IOTraits::unpack(dy_v2);

    // X - X_mean.
    float2 x_minus_x_mean_f2;
    x_minus_x_mean_f2.x = x_f2.x - x_mean;
    x_minus_x_mean_f2.y = x_f2.y - x_mean;

    // Normalize X.
    float2 x_norm_f2;
    x_norm_f2.x = x_minus_x_mean_f2.x * rcp_x_stddev;
    x_norm_f2.y = x_minus_x_mean_f2.y * rcp_x_stddev;

    if (params.with_swish) {
      float x_gn_x = x_norm_f2.x * gamma_f2.x + beta_f2.x;
      float x_gn_y = x_norm_f2.y * gamma_f2.y + beta_f2.y;
      float s_x = sigmoid(x_gn_x);
      float s_y = sigmoid(x_gn_y);
      dy_f2.x = dy_f2.x * s_x * (1.f + x_gn_x * (1.f - s_x));
      dy_f2.y = dy_f2.y * s_y * (1.f + x_gn_y * (1.f - s_y));
    }

    // The gradient that enters the x_norm node.
    float2 dx_norm;
    dx_norm.x = dy_f2.x * gamma_f2.x;
    dx_norm.y = dy_f2.y * gamma_f2.y;

    // The gradient along the input path.
    float2 dx;
    dx.x = (dx_norm.x - (x_norm_f2.x * mean_1 + mean_2)) * rcp_x_stddev;
    dx.y = (dx_norm.y - (x_norm_f2.y * mean_1 + mean_2)) * rcp_x_stddev;

    // Store the scaled values.
    if (ci < params.c) {
      *reinterpret_cast<IOType2*>(&reinterpret_cast<IOType*>(params.dx)[offset]) = IOTraits::pack(dx);
    }
  }

  // Load gamma/beta and convert to half.
  if (blockIdx.y > 0 || blockIdx.z > 0 || ci >= params.c) {
    return;
  }

  // The base pointer for the gradients for gamma and beta.
  float* dgamma_beta_ptr = &params.zeroed_red_buffer[params.n * params.groups * 2];

  // The 1st channel in the output tensor. NOTE: Two channels per thread store interleaved.
  int ck = blockIdx.x * params.channels_per_block + threadIdx.x;

  // Load the FP32 version of dgamma and dbeta.
  float2 dgamma, dbeta;
  if (ck < params.c) {
    dgamma.x = dgamma_beta_ptr[0 * params.c + 0 * blockDim.x + ck];
    dgamma.y = dgamma_beta_ptr[0 * params.c + 1 * blockDim.x + ck];
    dbeta.x = dgamma_beta_ptr[1 * params.c + 0 * blockDim.x + ck];
    dbeta.y = dgamma_beta_ptr[1 * params.c + 1 * blockDim.x + ck];

    // Convert to half2 and store to memory.
    *reinterpret_cast<WType2*>(&reinterpret_cast<WType*>(params.dgamma)[ci]) = WTraits::pack(dgamma);
    *reinterpret_cast<WType2*>(&reinterpret_cast<WType*>(params.dbeta)[ci]) = WTraits::pack(dbeta);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_bwd_two_passes_scale(const Group_norm_nhwc_bwd_params& params, cudaStream_t stream) {
  // The dimension of the grid.
  dim3 grid;

  // The number of blocks to compute all the channels.
  grid.x = params.c / params.channels_per_block;
  // The number of blocks to compute all the activations in a given instance.
  grid.y = div_up(params.hw, params.acts_per_block);
  // The number of instances.
  grid.z = params.n;

  if (params.precision == PrecisionMode::FP16IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Fp16IOFp16W)
  } else if (params.precision == PrecisionMode::FP16IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Fp16IOBf16W)
  } else if (params.precision == PrecisionMode::FP16IOFP32W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Fp16IOFp32W)
  } else if (params.precision == PrecisionMode::BF16IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Bf16IOFp16W)
  } else if (params.precision == PrecisionMode::BF16IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Bf16IOBf16W)
  } else if (params.precision == PrecisionMode::BF16IOFP32W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Bf16IOFp32W)
  } else if (params.precision == PrecisionMode::FP32IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Fp32IOFp16W)
  } else if (params.precision == PrecisionMode::FP32IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Fp32IOBf16W)
  } else {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_bwd_scale_kernel, Fp32IOFp32W)
  }

  // Make sure it launched ok.
  CHECK_CUDA(cudaGetLastError());
}

////////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_fwd_one_pass.h
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#include <assert.h>

#include <algorithm>

#include "group_norm_nhwc.h"
#include "macros.h"
#include "traits.h"

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// F O R W A R D
//
////////////////////////////////////////////////////////////////////////////////////////////////////

#define GN_FWD_SELECT(FUNC_POSTFIX, function)                                                    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(4, FUNC_POSTFIX, function)     \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(8, FUNC_POSTFIX, function)     \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(10, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(12, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(14, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(16, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(20, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(26, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(24, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(28, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(30, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(32, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(40, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(42, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(48, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(56, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(60, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(64, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(70, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(80, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(84, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(96, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(98, FUNC_POSTFIX, function)    \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(112, FUNC_POSTFIX, function)   \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(120, FUNC_POSTFIX, function)   \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(128, FUNC_POSTFIX, function)   \
  GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(160, FUNC_POSTFIX, function) { \
    assert(false && "Not implemented");                                                          \
  }

////////////////////////////////////////////////////////////////////////////////////////////////////

#define GN_FWD_RUNNER_SELECT(function) GN_FWD_SELECT(_run, function)

#define GN_FWD_BLOCKS_PER_SM_SELECT(function) GN_FWD_SELECT(_blocks_per_sm, function)

////////////////////////////////////////////////////////////////////////////////////////////////////

GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 4)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 8)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 10)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 12)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 14)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 16)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 20)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 26)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 24)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 28)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 30)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 32)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 40)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 42)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 48)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 56)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 60)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 64)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 70)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 80)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 84)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 96)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 98)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 112)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 120)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 128)
GN_FWD_ONE_PASS_DECLARATION(/* CHANNELS_PER_GROUP */ 160)

////////////////////////////////////////////////////////////////////////////////////////////////////

inline void group_norm_nhwc_fwd_one_pass_setup(Group_norm_nhwc_fwd_params& params, size_t& barriers_elts,
                                               size_t& red_buffer_elts, dim3& grid, const cudaDeviceProp& props) {
  // The pre-computed dimensions.
  params.hw = params.h * params.w;
  params.hwc = params.c * params.hw;

  // The number of channels per group.
  params.channels_per_group = params.c / params.groups;
  // The inverse to compute the mean/variance.
  params.inv_hwc_per_group = 1.f / (float)(params.hw * params.channels_per_group);

  // Select the kernel.
  using Function_t = int (*)();

  Function_t blocks_per_sm_function;
  GN_FWD_BLOCKS_PER_SM_SELECT(blocks_per_sm_function);

  // Define how many activations are computed per block.
  if (params.hw >= 1024 && params.channels_per_group >= 80 || (params.hw >= 256 && params.channels_per_group >= 160)) {
    params.acts_per_block = 8 * 16;
  } else if (params.hw >= 512) {
    params.acts_per_block = 16 * 32;
  } else if (params.hw >= 256) {
    params.acts_per_block = 16 * 16;
  } else if (params.hw >= 128) {
    params.acts_per_block = 8 * 16;
  } else if (params.hw > 0) {
    params.acts_per_block = 8 * 8;
  } else {
    // We should never be here if params are set correctly.
    assert(false);
  }

  // Define the number of blocks per activation map. TODO: Make sure it matches the kernel sizes.
  int blocks_per_slice = div_up(params.hw, params.acts_per_block);

  // The number of blocks that can be run per SM.
  int blocks_per_sm = blocks_per_sm_function();

  // The number of blocks per grid.
  int max_blocks_per_grid = blocks_per_sm * props.multiProcessorCount;

  // Make sure we are safe to run that many blocks
  assert(blocks_per_slice <= max_blocks_per_grid);

  // The number of blocks per slice is the X dimension of the grid.
  grid.x = blocks_per_slice;
  // The number of groups *  is the X dimension of the grid.
  grid.y = std::min(max_blocks_per_grid / blocks_per_slice, params.groups * params.n);

  // The number of barriers.
  barriers_elts = blocks_per_slice > 1 ? grid.y * 2 : 0;

  // The number of elements in the reduction buffer (for the sums and sums of squared).
  if (blocks_per_slice == 1) {
    red_buffer_elts = 0;
  } else {
    // The first 2 is for double-buffering. The 2nd one is for the fact that we have two floats.
    red_buffer_elts = 2 * grid.x * grid.y * 2;
  }
}

inline void group_norm_nhwc_fwd_one_pass_run(const Group_norm_nhwc_fwd_params& params, const dim3& grid,
                                             cudaStream_t stream) {
  using Function_t = void (*)(const Group_norm_nhwc_fwd_params&, const dim3&, cudaStream_t);

  Function_t runner;
  GN_FWD_RUNNER_SELECT(runner);

  runner(params, grid, stream);
}


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_fwd_one_pass_kernel.cuh
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#include <assert.h>

#include <cub/cub.cuh>

#include "group_norm_nhwc.h"
#include "traits.h"

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// F O R W A R D
//
////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Traits_, int ACTS_PER_BLOCK_, int CHANNELS_PER_GROUP_, int THREADS_PER_BLOCK_>
__global__ __launch_bounds__(THREADS_PER_BLOCK_) void group_norm_nhwc_fwd_one_pass_kernel(
    Group_norm_nhwc_fwd_params params) {
  // The traits.
  using Traits = Traits_;
  // The IO traits.
  using IOTraits = typename Traits::IOTraits;
  // The Weights traits.
  using WTraits = typename Traits::WTraits;

  // The IO type
  using IOType = typename IOTraits::Type;
  // The IO doubled type
  using IOType2 = typename IOTraits::Type2;

  // Weights type
  using WType = typename WTraits::Type;
  // Weights doubled type
  using WType2 = typename WTraits::Type2;

  // The number of activations per block.
  constexpr int ACTS_PER_BLOCK = ACTS_PER_BLOCK_;
  // The number of channels per group.
  constexpr int CHANNELS_PER_GROUP = CHANNELS_PER_GROUP_;
  // The number of threads per block.
  constexpr int THREADS_PER_BLOCK = THREADS_PER_BLOCK_;
  // The number of channels per thread (load fp16x2 numbers).
  constexpr int CHANNELS_PER_THREAD = 2;

  // The number of threads needed per activation.
  constexpr int THREADS_PER_ACT = CHANNELS_PER_GROUP / CHANNELS_PER_THREAD;
  // The number of activations that are loaded per loop.
  constexpr int ACTS_PER_LOOP = THREADS_PER_BLOCK / THREADS_PER_ACT;
  // The number of rows per thread.
  constexpr int ACTS_PER_THREAD = (ACTS_PER_BLOCK + ACTS_PER_LOOP - 1) / ACTS_PER_LOOP;

  // The number of active threads.
  constexpr int ACTIVE_THREADS = THREADS_PER_BLOCK / THREADS_PER_ACT * THREADS_PER_ACT;

  // The object in charge of doing the sums for the block.
  typedef cub::BlockReduce<float2, THREADS_PER_BLOCK> Block_reduce;
  // Allocate shared memory for Block_reduce.
  __shared__ typename Block_reduce::TempStorage temp_storage;
  // Allocate shared memory to store the sums.
  __shared__ float2 smem_sums;

  // The first activation loaded by that thread.
  int hwi = blockIdx.x * params.acts_per_block + threadIdx.x / THREADS_PER_ACT;
  // The first channel loaded by that thread.
  int ci = threadIdx.x % THREADS_PER_ACT * CHANNELS_PER_THREAD;

  // Is it an active thread?
  const bool is_active = threadIdx.x < ACTIVE_THREADS;

  // Iterate over the iterms in the batch.
  for (int ngi = blockIdx.y, step = 0; ngi < params.n * params.groups; ngi += gridDim.y, ++step) {
    // The instance and the group. TODO: Use fast divmod?
    int ni = ngi / params.groups;
    int gi = ngi % params.groups;

    // The offset to the first activation loaded by that thread.
    const int64_t offset = (int64_t)ni * params.hwc + gi * CHANNELS_PER_GROUP + ci;
    // The pointer to the first activation loaded by that thread.
    const IOType* x_ptr = &reinterpret_cast<const IOType*>(params.x)[offset];

    // Load the activations into registers.
    IOType2 x[ACTS_PER_THREAD];
#pragma unroll
    for (int ii = 0; ii < ACTS_PER_THREAD; ++ii) {
      int hwj = hwi + ii * ACTS_PER_LOOP;
      x[ii] = IOTraits::zero();
      if (is_active && hwj < params.hw) {
        x[ii] = *reinterpret_cast<const IOType2*>(&x_ptr[hwj * params.c]);
      }
    }

    // Compute the sum and the sum of squares for each thread.
    float2 sums = make_float2(0.f, 0.f);
#pragma unroll
    for (int ii = 0; ii < ACTS_PER_THREAD; ++ii) {
      float2 f2 = IOTraits::unpack(x[ii]);
      sums.x += f2.x + f2.y;
      sums.y += f2.x * f2.x + f2.y * f2.y;
    }

    // Clear invalid threads.
    if (ACTIVE_THREADS < THREADS_PER_BLOCK && !is_active) {
      sums = make_float2(0.f, 0.f);
    }

    // Compute the sums for the block.
    sums = Block_reduce(temp_storage).Reduce(sums, [](const float2& a, const float2& b) {
      return make_float2(a.x + b.x, a.y + b.y);
    });

    // The block leader stores to global memory, if needed.
    if (gridDim.x > 1) {
      // The index of the buffer (double-buffering).
      int red_buffer_idx = step & 1;
      // The barrier.
      int* barrier = &params.barriers[red_buffer_idx * gridDim.y + blockIdx.y];
      // The offset to the reduction buffer.
      int red_buffer_offset = red_buffer_idx * gridDim.x * gridDim.y * 2;
      // The reduction buffer.
      float2* red_buffer = reinterpret_cast<float2*>(&params.red_buffer[red_buffer_offset]);

      // The first thread stores its sums.
      if (threadIdx.x == 0) {
        red_buffer[blockIdx.x * gridDim.y + blockIdx.y] = sums;
      }

      // Make sure the data is in memory.
      if (threadIdx.x == 0) {
        spin_wait_(barrier, (step & 2) ? -1 : 1, (step & 2) ? 0 : gridDim.x);
      }
      __syncthreads();

      // Update the sums.
      for (int ii = 0; ii < gridDim.x; ++ii) {
        if (ii != blockIdx.x && threadIdx.x == 0) {
          float2 other_sums = red_buffer[ii * gridDim.y + blockIdx.y];
          sums.x += other_sums.x;
          sums.y += other_sums.y;
        }
      }
    }

    // Store the result for other threads.
    if (threadIdx.x == 0) {
      smem_sums = sums;
    }

    // Store the results to global memory as well (for training).
    if (params.sums != nullptr && blockIdx.x == 0 && threadIdx.x == 0) {
      sums.x *= params.inv_hwc_per_group;
      sums.y *= params.inv_hwc_per_group;
      params.sums[ngi] = sums;
    }

    // Make sure the sums are in shared memory.
    __syncthreads();

    // Load gamma/beta.
    float2 gamma_f2 = WTraits::unpack(
        *reinterpret_cast<const WType2*>(&reinterpret_cast<const WType*>(params.gamma)[gi * CHANNELS_PER_GROUP + ci]));
    float2 beta_f2 = WTraits::unpack(
        *reinterpret_cast<const WType2*>(&reinterpret_cast<const WType*>(params.beta)[gi * CHANNELS_PER_GROUP + ci]));

    // Compute the mean.
    float mean = smem_sums.x * params.inv_hwc_per_group;
    // Compute the variance.
    float var = smem_sums.y * params.inv_hwc_per_group - (mean * mean);
    // Compute the inverse of the stddev.
    float inv_stddev = var <= 0.f ? 1.f : rsqrtf(var + params.epsilon);

    // The pointer to the first activation stored by that thread.
    IOType* y_ptr = &reinterpret_cast<IOType*>(params.y)[offset];

    // Iterate over the activations to normalize the activations and store the results.
    for (int ii = 0; ii < ACTS_PER_THREAD; ++ii) {
      // Extract the two half values.
      float2 f2 = IOTraits::unpack(x[ii]);

      // Normalize the channels.
      f2.x = (f2.x - mean) * inv_stddev;
      f2.y = (f2.y - mean) * inv_stddev;

      // Scale by gamma and add beta.
      f2.x = gamma_f2.x * f2.x + beta_f2.x;
      f2.y = gamma_f2.y * f2.y + beta_f2.y;

      // Apply Swish if needed.
      if (params.with_swish) {
        f2.x = f2.x * sigmoid(f2.x);
        f2.y = f2.y * sigmoid(f2.y);
      }

      // Store the scaled values.
      int hwj = hwi + ii * ACTS_PER_LOOP;
      if (is_active && hwj < params.hw) {
        *reinterpret_cast<IOType2*>(&y_ptr[hwj * params.c]) = IOTraits::pack(f2);
      }
    }
  }
}

//////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_fwd_two_pass.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#include <assert.h>

#include <cub/cub.cuh>

#include "group_norm_nhwc.h"
#include "macros.h"
#include "traits.h"

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// F O R W A R D
//
////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Traits_, int THREADS_PER_BLOCK>
__global__ void group_norm_nhwc_fwd_sum_kernel(Group_norm_nhwc_fwd_params params) {
  // The traits.
  using Traits = Traits_;
  // The IO traits.
  using IOTraits = typename Traits::IOTraits;

  // The IO type
  using IOType = typename IOTraits::Type;
  // The IO doubled type
  using IOType2 = typename IOTraits::Type2;

  // The object in charge of doing the sums for the different blocks.
  typedef cub::BlockScan<Group_sums, THREADS_PER_BLOCK> Block_scan;

  // Allocate shared memory for Block_scan.
  __shared__ typename Block_scan::TempStorage temp_storage;
  // Allocate shared memory for the groups. We could reduce the amount of shared memory reserved.
  __shared__ float2 smem[THREADS_PER_BLOCK];

  // The instance in the batch.
  int ni = blockIdx.z;
  // The channel loaded by that thread (2 channels per thread for F16x2).
  int ci = blockIdx.x * params.channels_per_block + threadIdx.x * 2;

  // The first activation loaded by that block.
  int hw_begin = blockIdx.y * params.acts_per_block;
  // The last activation loaded by that block.
  int hw_end = min((int64_t)hw_begin + params.acts_per_block, params.hw);

  // The sums.
  float sum = 0.f, sum_sq = 0.f;

  // Iterate over the activations to compute the sums.
  for (int hwi = hw_begin; hwi < hw_end; ++hwi) {
    // The offset.
    int64_t offset = (int64_t)ni * params.hwc + hwi * params.c + ci;

    // Fetch two channels per thread.
    IOType2 v2 = IOTraits::zero();
    if (ci < params.c) {
      v2 = *reinterpret_cast<const IOType2*>(&reinterpret_cast<const IOType*>(params.x)[offset]);
    }

    // Extract the two values.
    float2 f2 = IOTraits::unpack(v2);

    // Update the sum.
    sum += f2.x + f2.y;
    // Update the sum of squares.
    sum_sq += f2.x * f2.x + f2.y * f2.y;
  }

  // The group that thread works on and the channel in the group (modulus).
  int gj = threadIdx.x * 2 / params.channels_per_group;
  int cj = threadIdx.x * 2 - params.channels_per_group * gj;

  // The data for the summations.
  Group_sums inp{cj == 0 ? 1 : 0, sum, sum_sq};

  // Do the segmented scan.
  Group_sums out;
  Block_scan(temp_storage).InclusiveScan(inp, out, Group_sums_op());

  // Store the results for the groups in shared memory (to produce coalesced stores later).
  if (cj == params.channels_per_group - 2 /* 2 channels per thread */) {
    smem[gj] = make_float2(out.sum, out.sum_sq);
  }

  // Make sure the data is in shared memory.
  __syncthreads();

  // The global group index.
  int gk = blockIdx.x * params.groups_per_block + threadIdx.x;

  // Threads that have nothing left to do, exit.
  if (threadIdx.x >= params.groups_per_block || gk >= params.groups) {
    return;
  }

  // The first threads (those storing to global memory, load the values).
  float2 sums = smem[threadIdx.x];

  // Store to global memory.
  atomicAdd(&params.zeroed_red_buffer[(2 * ni + 0) * params.groups + gk], sums.x);
  atomicAdd(&params.zeroed_red_buffer[(2 * ni + 1) * params.groups + gk], sums.y);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_fwd_two_passes_setup(Group_norm_nhwc_fwd_params& params, size_t& zeroed_red_buffer_elts) {
  // The pre-computed dimensions.
  params.hw = params.h * params.w;
  params.hwc = params.c * params.hw;

  // The number of channels per group.
  params.channels_per_group = params.c / params.groups;
  // The inverse to compute the mean/variance.
  params.inv_hwc_per_group = 1.f / (float)(params.hw * params.channels_per_group);

  // Define the number of blocks per activation map. That's a simple heuristic.
  int blocks_per_act_slice = 0;
  if (params.c >= 1280) {
    blocks_per_act_slice = 128 / params.n;
  } else if (params.c >= 640) {
    blocks_per_act_slice = 256 / params.n;
  } else {
    blocks_per_act_slice = 512 / params.n;
  }

  // Clamp to at least 1 to avoid divide-by-zero when batch size is large.
  blocks_per_act_slice = max(blocks_per_act_slice, 1);

  // Make sure we launch blocks per activation is no less than activations
  blocks_per_act_slice = min(blocks_per_act_slice, div_up(params.hw, params.n));

  // Define how many activations are computed per block.
  params.acts_per_block = div_up(params.hw, blocks_per_act_slice);
  // The number of channels per block.
  params.channels_per_block = 320;
  // Special case to deal with 30 channels per group.
  if (params.channels_per_block % params.channels_per_group != 0) {
    params.channels_per_block = 240;
  }

  // Special case to deal with 70 channels per group.
  if (params.c == 2240) {
    params.channels_per_block = 280;
  } else if (params.c == 832) {
    params.channels_per_block = 208;
  }

  if (params.c % params.channels_per_block != 0) {
    if (params.c % 512 == 0 && params.c != 1536 && params.c != 3072 && params.c % 448 != 0) {
      params.channels_per_block = 512;
    } else if (params.c % 42 == 0) {
      params.channels_per_block = 336;
    } else if (params.c % 384 == 0) {
      params.channels_per_block = 384;
    } else if (params.c % 256 == 0 && params.c % 448 != 0 && params.c % 392 != 0) {
      params.channels_per_block = 256;
    } else if (params.c % 128 == 0 && params.c % 448 != 0 && params.c % 392 != 0) {
      params.channels_per_block = 128;
    } else if (params.c % 448 == 0 && params.c % 392 != 0) {
      params.channels_per_block = 448;
    } else if (params.c % 392 == 0) {
      params.channels_per_block = 392;
    }
  }

  // The number of groups per block.
  params.groups_per_block = params.channels_per_block / params.channels_per_group;

  // Make sure the number of channels is a multiple of the number of channels per block.
  assert(params.c % params.channels_per_block == 0);
  // Make sure a group does not span multiple blocks.
  assert(params.channels_per_block % params.channels_per_group == 0);

  // The number of elements in the reduction buffer (for the sums and sums of squared).
  zeroed_red_buffer_elts = params.n * params.groups * 2;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_fwd_two_passes_sum(const Group_norm_nhwc_fwd_params& params, cudaStream_t stream) {
  // The dimension of the grid.
  dim3 grid;

  // The number of blocks to compute all the channels.
  grid.x = params.c / params.channels_per_block;
  // The number of blocks to compute all the activations in a given instance.
  grid.y = div_up(params.hw, params.acts_per_block);
  // The number of instances.
  grid.z = params.n;

  // Launch the kernel.
  if (params.precision == PrecisionMode::FP16IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Fp16IOFp16W)
  } else if (params.precision == PrecisionMode::FP16IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Fp16IOBf16W)
  } else if (params.precision == PrecisionMode::FP16IOFP32W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Fp16IOFp32W)
  } else if (params.precision == PrecisionMode::BF16IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Bf16IOFp16W)
  } else if (params.precision == PrecisionMode::BF16IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Bf16IOBf16W)
  } else if (params.precision == PrecisionMode::BF16IOFP32W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Bf16IOFp32W)
  } else if (params.precision == PrecisionMode::FP32IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Fp32IOFp16W)
  } else if (params.precision == PrecisionMode::FP32IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Fp32IOBf16W)
  } else {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_sum_kernel, Fp32IOFp32W)
  }

  // Make sure it launched ok.
  CHECK_CUDA(cudaGetLastError());
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Traits_, int THREADS_PER_BLOCK>
__global__ void group_norm_nhwc_fwd_scale_kernel(Group_norm_nhwc_fwd_params params) {
  // The traits.
  using Traits = Traits_;
  // The IO traits.
  using IOTraits = typename Traits::IOTraits;
  // The Weights traits.
  using WTraits = typename Traits::WTraits;

  // The IO type
  using IOType = typename IOTraits::Type;
  // The IO doubled type
  using IOType2 = typename IOTraits::Type2;

  // Weights type
  using WType = typename WTraits::Type;
  // Weights doubled type
  using WType2 = typename WTraits::Type2;

  // The instance in the batch.
  int ni = blockIdx.z;
  // The channel loaded by that thread (2 channels per thread for F16x2).
  int ci = blockIdx.x * params.channels_per_block + threadIdx.x * 2;
  // The group that thread works on and the channel in the group (modulus).
  int gi = ci / params.channels_per_group;

  // Load the sum and sum of squares for the group.
  float sum = 0.f, sum_sq = 0.f;
  if (gi < params.groups) {
    sum = params.zeroed_red_buffer[(2 * ni + 0) * params.groups + gi];
    sum_sq = params.zeroed_red_buffer[(2 * ni + 1) * params.groups + gi];
  }

  // Load gamma/beta.
  float2 gamma_f2, beta_f2;
  if (ci < params.c) {
    gamma_f2 = WTraits::unpack(*reinterpret_cast<const WType2*>(&reinterpret_cast<const WType*>(params.gamma)[ci]));
    beta_f2 = WTraits::unpack(*reinterpret_cast<const WType2*>(&reinterpret_cast<const WType*>(params.beta)[ci]));
  }

  // Compute the mean.
  float mean = sum * params.inv_hwc_per_group;
  // Compute the variance.
  float var = sum_sq * params.inv_hwc_per_group - (mean * mean);
  // Compute the inverse of the stddev.
  float inv_stddev = var <= 0.f ? 1.f : rsqrtf(var + params.epsilon);

  // The first activation loaded by that block.
  int hw_begin = blockIdx.y * params.acts_per_block;
  // The last activation loaded by that block.
  int hw_end = min((int64_t)hw_begin + params.acts_per_block, params.hw);

  // Iterate over the activations to compute the sums.
  for (int hwi = hw_begin; hwi < hw_end; ++hwi) {
    // The src/dst offset.
    int64_t offset = (int64_t)ni * params.hwc + hwi * params.c + ci;

    // Fetch two channels per thread.
    IOType2 v2 = IOTraits::zero();
    if (ci < params.c) {
      v2 = *reinterpret_cast<const IOType2*>(&reinterpret_cast<const IOType*>(params.x)[offset]);
    }

    // Extract the two values.
    float2 f2 = IOTraits::unpack(v2);

    // Normalize the channels.
    f2.x = (f2.x - mean) * inv_stddev;
    f2.y = (f2.y - mean) * inv_stddev;

    // Scale by gamma and add beta.
    f2.x = gamma_f2.x * f2.x + beta_f2.x;
    f2.y = gamma_f2.y * f2.y + beta_f2.y;

    // Apply Swish if needed.
    if (params.with_swish) {
      f2.x = f2.x * sigmoid(f2.x);
      f2.y = f2.y * sigmoid(f2.y);
    }

    // Store the scaled values.
    if (ci < params.c) {
      *reinterpret_cast<IOType2*>(&reinterpret_cast<IOType*>(params.y)[offset]) = IOTraits::pack(f2);
    }
  }

  // Write the sums if needed.
  if (params.sums != nullptr && gi < params.groups) {
    float2 sums;
    sums.x = sum * params.inv_hwc_per_group;
    sums.y = sum_sq * params.inv_hwc_per_group;
    params.sums[ni * params.groups + gi] = sums;
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

void group_norm_nhwc_fwd_two_passes_scale(const Group_norm_nhwc_fwd_params& params, cudaStream_t stream) {
  // The dimension of the grid.
  dim3 grid;

  // The number of blocks to compute all the channels.
  grid.x = params.c / params.channels_per_block;
  // The number of blocks to compute all the activations in a given instance.
  grid.y = div_up(params.hw, params.acts_per_block);
  // The number of instances.
  grid.z = params.n;

  if (params.precision == PrecisionMode::FP16IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Fp16IOFp16W)
  } else if (params.precision == PrecisionMode::FP16IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Fp16IOBf16W)
  } else if (params.precision == PrecisionMode::FP16IOFP32W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Fp16IOFp32W)
  } else if (params.precision == PrecisionMode::BF16IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Bf16IOFp16W)
  } else if (params.precision == PrecisionMode::BF16IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Bf16IOBf16W)
  } else if (params.precision == PrecisionMode::BF16IOFP32W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Bf16IOFp32W)
  } else if (params.precision == PrecisionMode::FP32IOFP16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Fp32IOFp16W)
  } else if (params.precision == PrecisionMode::FP32IOBF16W) {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Fp32IOBf16W)
  } else {
    CALL_TWO_PASS_KERNEL(group_norm_nhwc_fwd_scale_kernel, Fp32IOFp32W)
  }

  // Make sure it launched ok.
  CHECK_CUDA(cudaGetLastError());
}

////////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_10.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 10, /* THREADS_PER_BLOCK */ 640)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_112.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 112, /* THREADS_PER_BLOCK */ 448)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_12.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 12, /* THREADS_PER_BLOCK */ 384)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_120.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 120, /* THREADS_PER_BLOCK */ 480)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_128.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 128, /* THREADS_PER_BLOCK */ 512)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_14.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 14, /* THREADS_PER_BLOCK */ 224)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_16.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 16, /* THREADS_PER_BLOCK */ 256)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_160.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 160, /* THREADS_PER_BLOCK */ 640)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_20.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 20, /* THREADS_PER_BLOCK */ 640)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_24.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 24, /* THREADS_PER_BLOCK */ 384)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_26.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 26, /* THREADS_PER_BLOCK */ 416)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_28.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 28, /* THREADS_PER_BLOCK */ 448)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_30.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 30, /* THREADS_PER_BLOCK */ 480)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_32.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 32, /* THREADS_PER_BLOCK */ 512)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_4.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 4, /* THREADS_PER_BLOCK */ 128)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_40.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 40, /* THREADS_PER_BLOCK */ 640)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_42.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 42, /* THREADS_PER_BLOCK */ 672)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_48.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 48, /* THREADS_PER_BLOCK */ 384)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_56.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 56, /* THREADS_PER_BLOCK */ 448)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_60.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 60, /* THREADS_PER_BLOCK */ 480)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_64.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 64, /* THREADS_PER_BLOCK */ 512)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_70.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 70, /* THREADS_PER_BLOCK */ 560)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_8.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 8, /* THREADS_PER_BLOCK */ 128)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_80.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 80, /* THREADS_PER_BLOCK */ 640)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_84.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 84, /* THREADS_PER_BLOCK */ 672)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_96.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 96, /* THREADS_PER_BLOCK */ 768)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_98.cu
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
#include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
#include "macros.h"

GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 98, /* THREADS_PER_BLOCK */ 392)


================================================
FILE: apex/contrib/csrc/group_norm/group_norm_nhwc_op.cpp
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>

#include "group_norm_nhwc.h"
#include "group_norm_nhwc_bwd_one_pass.h"
#include "group_norm_nhwc_fwd_one_pass.h"

////////////////////////////////////////////////////////////////////////////////////////////////////

#define CHECK_CUDA_STATUS(call)                                                                     \
  do {                                                                                              \
    cudaError_t status_ = call;                                                                     \
    if (status_ != cudaSuccess) {                                                                   \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
      exit(1);                                                                                      \
    }                                                                                               \
  } while (0)

#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CHANNELS_LAST(x) TORCH_CHECK(x.is_contiguous(at::MemoryFormat::ChannelsLast), #x " must be channels last")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
#define CHECK_NHWC_INPUT(x) \
  CHECK_CUDA(x);            \
  CHECK_CHANNELS_LAST(x)

static bool initialized = false;
static cudaDeviceProp props;

const std::unordered_set<int> supported_c_values = {128,  256,  320,  384,  448,  512,  640,  768,
                                                    896,  960,  1024, 1280, 1344, 1536, 1792, 1920,
                                                    2048, 2240, 2560, 2688, 3072, 3136, 3584, 4096};
const std::unordered_set<int> supported_groups_values = {16, 32};

std::vector<torch::Tensor> group_norm_fwd(torch::Tensor input, int groups, torch::Tensor weight, torch::Tensor bias,
                                          float eps, int passes, bool with_swish = false) {
  if (!initialized) {
    CHECK_CUDA_STATUS(cudaGetDeviceProperties(&props, 0));
    initialized = true;
  }
  CHECK_NHWC_INPUT(input);
  auto stream = at::cuda::getCurrentCUDAStream();

  // Achieve group norm arguments
  int n = input.size(0);
  int c = input.size(1);
  int h = input.size(2);
  int w = input.size(3);

  // Check kernel constraints
  TORCH_CHECK(supported_groups_values.count(groups), "`groups` of {16, 32} are only supported but ", groups,
              " is passed");
  TORCH_CHECK(supported_c_values.count(c), "`c` of ", c, " is not included in supported_c_values");

  // Allocate tensors
  auto options = at::TensorOptions(at::kCUDA);
  auto output = at::empty_like(input, at::MemoryFormat::Preserve);
  auto sums_d = at::empty({2 * n * groups}, options.dtype(at::kFloat));

  // Declare the parameters.
  Group_norm_nhwc_fwd_params params_fwd;
  memset(&params_fwd, 0, sizeof(params_fwd));

  // Initialize the parameters.
  params_fwd.y = reinterpret_cast<void*>(output.data_ptr());
  params_fwd.sums = reinterpret_cast<float2*>(sums_d.data_ptr());
  params_fwd.x = const_cast<void*>(reinterpret_cast<void*>(input.data_ptr()));
  params_fwd.gamma = const_cast<void*>(reinterpret_cast<void*>(weight.data_ptr()));
  params_fwd.beta = const_cast<void*>(reinterpret_cast<void*>(bias.data_ptr()));
  params_fwd.epsilon = eps;
  params_fwd.n = n;
  params_fwd.h = h;
  params_fwd.w = w;
  params_fwd.c = c;
  params_fwd.groups = groups;
  params_fwd.with_swish = with_swish;

  PrecisionMode mode;
  if (input.dtype() == torch::kFloat32) {
    if (weight.dtype() == torch::kFloat16) {
      mode = PrecisionMode::FP32IOFP16W;
    } else if (weight.dtype() == torch::kBFloat16) {
      mode = PrecisionMode::FP32IOBF16W;
    } else {
      mode = PrecisionMode::FP32IOFP32W;
    }
  } else if (input.dtype() == torch::kBFloat16) {
    if (weight.dtype() == torch::kFloat16) {
      mode = PrecisionMode::BF16IOFP16W;
    } else if (weight.dtype() == torch::kBFloat16) {
      mode = PrecisionMode::BF16IOBF16W;
    } else {
      mode = PrecisionMode::BF16IOFP32W;
    }
  } else {
    if (weight.dtype() == torch::kFloat16) {
      mode = PrecisionMode::FP16IOFP16W;
    } else if (weight.dtype() == torch::kBFloat16) {
      mode = PrecisionMode::FP16IOBF16W;
    } else {
      mode = PrecisionMode::FP16IOFP32W;
    }
  }
  params_fwd.precision = mode;

  // The number of barriers.
  size_t barriers_elts = 0;
  // The number of elements in the reduction buffer.
  size_t red_buffer_elts = 0;
  // The number of elements in the reduction buffer that must be zeroed.
  size_t zeroed_red_buffer_elts = 0;

  // Finalize the parameters.
  dim3 grid;
  if (passes == 1) {
    group_norm_nhwc_fwd_one_pass_setup(params_fwd, barriers_elts, red_buffer_elts, grid, props);
  } else {
    group_norm_nhwc_fwd_two_passes_setup(params_fwd, zeroed_red_buffer_elts);
  }

  // Allocate on the device.
  auto red_buffer = at::empty({red_buffer_elts}, options.dtype(at::kFloat));
  params_fwd.red_buffer = red_buffer.data_ptr<float>();

  // Allocate the buffer if needed.
  auto barriers = at::zeros({barriers_elts}, options.dtype(at::kInt));
  params_fwd.barriers = barriers.data_ptr<int>();
  auto zeroed_red_buffer = at::zeros({zeroed_red_buffer_elts}, options.dtype(at::kFloat));
  params_fwd.zeroed_red_buffer = zeroed_red_buffer.data_ptr<float>();

  if (passes == 1) {
    group_norm_nhwc_fwd_one_pass_run(params_fwd, grid, stream);
  } else {
    group_norm_nhwc_fwd_two_passes_sum(params_fwd, stream);
    group_norm_nhwc_fwd_two_passes_scale(params_fwd, stream);
  }

  return {output, sums_d};
}

std::vector<torch::Tensor> group_norm_bwd(torch::Tensor grad_output, torch::Tensor sums, torch::Tensor input,
                                          int groups, torch::Tensor weight, torch::Tensor bias, float eps, int passes,
                                          bool with_swish = false) {
  if (!initialized) {
    CHECK_CUDA_STATUS(cudaGetDeviceProperties(&props, 0));
    initialized = true;
  }
  CHECK_NHWC_INPUT(grad_output);
  auto stream = at::cuda::getCurrentCUDAStream();

  // Achieve group norm arguments
  int n = input.size(0);
  int c = input.size(1);
  int h = input.size(2);
  int w = input.size(3);

  // Check kernel constraints
  TORCH_CHECK(supported_groups_values.count(groups), "`groups` of {16, 32} are only supported but ", groups,
              " is passed");
  TORCH_CHECK(supported_c_values.count(c), "`c` of ", c, " is not included in supported_c_values");

  // Allocate tensors
  auto options = at::TensorOptions(at::kCUDA);
  auto grad_input = at::empty_like(input, at::MemoryFormat::Preserve);
  auto grad_weight = at::empty_like(weight, at::MemoryFormat::Preserve);
  auto grad_bias = at::empty_like(bias, at::MemoryFormat::Preserve);
  auto sums_d = at::empty({2 * n * groups}, options.dtype(at::kFloat));

  // Declare the parameters.
  Group_norm_nhwc_bwd_params params_bwd;
  memset(&params_bwd, 0, sizeof(params_bwd));

  // Initialize the parameters.
  params_bwd.dx = reinterpret_cast<void*>(grad_input.data_ptr());
  params_bwd.dgamma = reinterpret_cast<void*>(grad_weight.data_ptr());
  params_bwd.dbeta = reinterpret_cast<void*>(grad_bias.data_ptr());
  params_bwd.sums = const_cast<float2*>(reinterpret_cast<float2*>(sums.data_ptr()));
  params_bwd.dy = const_cast<void*>(reinterpret_cast<void*>(grad_output.data_ptr()));
  params_bwd.x = const_cast<void*>(reinterpret_cast<void*>(input.data_ptr()));
  ;
  params_bwd.gamma = const_cast<void*>(reinterpret_cast<void*>(weight.data_ptr()));
  params_bwd.beta = const_cast<void*>(reinterpret_cast<void*>(bias.data_ptr()));
  ;
  params_bwd.epsilon = eps;
  params_bwd.n = n;
  params_bwd.h = h;
  params_bwd.w = w;
  params_bwd.c = c;
  params_bwd.groups = groups;
  params_bwd.with_swish = with_swish;

  PrecisionMode mode;
  if (input.dtype() == torch::kFloat32) {
    if (weight.dtype() == torch::kFloat16) {
      mode = PrecisionMode::FP32IOFP16W;
    } else if (weight.dtype() == torch::kBFloat16) {
      mode = PrecisionMode::FP32IOBF16W;
    } else {
      mode = PrecisionMode::FP32IOFP32W;
    }
  } else if (input.dtype() == torch::kBFloat16) {
    if (weight.dtype() == torch::kFloat16) {
      mode = PrecisionMode::BF16IOFP16W;
    } else if (weight.dtype() == torch::kBFloat16) {
      mode = PrecisionMode::BF16IOBF16W;
    } else {
      mode = PrecisionMode::BF16IOFP32W;
    }
  } else {
    if (weight.dtype() == torch::kFloat16) {
      mode = PrecisionMode::FP16IOFP16W;
    } else if (weight.dtype() == torch::kBFloat16) {
      mode = PrecisionMode::FP16IOBF16W;
    } else {
      mode = PrecisionMode::FP16IOFP32W;
    }
  }
  params_bwd.precision = mode;

  // The number of barriers.
  size_t barriers_elts = 0;
  // The number of elements in the reduction buffer.
  size_t red_buffer_elts = 0;
  // The number of elements in the reduction buffer that must be zeroed.
  size_t zeroed_red_buffer_elts = 0;

  // Finalize the parameters.
  dim3 grid;
  if (passes == 1) {
    group_norm_nhwc_bwd_one_pass_setup(params_bwd, barriers_elts, red_buffer_elts, zeroed_red_buffer_elts, grid, props);
  } else {
    group_norm_nhwc_bwd_two_passes_setup(params_bwd, zeroed_red_buffer_elts);
  }

  // Allocate on the device.
  auto red_buffer = at::empty({red_buffer_elts}, options.dtype(at::kFloat));
  params_bwd.red_buffer = red_buffer.data_ptr<float>();

  // Allocate the buffer if needed.
  auto barriers = at::zeros({barriers_elts}, options.dtype(at::kInt));
  params_bwd.barriers = barriers.data_ptr<int>();
  auto zeroed_red_buffer = at::zeros({zeroed_red_buffer_elts}, options.dtype(at::kFloat));
  params_bwd.zeroed_red_buffer = zeroed_red_buffer.data_ptr<float>();

  if (passes == 1) {
    group_norm_nhwc_bwd_one_pass_run(params_bwd, grid, stream);
  } else {
    group_norm_nhwc_bwd_two_passes_sum(params_bwd, stream);
    group_norm_nhwc_bwd_two_passes_scale(params_bwd, stream);
  }

  return {grad_input, grad_weight, grad_bias};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &group_norm_fwd, "NHWC group norm forward", py::call_guard<py::gil_scoped_release>());
  m.def("backward", &group_norm_bwd, "NHWC group norm backward", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/group_norm/macros.h
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#define GN_ONE_PASS_RUN_FUNCTION_NAME(Traits, ACTS_PER_BLOCK, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME) \
  void group_norm_nhwc_##PASS_NAME##_one_pass_##CHANNELS_PER_GROUP##_##ACTS_PER_BLOCK##_##Traits##_run(         \
      const Group_norm_nhwc_##PASS_NAME##_params& params, const dim3& grid, cudaStream_t stream)

#define GN_ONE_PASS_RUN_FUNCTION(Traits, ACTS_PER_BLOCK, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME)            \
  GN_ONE_PASS_RUN_FUNCTION_NAME(Traits, ACTS_PER_BLOCK, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME) {           \
    auto kernel =                                                                                                     \
        group_norm_nhwc_##PASS_NAME##_one_pass_kernel<Traits, ACTS_PER_BLOCK, CHANNELS_PER_GROUP, THREADS_PER_BLOCK>; \
                                                                                                                      \
    const Group_norm_nhwc_##PASS_NAME##_params* params_ = &params;                                                    \
    if (grid.x > 1) {                                                                                                 \
      CHECK_CUDA(cudaLaunchCooperativeKernel((const void*)kernel, grid, dim3(THREADS_PER_BLOCK), (void**)&params_, 0, \
                                             stream));                                                                \
                                                                                                                      \
    } else {                                                                                                          \
      CHECK_CUDA(cudaLaunchKernel((const void*)kernel, grid, dim3(THREADS_PER_BLOCK), (void**)&params_, 0, stream));  \
    }                                                                                                                 \
                                                                                                                      \
    CHECK_CUDA(cudaGetLastError());                                                                                   \
  }

//////////////////////////////////////////////////////////////////////////////////////////////////

#define GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME(Traits, ACTS_PER_BLOCK, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
                                                PASS_NAME)                                                     \
  int group_norm_nhwc_##PASS_NAME##_one_pass_##CHANNELS_PER_GROUP##_##ACTS_PER_BLOCK##_##Traits##_blocks_per_sm()

#define GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION(Traits, ACTS_PER_BLOCK, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME)  \
  GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME(Traits, ACTS_PER_BLOCK, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME) { \
    auto kernel =                                                                                                     \
        group_norm_nhwc_##PASS_NAME##_one_pass_kernel<Traits, ACTS_PER_BLOCK, CHANNELS_PER_GROUP, THREADS_PER_BLOCK>; \
                                                                                                                      \
    int blocks_per_sm = 0;                                                                                            \
    CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel, THREADS_PER_BLOCK, 0));          \
                                                                                                                      \
    CHECK_CUDA(cudaGetLastError());                                                                                   \
    return blocks_per_sm;                                                                                             \
  }

//////////////////////////////////////////////////////////////////////////////////////////////////

#define GN_ONE_PASS_(FUNCTION, Traits, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME) \
  FUNCTION(Traits, 512, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME);               \
  FUNCTION(Traits, 256, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME);               \
  FUNCTION(Traits, 128, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME);               \
  FUNCTION(Traits, 64, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME);

#define GN_ONE_PASS_RUN_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME)                     \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Fp32IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Fp32IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Fp32IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Fp16IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Fp16IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Fp16IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Bf16IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Bf16IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION, Bf16IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME);

#define GN_ONE_PASS_RUN_DECLARATION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME)                         \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Fp32IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Fp32IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Fp32IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Fp16IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Fp16IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Fp16IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Bf16IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Bf16IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_RUN_FUNCTION_NAME, Bf16IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME);

#define GN_ONE_PASS_BLOCKS_PER_SM_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME)                     \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Fp32IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Fp32IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Fp32IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Fp16IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Fp16IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Fp16IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Bf16IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Bf16IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME); \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION, Bf16IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME);

#define GN_ONE_PASS_BLOCKS_PER_SM_DECLARATION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME)             \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Fp32IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
               PASS_NAME);                                                                                  \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Fp32IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
               PASS_NAME);                                                                                  \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Fp32IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
               PASS_NAME);                                                                                  \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Fp16IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
               PASS_NAME);                                                                                  \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Fp16IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
               PASS_NAME);                                                                                  \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Fp16IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
               PASS_NAME);                                                                                  \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Bf16IOFp16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
               PASS_NAME);                                                                                  \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Bf16IOBf16W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, \
               PASS_NAME);                                                                                  \
  GN_ONE_PASS_(GN_ONE_PASS_BLOCKS_PER_SM_FUNCTION_NAME, Bf16IOFp32W, CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME);

#define GN_ONE_PASS_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME) \
  GN_ONE_PASS_RUN_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME)   \
  GN_ONE_PASS_BLOCKS_PER_SM_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, PASS_NAME)

#define GN_FWD_ONE_PASS_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK) \
  GN_ONE_PASS_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, fwd)

#define GN_BWD_ONE_PASS_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK) \
  GN_ONE_PASS_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK, bwd)

#define GN_FWD_BWD_ONE_PASS_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK) \
  GN_FWD_ONE_PASS_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK)           \
  GN_BWD_ONE_PASS_DEFINITION(CHANNELS_PER_GROUP, THREADS_PER_BLOCK)

////////////////////////////////////////////////////////////////////////////////////////////////////

#define GN_SELECTION_STATEMENT(function, Traits, PRECISION, FUNC_POSTFIX, HW_THRESHOLD, ACTS_PER_BLOCK,          \
                               CHANNELS_PER_GROUP, PASS_NAME)                                                    \
  if (params.hw >= HW_THRESHOLD && params.channels_per_group == CHANNELS_PER_GROUP &&                            \
      params.precision == PrecisionMode::PRECISION) {                                                            \
    function =                                                                                                   \
        group_norm_nhwc_##PASS_NAME##_one_pass_##CHANNELS_PER_GROUP##_##ACTS_PER_BLOCK##_##Traits##FUNC_POSTFIX; \
  } else

#define GN_SELECTION_STATEMENT_WITH_CPG_LIMIT(function, Traits, PRECISION, FUNC_POSTFIX, HW_THRESHOLD, ACTS_PER_BLOCK, \
                                              CHANNELS_PER_GROUP, PASS_NAME, LIMIT_CPG)                                \
  if (params.hw >= HW_THRESHOLD && params.channels_per_group == CHANNELS_PER_GROUP &&                                  \
      params.precision == PrecisionMode::PRECISION && CHANNELS_PER_GROUP >= LIMIT_CPG) {                               \
    function =                                                                                                         \
        group_norm_nhwc_##PASS_NAME##_one_pass_##CHANNELS_PER_GROUP##_##ACTS_PER_BLOCK##_##Traits##FUNC_POSTFIX;       \
  } else

#define GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Traits, PRECISION, CHANNELS_PER_GROUP,        \
                                                                    FUNC_POSTFIX, function, PASS_NAME)            \
  GN_SELECTION_STATEMENT_WITH_CPG_LIMIT(function, Traits, PRECISION, FUNC_POSTFIX, 1024, 128, CHANNELS_PER_GROUP, \
                                        PASS_NAME, 80)                                                            \
  GN_SELECTION_STATEMENT_WITH_CPG_LIMIT(function, Traits, PRECISION, FUNC_POSTFIX, 256, 128, CHANNELS_PER_GROUP,  \
                                        PASS_NAME, 160)                                                           \
  GN_SELECTION_STATEMENT(function, Traits, PRECISION, FUNC_POSTFIX, 512, 512, CHANNELS_PER_GROUP, PASS_NAME)      \
  GN_SELECTION_STATEMENT(function, Traits, PRECISION, FUNC_POSTFIX, 256, 256, CHANNELS_PER_GROUP, PASS_NAME)      \
  GN_SELECTION_STATEMENT(function, Traits, PRECISION, FUNC_POSTFIX, 128, 128, CHANNELS_PER_GROUP, PASS_NAME)      \
  GN_SELECTION_STATEMENT(function, Traits, PRECISION, FUNC_POSTFIX, 0, 64, CHANNELS_PER_GROUP, PASS_NAME)

#define GN_FWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(CHANNELS_PER_GROUP, FUNC_POSTFIX, function) \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp32IOFp16W, FP32IOFP16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp32IOBf16W, FP32IOBF16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp32IOFp32W, FP32IOFP32W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp16IOFp16W, FP16IOFP16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp16IOBf16W, FP16IOBF16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp16IOFp32W, FP16IOFP32W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Bf16IOFp16W, BF16IOFP16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Bf16IOBf16W, BF16IOBF16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Bf16IOFp32W, BF16IOFP32W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, fwd)

#define GN_BWD_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(CHANNELS_PER_GROUP, FUNC_POSTFIX, function) \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp32IOFp16W, FP32IOFP16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp32IOBf16W, FP32IOBF16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp32IOFp32W, FP32IOFP32W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp16IOFp16W, FP16IOFP16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp16IOBf16W, FP16IOBF16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Fp16IOFp32W, FP16IOFP32W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Bf16IOFp16W, BF16IOFP16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Bf16IOBf16W, BF16IOBF16W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)                          \
  GN_SELECTION_STATEMENT_HW_THRESHOLD_ACTS_PER_BLOCK_DISPATCH(Bf16IOFp32W, BF16IOFP32W, CHANNELS_PER_GROUP,         \
                                                              FUNC_POSTFIX, function, bwd)

////////////////////////////////////////////////////////////////////////////////////////////////////

#define GN_ONE_PASS_DECLARATION(CHANNELS_PER_GROUP, PASS_NAME)                    \
  GN_ONE_PASS_RUN_DECLARATION(CHANNELS_PER_GROUP, /* dummy value */ 0, PASS_NAME) \
  GN_ONE_PASS_BLOCKS_PER_SM_DECLARATION(CHANNELS_PER_GROUP, /* dummy value */ 0, PASS_NAME)

#define GN_FWD_ONE_PASS_DECLARATION(CHANNELS_PER_GROUP) GN_ONE_PASS_DECLARATION(CHANNELS_PER_GROUP, fwd)

#define GN_BWD_ONE_PASS_DECLARATION(CHANNELS_PER_GROUP) GN_ONE_PASS_DECLARATION(CHANNELS_PER_GROUP, bwd)

////////////////////////////////////////////////////////////////////////////////////////////////////

#define CALL_TWO_PASS_KERNEL(Kernel, Precision)               \
  if (params.channels_per_block == 320) {                     \
    Kernel<Precision, 160><<<grid, 160, 0, stream>>>(params); \
  } else if (params.channels_per_block == 280) {              \
    Kernel<Precision, 140><<<grid, 140, 0, stream>>>(params); \
  } else if (params.channels_per_block == 208) {              \
    Kernel<Precision, 140><<<grid, 104, 0, stream>>>(params); \
  } else if (params.channels_per_block == 240) {              \
    Kernel<Precision, 120><<<grid, 120, 0, stream>>>(params); \
  } else if (params.channels_per_block == 512) {              \
    Kernel<Precision, 256><<<grid, 256, 0, stream>>>(params); \
  } else if (params.channels_per_block == 448) {              \
    Kernel<Precision, 448><<<grid, 224, 0, stream>>>(params); \
  } else if (params.channels_per_block == 384) {              \
    Kernel<Precision, 192><<<grid, 192, 0, stream>>>(params); \
  } else if (params.channels_per_block == 256) {              \
    Kernel<Precision, 128><<<grid, 128, 0, stream>>>(params); \
  } else if (params.channels_per_block == 128) {              \
    Kernel<Precision, 64><<<grid, 64, 0, stream>>>(params);   \
  } else if (params.channels_per_block == 336) {              \
    Kernel<Precision, 168><<<grid, 168, 0, stream>>>(params); \
  } else if (params.channels_per_block == 392) {              \
    Kernel<Precision, 196><<<grid, 196, 0, stream>>>(params); \
  } else {                                                    \
    assert(false);                                            \
  }

////////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/group_norm/traits.h
================================================
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#pragma once

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Fp32 {
  // Type is float32_t
  using Type = float;
  // Doubled type
  using Type2 = float2;

  // Unpack input to accumulators type
  static inline __device__ float2 unpack(const float2& f2) { return f2; }

  // Pack the accumulators into outputs.
  static inline __device__ float2 pack(const float2& f2) { return f2; }

  static inline __device__ float2 zero() { return {0.f, 0.f}; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Fp16 {
  // Type is __half
  using Type = __half;
  // Doubled type
  using Type2 = __half2;

  // Unpack input to accumulators type
  static inline __device__ float2 unpack(const __half2& h2) {
    // FIXME(nkorobov): __half22float2 makes compilation error in container
    return {__half2float(h2.x), __half2float(h2.y)};
  }

  // Pack the accumulators into outputs.
  static inline __device__ __half2 pack(const float2& f2) {
    // FIXME(nkorobov): __float22half2_rn makes compilation error in container
    return {__float2half_rn(f2.x), __float2half_rn(f2.y)};
  }

  static inline __device__ __half2 zero() {
    uint32_t zero = 0;
    return *reinterpret_cast<__half2*>(&zero);
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Bf16 {
  // Type is __nv_bfloat16
  using Type = __nv_bfloat16;
  // Doubled type
  using Type2 = __nv_bfloat162;

  // Unpack input to accumulators type
  static inline __device__ float2 unpack(const __nv_bfloat162& h2) {
    // FIXME(nkorobov): __half22float2 makes compilation error in container
    return {__bfloat162float(h2.x), __bfloat162float(h2.y)};
  }

  // Pack the accumulators into outputs.
  static inline __device__ __nv_bfloat162 pack(const float2& f2) {
    // FIXME(nkorobov): __float22bfloat162_rn makes compilation error in container
    return {__float2bfloat16_rn(f2.x), __float2bfloat16_rn(f2.y)};
  }

  static inline __device__ __nv_bfloat162 zero() {
    uint32_t zero = 0;
    return *reinterpret_cast<__nv_bfloat162*>(&zero);
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////
struct Fp32IOFp16W {
  // IO traits
  using IOTraits = Fp32;
  // Weigths traits
  using WTraits = Fp16;
};

struct Fp32IOBf16W {
  // IO traits
  using IOTraits = Fp32;
  // Weigths traits
  using WTraits = Bf16;
};

struct Fp32IOFp32W {
  // IO traits
  using IOTraits = Fp32;
  // Weigths traits
  using WTraits = Fp32;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Fp16IOFp16W {
  // IO traits
  using IOTraits = Fp16;
  // Weigths traits
  using WTraits = Fp16;
};

struct Fp16IOBf16W {
  // IO traits
  using IOTraits = Fp16;
  // Weigths traits
  using WTraits = Bf16;
};

struct Fp16IOFp32W {
  // IO traits
  using IOTraits = Fp16;
  // Weigths traits
  using WTraits = Fp32;
};

////////////////////////////////////////////////////////////////////////////////////////////////////
struct Bf16IOFp16W {
  // IO traits
  using IOTraits = Bf16;
  // Weigths traits
  using WTraits = Fp16;
};

struct Bf16IOBf16W {
  // IO traits
  using IOTraits = Bf16;
  // Weigths traits
  using WTraits = Bf16;
};

struct Bf16IOFp32W {
  // IO traits
  using IOTraits = Bf16;
  // Weigths traits
  using WTraits = Fp32;
};

////////////////////////////////////////////////////////////////////////////////////////////////////


================================================
FILE: apex/contrib/csrc/group_norm_v2/generate_gn_cuda_inst.py
================================================
import pathlib


hw_c_list = [
    (8 * 8, 1280),
    (8 * 8, 2560),
    (16 * 16, 640),
    (16 * 16, 1280),
    (16 * 16, 1920),
    (16 * 16, 2560),
    (32 * 32, 320),
    (32 * 32, 640),
    (32 * 32, 960),
    (32 * 32, 1280),
    (32 * 32, 1920),
    (64 * 64, 320),
    (64 * 64, 640),
    (64 * 64, 960),
]


def run():
    src_path = pathlib.Path(__file__).parent.absolute()

    for f in src_path.glob("gn_cuda_inst_*.cu"):
        f.unlink()

    for hw, c in hw_c_list:
        print(f"GN_CUDA_INST_DEFINE({hw}, {c})")
        with open(src_path / f"gn_cuda_inst_{hw}_{c}.cu", "w") as f:
            f.write('#include "gn_cuda_host_template.cuh"\n')
            f.write("\n")
            f.write("\n")
            f.write("namespace group_norm_v2 {\n")
            f.write("\n")
            f.write(f"GN_CUDA_INST_DEFINE({hw}, {c})\n")
            f.write("\n")
            f.write("}  // namespace group_norm_v2\n")

    with open(src_path / "gn_dispatch_hw_c.hpp", "w") as f:
        f.write("#pragma once\n")
        f.write("\n")
        f.write("#define DISPATCH_HW_C(hw, c, HW, C, ...) [&] { \\\n")
        for hw, c in hw_c_list:
            f.write(
                f"    if (hw == {hw} && c == {c}) {{ constexpr int HW = {hw}, C = {c}; return __VA_ARGS__(); }} \\\n"
            )
        f.write(
            '    throw std::invalid_argument("DISPATCH_HW_C " + std::to_string(hw) + " " + std::to_string(c)); \\\n'
        )
        f.write("    }()\n")


if __name__ == "__main__":
    run()


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn.cpp
================================================
#include "gn.hpp"

#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>

namespace group_norm_v2 {

torch::Tensor gn(torch::Tensor x, torch::Tensor w, torch::Tensor b, float eps, bool silu, int num_groups,
                 std::optional<torch::Tensor> mean_var_out, int sm_margin) {
  if (w.dtype() != b.dtype() || (mean_var_out.has_value() && mean_var_out->dtype() != torch::kFloat32)) {
    throw std::invalid_argument("gn dtype mismatch");
  }
  torch::Tensor out = torch::empty_like(x);
  float* ptr_mean_var_out = mean_var_out.has_value() ? mean_var_out->data_ptr<float>() : nullptr;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  int device_id = at::cuda::getCurrentCUDAStream().device().index();
  group_norm_v2::Meta meta;
  if (x.dtype() == torch::kHalf && w.dtype() == torch::kHalf) {
    group_norm_v2::gn_cuda((half*)out.data_ptr(), (half*)x.data_ptr(), (half*)w.data_ptr(), (half*)b.data_ptr(), eps,
                           silu, x.size(0), x.size(2) * x.size(3), num_groups, x.size(1) / num_groups, ptr_mean_var_out,
                           nullptr, nullptr, sm_margin, stream, device_id, &meta, true);
  } else if (x.dtype() == torch::kBFloat16 && w.dtype() == torch::kBFloat16) {
    group_norm_v2::gn_cuda((__nv_bfloat16*)out.data_ptr(), (__nv_bfloat16*)x.data_ptr(), (__nv_bfloat16*)w.data_ptr(),
                           (__nv_bfloat16*)b.data_ptr(), eps, silu, x.size(0), x.size(2) * x.size(3), num_groups,
                           x.size(1) / num_groups, ptr_mean_var_out, nullptr, nullptr, sm_margin, stream, device_id,
                           &meta, true);
  } else {
    throw std::invalid_argument("gn only supports half or bfloat16 input and weight");
  }
  torch::Tensor red_buffer =
      torch::empty({meta.red_buffer_size}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA));
  thread_local torch::Tensor barrier;
  if (barrier.size(0) < meta.barrier_size) {
    barrier = torch::zeros({meta.barrier_size}, torch::TensorOptions().dtype(torch::kUInt32).device(torch::kCUDA));
  }
  if (x.dtype() == torch::kHalf && w.dtype() == torch::kHalf) {
    group_norm_v2::gn_cuda((half*)out.data_ptr(), (half*)x.data_ptr(), (half*)w.data_ptr(), (half*)b.data_ptr(), eps,
                           silu, x.size(0), x.size(2) * x.size(3), num_groups, x.size(1) / num_groups, ptr_mean_var_out,
                           red_buffer.data_ptr<float>(), barrier.data_ptr<unsigned>(), sm_margin, stream, device_id,
                           nullptr, false);
  } else if (x.dtype() == torch::kBFloat16 && w.dtype() == torch::kBFloat16) {
    group_norm_v2::gn_cuda((__nv_bfloat16*)out.data_ptr(), (__nv_bfloat16*)x.data_ptr(), (__nv_bfloat16*)w.data_ptr(),
                           (__nv_bfloat16*)b.data_ptr(), eps, silu, x.size(0), x.size(2) * x.size(3), num_groups,
                           x.size(1) / num_groups, ptr_mean_var_out, red_buffer.data_ptr<float>(),
                           barrier.data_ptr<unsigned>(), sm_margin, stream, device_id, nullptr, false);
  } else {
    throw std::invalid_argument("gn only supports half or bfloat16 input and weight");
  }
  return out;
}

auto gn_bwd(torch::Tensor grad_output, torch::Tensor x, torch::Tensor w, torch::Tensor b, torch::Tensor mean_var,
            float eps, bool silu, int num_groups, int sm_margin) {
  if (w.dtype() != b.dtype() || x.dtype() != grad_output.dtype() || mean_var.dtype() != torch::kFloat32) {
    throw std::invalid_argument("gn_bwd dtype mismatch");
  }
  torch::Tensor grad_input = torch::empty_like(x);
  torch::Tensor grad_weight = torch::empty_like(w);
  torch::Tensor grad_bias = torch::empty_like(w);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  int device_id = at::cuda::getCurrentCUDAStream().device().index();
  group_norm_v2::Meta meta;
  if (x.dtype() == torch::kHalf && w.dtype() == torch::kHalf) {
    group_norm_v2::gn_bwd_cuda((half*)grad_input.data_ptr(), (half*)grad_weight.data_ptr(), (half*)grad_bias.data_ptr(),
                               (half*)grad_output.data_ptr(), (half*)x.data_ptr(), (half*)w.data_ptr(),
                               (half*)b.data_ptr(), mean_var.data_ptr<float>(), eps, silu, x.size(0),
                               x.size(2) * x.size(3), num_groups, x.size(1) / num_groups, nullptr, nullptr, sm_margin,
                               stream, device_id, &meta, true);
  } else if (x.dtype() == torch::kBFloat16 && w.dtype() == torch::kBFloat16) {
    group_norm_v2::gn_bwd_cuda((__nv_bfloat16*)grad_input.data_ptr(), (__nv_bfloat16*)grad_weight.data_ptr(),
                               (__nv_bfloat16*)grad_bias.data_ptr(), (__nv_bfloat16*)grad_output.data_ptr(),
                               (__nv_bfloat16*)x.data_ptr(), (__nv_bfloat16*)w.data_ptr(), (__nv_bfloat16*)b.data_ptr(),
                               mean_var.data_ptr<float>(), eps, silu, x.size(0), x.size(2) * x.size(3), num_groups,
                               x.size(1) / num_groups, nullptr, nullptr, sm_margin, stream, device_id, &meta, true);
  } else {
    throw std::invalid_argument("gn only supports half or bfloat16 input and weight");
  }
  torch::Tensor red_buffer =
      torch::empty({meta.red_buffer_size}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA));
  thread_local torch::Tensor barrier;
  if (barrier.size(0) < meta.barrier_size) {
    barrier = torch::zeros({meta.barrier_size}, torch::TensorOptions().dtype(torch::kUInt32).device(torch::kCUDA));
  }
  if (x.dtype() == torch::kHalf && w.dtype() == torch::kHalf) {
    group_norm_v2::gn_bwd_cuda((half*)grad_input.data_ptr(), (half*)grad_weight.data_ptr(), (half*)grad_bias.data_ptr(),
                               (half*)grad_output.data_ptr(), (half*)x.data_ptr(), (half*)w.data_ptr(),
                               (half*)b.data_ptr(), mean_var.data_ptr<float>(), eps, silu, x.size(0),
                               x.size(2) * x.size(3), num_groups, x.size(1) / num_groups, red_buffer.data_ptr<float>(),
                               barrier.data_ptr<unsigned>(), sm_margin, stream, device_id, nullptr, false);
  } else if (x.dtype() == torch::kBFloat16 && w.dtype() == torch::kBFloat16) {
    group_norm_v2::gn_bwd_cuda((__nv_bfloat16*)grad_input.data_ptr(), (__nv_bfloat16*)grad_weight.data_ptr(),
                               (__nv_bfloat16*)grad_bias.data_ptr(), (__nv_bfloat16*)grad_output.data_ptr(),
                               (__nv_bfloat16*)x.data_ptr(), (__nv_bfloat16*)w.data_ptr(), (__nv_bfloat16*)b.data_ptr(),
                               mean_var.data_ptr<float>(), eps, silu, x.size(0), x.size(2) * x.size(3), num_groups,
                               x.size(1) / num_groups, red_buffer.data_ptr<float>(), barrier.data_ptr<unsigned>(),
                               sm_margin, stream, device_id, nullptr, false);
  } else {
    throw std::invalid_argument("gn only supports half or bfloat16 input and weight");
  }
  return std::make_tuple(grad_input, grad_weight, grad_bias);
}

}  // namespace group_norm_v2

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("gn", &group_norm_v2::gn, py::arg("x"), py::arg("w"), py::arg("b"), py::arg("eps"), py::arg("silu"),
        py::arg("num_groups"), py::arg("mean_var_out") = py::none(), py::arg("sm_margin") = 0, "");
  m.def("gn_bwd", &group_norm_v2::gn_bwd, py::arg("grad_output"), py::arg("x"), py::arg("w"), py::arg("b"),
        py::arg("mean_var"), py::arg("eps"), py::arg("silu"), py::arg("num_groups"), py::arg("sm_margin") = 0, "");
}


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn.hpp
================================================
#pragma once

#include <cuda_runtime.h>

#include <cstdint>

namespace group_norm_v2 {

struct Meta {
  int64_t red_buffer_size;
  int64_t barrier_size;
  int BLOCK_DIM_X;
  int C_PER_BLOCK;
  int ROWS_PER_BLOCK;
  int VEC_ELEMS;
  bool LOAD_TWICE;
  int BLOCKS_PER_SM;
  bool HARDWARE_CLUSTER;
  int wgrad_sync_method;
};

template <typename T>
void gn_cuda(T* out, T* x, T* w, T* b, float eps, bool silu, int64_t n, int64_t hw, int num_groups,
             int channels_per_group, float* mean_var_out, float* red_buffer, unsigned* barrier, int sm_margin,
             cudaStream_t stream, int device_id, Meta* meta_ptr, bool meta_only);

template <typename T>
void gn_bwd_cuda(T* grad_input, T* grad_weight, T* grad_bias, T* grad_output, T* x, T* w, T* b, float* mean_var,
                 float eps, bool silu, int64_t n, int64_t hw, int num_groups, int channels_per_group, float* red_buffer,
                 unsigned* barrier, int sm_margin, cudaStream_t stream, int device_id, Meta* meta_ptr, bool meta_only);

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda.cu
================================================
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <cstdio>
#include <mutex>
#include <stdexcept>

#include "gn.hpp"
#include "gn_dispatch_hw_c.hpp"
#include "gn_utils.hpp"

#define DISPATCH_NUM_GROUPS_AND_SILU(num_groups, silu, NUM_GROUPS, SILU, ...)                        \
  [&] {                                                                                              \
    if (num_groups == 16 && silu == true) {                                                          \
      constexpr int NUM_GROUPS = 16;                                                                 \
      constexpr bool SILU = true;                                                                    \
      return __VA_ARGS__();                                                                          \
    }                                                                                                \
    if (num_groups == 32 && silu == false) {                                                         \
      constexpr int NUM_GROUPS = 32;                                                                 \
      constexpr bool SILU = false;                                                                   \
      return __VA_ARGS__();                                                                          \
    }                                                                                                \
    throw std::invalid_argument("DISPATCH_NUM_GROUPS_AND_SILU " + std::to_string(num_groups) + " " + \
                                std::to_string(silu));                                               \
  }()

namespace group_norm_v2 {

template <typename T, int HW, int C, int G, bool SILU>
void gn_cuda_single_shape(GN_CUDA_HOST_PARAMS(T));

template <typename T, int HW, int C, int G, bool SILU>
void gn_bwd_cuda_single_shape(GN_BWD_CUDA_HOST_PARAMS(T));

template <typename T>
void gn_cuda(GN_CUDA_HOST_PARAMS(T)) {
  DISPATCH_HW_C(hw, num_groups * channels_per_group, HW, C, [&] {
    DISPATCH_NUM_GROUPS_AND_SILU(num_groups, silu, G, SILU,
                                 [&] { return gn_cuda_single_shape<T, HW, C, G, SILU>(GN_CUDA_HOST_ARGS); });
  });
}

template <typename T>
void gn_bwd_cuda(GN_BWD_CUDA_HOST_PARAMS(T)) {
  DISPATCH_HW_C(hw, num_groups * channels_per_group, HW, C, [&] {
    DISPATCH_NUM_GROUPS_AND_SILU(num_groups, silu, G, SILU,
                                 [&] { return gn_bwd_cuda_single_shape<T, HW, C, G, SILU>(GN_BWD_CUDA_HOST_ARGS); });
  });
}

template void gn_cuda(GN_CUDA_HOST_PARAMS(half));
template void gn_cuda(GN_CUDA_HOST_PARAMS(__nv_bfloat16));

template void gn_bwd_cuda(GN_BWD_CUDA_HOST_PARAMS(half));
template void gn_bwd_cuda(GN_BWD_CUDA_HOST_PARAMS(__nv_bfloat16));

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_host_template.cuh
================================================
#pragma once

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <cstdio>
#include <stdexcept>

#include "gn_cuda_kernel.cuh"
#include "gn_utils.hpp"

namespace group_norm_v2 {

#define DISPATCH_LOWER_BOUND_N(VALUE, CONST_NAME, ...)                              \
  [&] {                                                                             \
    if (VALUE >= 16) {                                                              \
      constexpr int CONST_NAME = 16;                                                \
      return __VA_ARGS__();                                                         \
    }                                                                               \
    if (VALUE >= 8) {                                                               \
      constexpr int CONST_NAME = 8;                                                 \
      return __VA_ARGS__();                                                         \
    }                                                                               \
    if (VALUE >= 4) {                                                               \
      constexpr int CONST_NAME = 4;                                                 \
      return __VA_ARGS__();                                                         \
    }                                                                               \
    if (VALUE >= 2) {                                                               \
      constexpr int CONST_NAME = 2;                                                 \
      return __VA_ARGS__();                                                         \
    }                                                                               \
    if (VALUE >= 1) {                                                               \
      constexpr int CONST_NAME = 1;                                                 \
      return __VA_ARGS__();                                                         \
    }                                                                               \
    throw std::invalid_argument("DISPATCH_LOWER_BOUND_N " + std::to_string(VALUE)); \
  }()

#define DISPATCH_CUDA_ARCH_AND_LOWER_BOUND_SM_COUNT(runtime_cuda_arch, sm_count, RUNTIME_CUDA_ARCH, LB_SM_COUNT, ...) \
  [&] {                                                                                                               \
    if (runtime_cuda_arch == 1000 && sm_count >= 148) {                                                               \
      constexpr int RUNTIME_CUDA_ARCH = 1000, LB_SM_COUNT = 148;                                                      \
      return __VA_ARGS__();                                                                                           \
    }                                                                                                                 \
    throw std::invalid_argument("DISPATCH_CUDA_ARCH_AND_LOWER_BOUND_SM_COUNT " + std::to_string(runtime_cuda_arch) +  \
                                " " + std::to_string(sm_count));                                                      \
  }()

#define DISPATCH_SM_MARGIN(VALUE, CONST_NAME, ...)                              \
  [&] {                                                                         \
    if (VALUE == 0) {                                                           \
      constexpr int CONST_NAME = 0;                                             \
      return __VA_ARGS__();                                                     \
    }                                                                           \
    if (VALUE == 32) {                                                          \
      constexpr int CONST_NAME = 32;                                            \
      return __VA_ARGS__();                                                     \
    }                                                                           \
    throw std::invalid_argument("DISPATCH_SM_MARGIN " + std::to_string(VALUE)); \
  }()

inline constexpr int get_max_cuda_arch() {
  int cuda_arch_list[] = {__CUDA_ARCH_LIST__};
  int max_cuda_arch = -1;
  for (int cuda_arch_item : cuda_arch_list) {
    if (cuda_arch_item > max_cuda_arch) {
      max_cuda_arch = cuda_arch_item;
    }
  }
  return max_cuda_arch;
}

template <typename T, bool BWD, bool REQUIRES_WGRAD, int HW, int G, int CPG, int LB_N, int RUNTIME_CUDA_ARCH,
          int LB_SM_COUNT, int EFFECTIVE_CUDA_ARCH, int SM_MARGIN>
constexpr auto compute_gn_params() {
  constexpr int C = G * CPG;

  // Initialize each variable to comply with C++17
  int BLOCK_DIM_X = 0;
  int C_PER_BLOCK = 0;
  int ROWS_PER_BLOCK = 0;
  bool LOAD_TWICE = false;
  int BLOCKS_PER_SM = 0;
  WgradSyncMethod wgrad_sync_method = WGRAD_SYNC_UNSPECIFIED;

  // There are two tiling strategies:
  //   - block sync: each block handles a whole group, i.e., a multiple of (G * HW) elements
  //   - virtual cluster sync: each virtual cluster handles a group
  // Block sync can avoid cross-block synchronization latency, but it may cause low occupancy.
  //   Use block sync if the IO size is small, when latency rather than occupancy dominates the kernel running time.

  // Elements to load for forward pass is `x`, elements to load for backward pass are `x` and `grad_output`, hence there
  // is a factor of (1 + BWD)
  if (HW * CPG * (1 + BWD) * sizeof(T) <= 20480) {
    // Strategy 1: block sync
    C_PER_BLOCK = CPG;
    ROWS_PER_BLOCK = HW;
    BLOCK_DIM_X = lcm(32, C_PER_BLOCK);
    while (BLOCK_DIM_X < 256) {
      BLOCK_DIM_X *= 2;
    }
    BLOCKS_PER_SM = 1;
    // The size of registers is 65536 registers * 4 bytes per register.
    //   We have to leave some room for other variables and compiler optimizations,
    //   so we use 36000 as the threshold.
    LOAD_TWICE = BLOCKS_PER_SM * ROWS_PER_BLOCK * C_PER_BLOCK * (1 + BWD) * sizeof(T) > 36000 * 4;
  } else {
    // Strategy 2: virtual cluster sync
    //   A virtual cluster is a group of blocks that are synchronized with each other.
    //   Each group, i.e., a multiple of (G * HW) elements, should be handled on the same virtual cluster.
    //   If the virtual cluster size is supported by the hardware, HARDWARE_CLUSTER is preferred;
    //   otherwise, cooperative groups are used (i.e., PERSISTENT kernels).
    int c_per_cluster = lcm(128 / (int)sizeof(T), CPG);

    C_PER_BLOCK = c_per_cluster;
    BLOCK_DIM_X = C_PER_BLOCK == 320 ? 320 : 480;

    // Maximum number of rows that should reside in registers
    int register_max_rows = 36000 * 4 / (C_PER_BLOCK * (1 + BWD) * sizeof(T));

    std::tuple<bool, int, int, int, int, int> best_candidate{};
    BLOCKS_PER_SM = 0;
    ROWS_PER_BLOCK = 0;
    for (int blocks_per_sm = 1; blocks_per_sm <= 3; blocks_per_sm++) {
      for (int rows_per_block = HW; rows_per_block >= 1; rows_per_block /= 2) {
        int virtual_cluster_size = (HW / rows_per_block) * (c_per_cluster / C_PER_BLOCK);
        if (virtual_cluster_size > blocks_per_sm * (LB_SM_COUNT - SM_MARGIN)) {
          continue;
        }
        int num_clusters = blocks_per_sm * (LB_SM_COUNT - SM_MARGIN) / virtual_cluster_size;
        int num_tasks = LB_N * (C / c_per_cluster);
        int num_waves = up_div(num_tasks, num_clusters);
        bool load_twice = rows_per_block > register_max_rows / blocks_per_sm;

        // Wave utilization: the percent of SMs that are used for each wave
        //   For example, SM_COUNT=100 and VIRTUAL_CLUSTER_SIZE=64,
        //     if BLOCKS_PER_SM=1, num_clusters=1, wave_util=64%;
        //     if BLOCKS_PER_SM=2, num_clusters=3, wave_util=96%.
        //   This helps select a good number of BLOCKS_PER_SM
        int wave_util = 10000 * std::min(num_tasks, num_clusters) * virtual_cluster_size /
                        (blocks_per_sm * (LB_SM_COUNT - SM_MARGIN));

        decltype(best_candidate) candidate = {
            true,
            !load_twice,  // Prefer no load twice
            !(num_waves >= 2 &&
              blocks_per_sm ==
                  1),    // When there are multiple waves, prefer multiple blocks per SM to ensure overlapping
            -num_waves,  // Prefer fewer waves
            std::min(9000, wave_util),  // Prefer high wave utilization
            -blocks_per_sm,             // Prefer fewer blocks per SM in order to reduce threads overhead
        };
        if (candidate > best_candidate) {
          // Assign each element respectively to comply with C++17
          std::get<0>(best_candidate) = std::get<0>(candidate);
          std::get<1>(best_candidate) = std::get<1>(candidate);
          std::get<2>(best_candidate) = std::get<2>(candidate);
          std::get<3>(best_candidate) = std::get<3>(candidate);
          std::get<4>(best_candidate) = std::get<4>(candidate);
          std::get<5>(best_candidate) = std::get<5>(candidate);
          static_assert(std::tuple_size<decltype(best_candidate)>::value == 6, "missing assignments");

          BLOCKS_PER_SM = blocks_per_sm;
          ROWS_PER_BLOCK = rows_per_block;
        }
      }
    }

    LOAD_TWICE = ROWS_PER_BLOCK > register_max_rows / BLOCKS_PER_SM;
  }

  int c_per_cluster = lcm(CPG, C_PER_BLOCK);
  int virtual_cluster_size = (c_per_cluster / C_PER_BLOCK) * (HW / ROWS_PER_BLOCK);

  // The occupancy is affected if cluster size is large.
  //   For example, on H100, when gridDim=128 and each block occupies the whole SM,
  //     if cluster is not used, all blocks can be active simultaneously.
  //     if cluster size is 16, not all blocks can be active simultaneously (which can be queried by
  //     cudaOccupancyMaxActiveClusters),
  //       so there will be two waves which impacts efficiency.
  // When SM_MARGIN is set, no cluster should be used because other kernels may occupy a part of the cluster.
  bool HARDWARE_CLUSTER = virtual_cluster_size <= 2 && virtual_cluster_size != 1 && SM_MARGIN == 0;

  int MAX_VEC_BYTES =
      8;  // Sometimes 4 or 16 is better, but there is no trivial way to select the best vectorization size.
  int VEC_ELEMS = std::min(gcd(MAX_VEC_BYTES / (int)sizeof(T), C_PER_BLOCK),
                           gcd(MAX_VEC_BYTES / (int)sizeof(T), ROWS_PER_BLOCK * C_PER_BLOCK / BLOCK_DIM_X));

  return std::make_tuple(BLOCK_DIM_X, C_PER_BLOCK, ROWS_PER_BLOCK, VEC_ELEMS, LOAD_TWICE, BLOCKS_PER_SM,
                         HARDWARE_CLUSTER, wgrad_sync_method);
}

// Save compilation time for unused CUDA_ARCHs
//   For each template argument from DISPATCH_CUDA_ARCH_AND_LOWER_BOUND_SM_COUNT, the kernel is only compiled for the
//   corresponding CUDA_ARCH
template <int EFFECTIVE_CUDA_ARCH>
class CompileCondition {
 public:
  __host__ __device__ static constexpr bool matches() {
#if defined(__CUDA_ARCH__)
    return __CUDA_ARCH__ == EFFECTIVE_CUDA_ARCH;
#else
    return false;
#endif
  }
};

template <typename T, int HW, int C, int G, bool SILU>
void gn_cuda_single_shape(GN_CUDA_HOST_PARAMS(T)) {
  if (out == x) {
    throw std::invalid_argument("not __restrict__");
  }

  cudaDeviceProp const& deviceProp = get_device_prop(device_id);
  int runtime_cuda_arch = deviceProp.major * 100 + deviceProp.minor * 10;
  int sm_count = deviceProp.multiProcessorCount;

  DISPATCH_LOWER_BOUND_N(n, LB_N, [&] {
    DISPATCH_CUDA_ARCH_AND_LOWER_BOUND_SM_COUNT(runtime_cuda_arch, sm_count, RUNTIME_CUDA_ARCH, LB_SM_COUNT, [&] {
      DISPATCH_SM_MARGIN(sm_margin, SM_MARGIN, [&] {
        if (hw != HW) {
          throw std::invalid_argument("wrong HW");
        }
        if (num_groups * channels_per_group != C) {
          throw std::invalid_argument("wrong C");
        }
        if (num_groups != G) {
          throw std::invalid_argument("wrong G");
        }
        if (silu != SILU) {
          throw std::invalid_argument("wrong SILU");
        }
        if (n < LB_N) {
          throw std::invalid_argument("wrong LB_N");
        }
        if (runtime_cuda_arch != RUNTIME_CUDA_ARCH) {
          throw std::invalid_argument("wrong RUNTIME_CUDA_ARCH");
        }
        if (sm_count < LB_SM_COUNT) {
          throw std::invalid_argument("wrong LB_SM_COUNT");
        }
        if (sm_margin != SM_MARGIN) {
          throw std::invalid_argument("wrong SM_MARGIN");
        }
        constexpr int EFFECTIVE_CUDA_ARCH =
            std::min(RUNTIME_CUDA_ARCH, get_max_cuda_arch());  // Assume the max CUDA_ARCH is used to generate PTX

        constexpr int CPG = C / G;

        constexpr auto params = compute_gn_params<T, false, false, HW, G, CPG, LB_N, RUNTIME_CUDA_ARCH, LB_SM_COUNT,
                                                  EFFECTIVE_CUDA_ARCH, SM_MARGIN>();
        constexpr int BLOCK_DIM_X = std::get<0>(params);
        constexpr int C_PER_BLOCK = std::get<1>(params);
        constexpr int ROWS_PER_BLOCK = std::get<2>(params);
        constexpr int VEC_ELEMS = std::get<3>(params);
        constexpr bool LOAD_TWICE = std::get<4>(params);
        constexpr int BLOCKS_PER_SM = std::get<5>(params);
        constexpr bool HARDWARE_CLUSTER = std::get<6>(params);

        constexpr int C_PER_CLUSTER = lcm(CPG, C_PER_BLOCK);
        constexpr int VIRTUAL_CLUSTER_SIZE = (C_PER_CLUSTER / C_PER_BLOCK) * (HW / ROWS_PER_BLOCK);
        constexpr int NUM_VIRTUAL_CLUSTERS = ((LB_SM_COUNT - SM_MARGIN) * BLOCKS_PER_SM) / VIRTUAL_CLUSTER_SIZE;
        constexpr bool PERSISTENT =
            !HARDWARE_CLUSTER &&
            VIRTUAL_CLUSTER_SIZE >=
                2;  // Only virtual cluster sync (not include hardware cluster sync) requires PERSISTENT kernels

        if (meta_ptr) {
          constexpr int MAX_NUM_GROUPS_PER_BLOCK =
              C_PER_BLOCK % CPG == 0 ? C_PER_BLOCK / CPG : up_div(C_PER_BLOCK - gcd(C_PER_BLOCK, CPG), CPG) + 1;
          meta_ptr->red_buffer_size = 2 * NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK * 2;
          meta_ptr->barrier_size = NUM_VIRTUAL_CLUSTERS;
          meta_ptr->BLOCK_DIM_X = BLOCK_DIM_X;
          meta_ptr->C_PER_BLOCK = C_PER_BLOCK;
          meta_ptr->ROWS_PER_BLOCK = ROWS_PER_BLOCK;
          meta_ptr->VEC_ELEMS = VEC_ELEMS;
          meta_ptr->LOAD_TWICE = LOAD_TWICE;
          meta_ptr->BLOCKS_PER_SM = BLOCKS_PER_SM;
          meta_ptr->HARDWARE_CLUSTER = HARDWARE_CLUSTER;
          meta_ptr->wgrad_sync_method = (int)WGRAD_SYNC_UNSPECIFIED;
        }
        if (meta_only) {
          return;
        }

        cudaLaunchConfig_t config = {0};
        config.gridDim = dim3(
            VIRTUAL_CLUSTER_SIZE,
            PERSISTENT ? std::min((int)n * (C / C_PER_CLUSTER), NUM_VIRTUAL_CLUSTERS) : n * (C / C_PER_CLUSTER), 1);
        config.blockDim = BLOCK_DIM_X;
        config.stream = stream;

        cudaLaunchAttribute attribute[2];
        if constexpr (HARDWARE_CLUSTER) {
          attribute[0].id = cudaLaunchAttributeClusterDimension;
          attribute[0].val.clusterDim.x = VIRTUAL_CLUSTER_SIZE;  // Cluster size in X-dimension
          attribute[0].val.clusterDim.y = 1;
          attribute[0].val.clusterDim.z = 1;
          config.attrs = attribute;
          config.numAttrs++;
        }
        if constexpr (PERSISTENT) {
          attribute[config.numAttrs].id = cudaLaunchAttributeCooperative;
          attribute[config.numAttrs].val.cooperative = 1;
          config.attrs = attribute;
          config.numAttrs++;
        }

        auto kernel = &gn_cuda_kernel<T, BLOCK_DIM_X, BLOCKS_PER_SM, G, CPG, HW, SILU, ROWS_PER_BLOCK, C_PER_BLOCK,
                                      C_PER_CLUSTER, VEC_ELEMS, PERSISTENT, NUM_VIRTUAL_CLUSTERS, LOAD_TWICE,
                                      HARDWARE_CLUSTER, CompileCondition<EFFECTIVE_CUDA_ARCH> >;
        if constexpr (HARDWARE_CLUSTER) {
          if constexpr (VIRTUAL_CLUSTER_SIZE > 8) {
            CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeNonPortableClusterSizeAllowed, 1));
          }
          int max_cluster_size;
          int active_clusters;
          CUDA_CHECK(cudaOccupancyMaxPotentialClusterSize(&max_cluster_size, (void*)kernel, &config));
          if (VIRTUAL_CLUSTER_SIZE <= max_cluster_size && PERSISTENT) {
            attribute[0].val.clusterDim.x = VIRTUAL_CLUSTER_SIZE;
            CUDA_CHECK(cudaOccupancyMaxActiveClusters(&active_clusters, (void*)kernel, &config));
          }
          if (VIRTUAL_CLUSTER_SIZE <= max_cluster_size &&
              (!PERSISTENT || PERSISTENT && NUM_VIRTUAL_CLUSTERS <= active_clusters)) {
            attribute[0].val.clusterDim.x = VIRTUAL_CLUSTER_SIZE;
          } else {
            // Fallback to cooperative groups because hardware cluster cannot be active simultaneously
            constexpr bool HARDWARE_CLUSTER_NEW = false;
            constexpr bool PERSISTENT_NEW = !HARDWARE_CLUSTER_NEW && VIRTUAL_CLUSTER_SIZE >= 2;
            config.gridDim = dim3(
                VIRTUAL_CLUSTER_SIZE,
                PERSISTENT_NEW ? std::min((int)n * (C / C_PER_CLUSTER), NUM_VIRTUAL_CLUSTERS) : n * (C / C_PER_CLUSTER),
                1);
            config.attrs = nullptr;
            config.numAttrs = 0;
            if constexpr (PERSISTENT_NEW) {
              attribute[config.numAttrs].id = cudaLaunchAttributeCooperative;
              attribute[config.numAttrs].val.cooperative = 1;
              config.attrs = attribute;
              config.numAttrs++;
            }
            kernel = &gn_cuda_kernel<T, BLOCK_DIM_X, BLOCKS_PER_SM, G, CPG, HW, SILU, ROWS_PER_BLOCK, C_PER_BLOCK,
                                     C_PER_CLUSTER, VEC_ELEMS, PERSISTENT_NEW, NUM_VIRTUAL_CLUSTERS, LOAD_TWICE,
                                     HARDWARE_CLUSTER_NEW, CompileCondition<EFFECTIVE_CUDA_ARCH> >;
          }
        }
        CUDA_CHECK(cudaLaunchKernelEx(&config, kernel, out, x, w, b, eps, n, mean_var_out, red_buffer, barrier));
      });
    });
  });
}

template <typename T, int HW, int C, int G, bool SILU>
void gn_bwd_cuda_single_shape(GN_BWD_CUDA_HOST_PARAMS(T)) {
  if (grad_input == grad_output || grad_input == x) {
    throw std::invalid_argument("not __restrict__");
  }

  cudaDeviceProp const& deviceProp = get_device_prop(device_id);
  int runtime_cuda_arch = deviceProp.major * 100 + deviceProp.minor * 10;
  int sm_count = deviceProp.multiProcessorCount;

  DISPATCH_LOWER_BOUND_N(n, LB_N, [&] {
    DISPATCH_CUDA_ARCH_AND_LOWER_BOUND_SM_COUNT(runtime_cuda_arch, sm_count, RUNTIME_CUDA_ARCH, LB_SM_COUNT, [&] {
      DISPATCH_SM_MARGIN(sm_margin, SM_MARGIN, [&] {
        if (hw != HW) {
          throw std::invalid_argument("wrong HW");
        }
        if (num_groups * channels_per_group != C) {
          throw std::invalid_argument("wrong C");
        }
        if (num_groups != G) {
          throw std::invalid_argument("wrong G");
        }
        if (silu != SILU) {
          throw std::invalid_argument("wrong SILU");
        }
        if (n < LB_N) {
          throw std::invalid_argument("wrong LB_N");
        }
        if (runtime_cuda_arch != RUNTIME_CUDA_ARCH) {
          throw std::invalid_argument("wrong RUNTIME_CUDA_ARCH");
        }
        if (sm_count < LB_SM_COUNT) {
          throw std::invalid_argument("wrong LB_SM_COUNT");
        }
        if (sm_margin != SM_MARGIN) {
          throw std::invalid_argument("wrong SM_MARGIN");
        }
        constexpr int EFFECTIVE_CUDA_ARCH =
            std::min(RUNTIME_CUDA_ARCH, get_max_cuda_arch());  // Assume the max CUDA_ARCH is used to generate PTX

        constexpr bool REQUIRES_WGRAD = true;
        constexpr int CPG = C / G;

        constexpr auto params = compute_gn_params<T, true, REQUIRES_WGRAD, HW, G, CPG, LB_N, RUNTIME_CUDA_ARCH,
                                                  LB_SM_COUNT, EFFECTIVE_CUDA_ARCH, SM_MARGIN>();
        constexpr int BLOCK_DIM_X = std::get<0>(params);
        constexpr int C_PER_BLOCK = std::get<1>(params);
        constexpr int ROWS_PER_BLOCK = std::get<2>(params);
        constexpr int VEC_ELEMS = std::get<3>(params);
        constexpr bool LOAD_TWICE = std::get<4>(params);
        constexpr int BLOCKS_PER_SM = std::get<5>(params);
        constexpr bool HARDWARE_CLUSTER = std::get<6>(params);
        constexpr WgradSyncMethod wgrad_sync_method_hint = std::get<7>(params);

        constexpr int C_PER_CLUSTER = lcm(CPG, C_PER_BLOCK);
        constexpr int VIRTUAL_CLUSTER_SIZE = (C_PER_CLUSTER / C_PER_BLOCK) * (HW / ROWS_PER_BLOCK);
        constexpr int NUM_VIRTUAL_CLUSTERS_NOT_ALIGNED =
            ((LB_SM_COUNT - SM_MARGIN) * BLOCKS_PER_SM) / VIRTUAL_CLUSTER_SIZE;

        // PERSISTENT is required because wgrad reduction requires synchronization.
        //   TODO: specilize for the case that REQUIRES_WGRAD == false
        constexpr bool PERSISTENT = true;

        // Determine whether to align each virtual cluster to a fixed range of channels
        //   If aligned, WGRAD_REUSE_SUM_SYNC_GROUP can be used, then less local wgrad memory is used (leave more room
        //   for compiler
        //     optimizations), and wgrad reduction is more efficient.
        //   However, aligning can cause low occupancy.
        //   There is a trade-off, and the condition to align is `NUM_VIRTUAL_CLUSTERS_NOT_ALIGNED > 2 * (C /
        //   C_PER_CLUSTER)`
        constexpr WgradSyncMethod wgrad_sync_method =
            wgrad_sync_method_hint == WGRAD_SYNC_UNSPECIFIED
                ? NUM_VIRTUAL_CLUSTERS_NOT_ALIGNED > 2 * (C / C_PER_CLUSTER) ||
                          NUM_VIRTUAL_CLUSTERS_NOT_ALIGNED % (C / C_PER_CLUSTER) == 0
                      ? (HARDWARE_CLUSTER ? WGRAD_ARRIVE_AND_WAIT_GROUP : WGRAD_REUSE_SUM_SYNC_GROUP)
                      : WGRAD_REUSE_SUM_SYNC_GRID
                : wgrad_sync_method_hint;
        constexpr int NUM_VIRTUAL_CLUSTERS =
            wgrad_sync_method == WGRAD_ARRIVE_AND_WAIT_GROUP || wgrad_sync_method == WGRAD_REUSE_SUM_SYNC_GROUP
                ? NUM_VIRTUAL_CLUSTERS_NOT_ALIGNED / (C / C_PER_CLUSTER) * (C / C_PER_CLUSTER)
                : NUM_VIRTUAL_CLUSTERS_NOT_ALIGNED;

        if (meta_ptr) {
          constexpr int MAX_NUM_GROUPS_PER_BLOCK =
              C_PER_BLOCK % CPG == 0 ? C_PER_BLOCK / CPG : up_div(C_PER_BLOCK - gcd(C_PER_BLOCK, CPG), CPG) + 1;
          meta_ptr->red_buffer_size =
              2 * NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK * 2 +
              std::max(n, (int64_t)NUM_VIRTUAL_CLUSTERS / (C / C_PER_CLUSTER)) * (HW / ROWS_PER_BLOCK) * C * 2;
          meta_ptr->barrier_size = NUM_VIRTUAL_CLUSTERS + C / C_PER_CLUSTER;
          meta_ptr->BLOCK_DIM_X = BLOCK_DIM_X;
          meta_ptr->C_PER_BLOCK = C_PER_BLOCK;
          meta_ptr->ROWS_PER_BLOCK = ROWS_PER_BLOCK;
          meta_ptr->VEC_ELEMS = VEC_ELEMS;
          meta_ptr->LOAD_TWICE = LOAD_TWICE;
          meta_ptr->BLOCKS_PER_SM = BLOCKS_PER_SM;
          meta_ptr->HARDWARE_CLUSTER = HARDWARE_CLUSTER;
          meta_ptr->wgrad_sync_method = (int)wgrad_sync_method;
        }
        if (meta_only) {
          return;
        }

        cudaLaunchConfig_t config = {0};
        config.gridDim = dim3(VIRTUAL_CLUSTER_SIZE, PERSISTENT ? NUM_VIRTUAL_CLUSTERS : n * (C / C_PER_CLUSTER), 1);
        config.blockDim = BLOCK_DIM_X;
        config.stream = stream;

        cudaLaunchAttribute attribute[2];
        if constexpr (HARDWARE_CLUSTER) {
          attribute[0].id = cudaLaunchAttributeClusterDimension;
          attribute[0].val.clusterDim.x = 1;  // Cluster size in X-dimension
          attribute[0].val.clusterDim.y = 1;
          attribute[0].val.clusterDim.z = 1;
          config.attrs = attribute;
          config.numAttrs++;
        }
        if constexpr (PERSISTENT) {
          attribute[config.numAttrs].id = cudaLaunchAttributeCooperative;
          attribute[config.numAttrs].val.cooperative = 1;
          config.attrs = attribute;
          config.numAttrs++;
        }

        auto kernel =
            &gn_bwd_cuda_kernel<T, BLOCK_DIM_X, BLOCKS_PER_SM, G, CPG, HW, SILU, REQUIRES_WGRAD, ROWS_PER_BLOCK,
                                C_PER_BLOCK, C_PER_CLUSTER, VEC_ELEMS, PERSISTENT, NUM_VIRTUAL_CLUSTERS, LOAD_TWICE,
                                HARDWARE_CLUSTER, wgrad_sync_method, CompileCondition<EFFECTIVE_CUDA_ARCH> >;
        if constexpr (HARDWARE_CLUSTER) {
          if constexpr (VIRTUAL_CLUSTER_SIZE > 8) {
            CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeNonPortableClusterSizeAllowed, 1));
          }
          int max_cluster_size;
          int active_clusters;
          CUDA_CHECK(cudaOccupancyMaxPotentialClusterSize(&max_cluster_size, (void*)kernel, &config));
          if (VIRTUAL_CLUSTER_SIZE <= max_cluster_size && PERSISTENT) {
            attribute[0].val.clusterDim.x = VIRTUAL_CLUSTER_SIZE;
            CUDA_CHECK(cudaOccupancyMaxActiveClusters(&active_clusters, (void*)kernel, &config));
          }
          if (VIRTUAL_CLUSTER_SIZE <= max_cluster_size &&
              (!PERSISTENT || PERSISTENT && NUM_VIRTUAL_CLUSTERS <= active_clusters)) {
            attribute[0].val.clusterDim.x = VIRTUAL_CLUSTER_SIZE;
          } else {
            // Fallback to cooperative groups for dgrad computation because hardware cluster cannot be active
            // simultaneously
            attribute[0].val.clusterDim.x = 1;
            kernel =
                &gn_bwd_cuda_kernel<T, BLOCK_DIM_X, BLOCKS_PER_SM, G, CPG, HW, SILU, REQUIRES_WGRAD, ROWS_PER_BLOCK,
                                    C_PER_BLOCK, C_PER_CLUSTER, VEC_ELEMS, PERSISTENT, NUM_VIRTUAL_CLUSTERS, LOAD_TWICE,
                                    false, wgrad_sync_method, CompileCondition<EFFECTIVE_CUDA_ARCH> >;
          }
        }
        CUDA_CHECK(cudaLaunchKernelEx(&config, kernel, grad_input, grad_weight, grad_bias, grad_output, x, w, b,
                                      mean_var, eps, n, red_buffer, barrier));
      });
    });
  });
}

#define GN_CUDA_INST_DEFINE(HW, C)                                                                                \
  template void gn_cuda_single_shape<half, HW, C, 16, true>(GN_CUDA_HOST_PARAMS(half));                           \
  template void gn_cuda_single_shape<half, HW, C, 32, false>(GN_CUDA_HOST_PARAMS(half));                          \
  template void gn_bwd_cuda_single_shape<half, HW, C, 16, true>(GN_BWD_CUDA_HOST_PARAMS(half));                   \
  template void gn_bwd_cuda_single_shape<half, HW, C, 32, false>(GN_BWD_CUDA_HOST_PARAMS(half));                  \
  template void gn_cuda_single_shape<__nv_bfloat16, HW, C, 16, true>(GN_CUDA_HOST_PARAMS(__nv_bfloat16));         \
  template void gn_cuda_single_shape<__nv_bfloat16, HW, C, 32, false>(GN_CUDA_HOST_PARAMS(__nv_bfloat16));        \
  template void gn_bwd_cuda_single_shape<__nv_bfloat16, HW, C, 16, true>(GN_BWD_CUDA_HOST_PARAMS(__nv_bfloat16)); \
  template void gn_bwd_cuda_single_shape<__nv_bfloat16, HW, C, 32, false>(GN_BWD_CUDA_HOST_PARAMS(__nv_bfloat16));

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_1024_1280.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(1024, 1280)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_1024_1920.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(1024, 1920)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_1024_320.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(1024, 320)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_1024_640.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(1024, 640)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_1024_960.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(1024, 960)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_256_1280.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(256, 1280)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_256_1920.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(256, 1920)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_256_2560.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(256, 2560)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_256_640.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(256, 640)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_4096_320.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(4096, 320)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_4096_640.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(4096, 640)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_4096_960.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(4096, 960)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_64_1280.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(64, 1280)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_inst_64_2560.cu
================================================
#include "gn_cuda_host_template.cuh"

namespace group_norm_v2 {

GN_CUDA_INST_DEFINE(64, 2560)

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_cuda_kernel.cuh
================================================
#pragma once

#include <cooperative_groups.h>

#include "gn_utils.hpp"

namespace group_norm_v2 {

namespace cg = cooperative_groups;

template <typename T>
inline constexpr T up_div(T a, T b) {
  return (a + b - 1) / b;
}

template <typename T>
inline constexpr T round_up(T a, T b) {
  return up_div(a, b) * b;
}

inline constexpr unsigned round_up_pow2(unsigned x) {
  int log = 0;
  x--;
  while (x) {
    x /= 2;
    log++;
  }
  return 1U << log;
}

inline constexpr unsigned round_down_pow2(unsigned x) { return round_up_pow2(x + 1) / 2; }

template <typename T>
inline constexpr T gcd(T a, T b) {
  while (b != 0) {
    int t = b;
    b = a % b;
    a = t;
  }
  return a;
}

template <typename T>
inline constexpr T lcm(T a, T b) {
  return (a * b) / gcd(a, b);
}

template <typename T>
inline constexpr T relative_prime(T x, T min) {
  int p = min;
  while (gcd(p, x) != 1) {
    p++;
  }
  return p;
}

template <typename T>
inline constexpr T max_divisor(T x, T max) {
  int p = max;
  while (x % p != 0) {
    p--;
  }
  return p;
}

constexpr unsigned FINAL_MASK = 0xffffffff;

template <int VIRTUAL_CLUSTER_SIZE, bool PERSISTENT, bool HARDWARE_CLUSTER>
__device__ void virtual_cluster_sync(unsigned int* barrier) {
  if constexpr (VIRTUAL_CLUSTER_SIZE == 1) {
    __syncthreads();
  } else if constexpr (HARDWARE_CLUSTER) {
    cg::this_cluster().sync();
  } else {
    static_assert(PERSISTENT, "potential deadlock");
    volatile unsigned int* arrived = &barrier[blockIdx.y];
    __syncthreads();
    if (threadIdx.x == 0) {
      unsigned int expected = VIRTUAL_CLUSTER_SIZE;
      bool gpu_master = blockIdx.x == 0;
      unsigned int nb = 1;
      if (gpu_master) {
        nb = 0x80000000 - (expected - 1);
      }
      unsigned int oldArrive;
      asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;"
                   : "=r"(oldArrive)
                   : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived), "r"(nb)
                   : "memory");
      unsigned int current_arrive;
      do {
        asm volatile("ld.acquire.gpu.u32 %0,[%1];"
                     : "=r"(current_arrive)
                     : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived)
                     : "memory");
      } while (!cooperative_groups::details::bar_has_flipped(oldArrive, current_arrive));
    }
    __syncthreads();
  }
}

template <int NUM_BLOCKS, bool PERSISTENT>
__device__ unsigned int group_barrier_arrive(unsigned int* barrier, bool gpu_master) {
  static_assert(PERSISTENT, "potential deadlock");
  volatile unsigned int* arrived = &barrier[0];
  __syncthreads();
  if (threadIdx.x == 0) {
    unsigned int expected = NUM_BLOCKS;
    unsigned int nb = 1;
    if (gpu_master) {
      nb = 0x80000000 - (expected - 1);
    }
    unsigned int oldArrive;
    asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;"
                 : "=r"(oldArrive)
                 : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived), "r"(nb)
                 : "memory");
    return oldArrive;
  } else {
    return 0;
  }
}

__device__ inline void group_barrier_wait(unsigned int* barrier, unsigned int oldArrive) {
  volatile unsigned int* arrived = &barrier[0];
  if (threadIdx.x == 0) {
    unsigned int current_arrive;
    do {
      asm volatile("ld.acquire.gpu.u32 %0,[%1];"
                   : "=r"(current_arrive)
                   : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived)
                   : "memory");
    } while (!cooperative_groups::details::bar_has_flipped(oldArrive, current_arrive));
  }
  __syncthreads();
}

// Calculate `n` (batch id) and `c` (channel range id) for each loop
template <bool CONSTANT_C_LOOP, int C, int C_PER_CLUSTER, int NUM_VIRTUAL_CLUSTERS, bool PERSISTENT>
class NCScheduler;

template <int C, int C_PER_CLUSTER, int NUM_VIRTUAL_CLUSTERS, bool PERSISTENT>
class NCScheduler<false, C, C_PER_CLUSTER, NUM_VIRTUAL_CLUSTERS, PERSISTENT> {
 public:
  __device__ NCScheduler(int64_t n) {
    nc_loop_ = blockIdx.y;
    at_end_ = nc_loop_ >= n * (C / C_PER_CLUSTER);
  }
  __device__ auto get_nc() {
    int64_t n_loop = nc_loop_ / (C / C_PER_CLUSTER);
    int c_loop = nc_loop_ % (C / C_PER_CLUSTER);
    return std::make_tuple(n_loop, c_loop);
  }
  __device__ void next(int64_t n) {
    if constexpr (PERSISTENT) {
      nc_loop_ += NUM_VIRTUAL_CLUSTERS;
      at_end_ = nc_loop_ >= n * (C / C_PER_CLUSTER);
    }
  }
  __device__ bool at_end(int64_t n) { return !PERSISTENT || at_end_; }

 private:
  int64_t nc_loop_;
  bool at_end_;
};

template <int C, int C_PER_CLUSTER, int NUM_VIRTUAL_CLUSTERS, bool PERSISTENT>
class NCScheduler<true, C, C_PER_CLUSTER, NUM_VIRTUAL_CLUSTERS, PERSISTENT> {
 public:
  __device__ NCScheduler(int64_t n) {
    n_loop_ = blockIdx.y / (C / C_PER_CLUSTER);
    c_loop_ = blockIdx.y % (C / C_PER_CLUSTER);
  }
  __device__ auto get_nc() { return std::make_tuple(n_loop_, c_loop_); }
  __device__ void next(int64_t n) {
    if constexpr (PERSISTENT) {
      n_loop_ += NUM_VIRTUAL_CLUSTERS / (C / C_PER_CLUSTER);
    }
  }
  __device__ bool at_end(int64_t n) { return !PERSISTENT || n_loop_ >= n; }

 private:
  int64_t n_loop_;
  int c_loop_;
};

class CompileConditionAlwaysTrue {
 public:
  __device__ static constexpr bool matches() { return true; }
};

template <typename T, int BLOCK_DIM_X, int BLOCKS_PER_SM, int G, int CPG, int HW, bool SILU, int ROWS_PER_BLOCK,
          int C_PER_BLOCK, int C_PER_CLUSTER, int VEC_ELEMS, bool PERSISTENT, int NUM_VIRTUAL_CLUSTERS, bool LOAD_TWICE,
          bool HARDWARE_CLUSTER, class CompileCondition = CompileConditionAlwaysTrue>
__global__ __launch_bounds__(BLOCK_DIM_X, BLOCKS_PER_SM) void gn_cuda_kernel(
    T* __restrict__ out, T const* __restrict__ x, T const* __restrict__ w, T const* __restrict__ b, float eps,
    int64_t n, float* __restrict__ mean_var_out, float* __restrict__ red_buffer, unsigned* __restrict__ barrier) {
  // Procedure Overview
  //   1. Thread sum: read from gmem, write partial sum to smem, store input in registers (if no LOAD_TWICE)
  //   2. Block sum: read from smem, write partial sum to gmem (or distributed shared memory if HARDWARE_CLUSTER is
  //   used)
  //   3. Group sum: read from gmem, write mean&var to smem
  //   4. Scale: read mean&var from smem, read input from gmem (if LOAD_TWICE), write output to gmem

  static_assert(BLOCK_DIM_X % 32 == 0, "warp shuffle error");

  constexpr int C = G * CPG;
  static_assert(C % C_PER_CLUSTER == 0, "cannot divide channels into clusters");
  static_assert(C_PER_CLUSTER % C_PER_BLOCK == 0, "cannot divide a cluster into blocks");
  static_assert(C_PER_CLUSTER % CPG == 0, "no reduce between clusters, would produce incorrect results");
  static_assert(!(C_PER_BLOCK % CPG == 0 && C_PER_CLUSTER != C_PER_BLOCK),
                "inefficient configuration, please reduce C_PER_CLUSTER");

  static_assert(ROWS_PER_BLOCK * C_PER_BLOCK % BLOCK_DIM_X == 0, "cannot divide tile into threads");
  struct alignas(VEC_ELEMS * sizeof(T)) U {
    T data[VEC_ELEMS];
  };

  auto compute_mean_var = [&](float2 sum) {
    float mean = sum.x / (HW * CPG);
    float var = std::max(0.f, sum.y / (HW * CPG) - mean * mean);
    return float2{mean, var};
  };

  static_assert(HW % ROWS_PER_BLOCK == 0,
                "HW must be divisible by ROWS_PER_BLOCK to determine the number of blocks on the HW axis");
  constexpr int MAX_NUM_GROUPS_PER_BLOCK =
      C_PER_BLOCK % CPG == 0 ? C_PER_BLOCK / CPG : up_div(C_PER_BLOCK - gcd(C_PER_BLOCK, CPG), CPG) + 1;
  constexpr int VIRTUAL_CLUSTER_SIZE = (C_PER_CLUSTER / C_PER_BLOCK) * (HW / ROWS_PER_BLOCK);
  constexpr int virtual_cluster_dim_x = C_PER_CLUSTER / C_PER_BLOCK;
  constexpr int virtual_cluster_dim_y = HW / ROWS_PER_BLOCK;
  int virtual_block_idx_x = (blockIdx.x % VIRTUAL_CLUSTER_SIZE) % virtual_cluster_dim_x;
  int virtual_block_idx_y = (blockIdx.x % VIRTUAL_CLUSTER_SIZE) / virtual_cluster_dim_x;

  if constexpr (CompileCondition::matches()) {
    int step = 0;
    constexpr bool CONSTANT_C_LOOP = PERSISTENT && NUM_VIRTUAL_CLUSTERS % (C / C_PER_CLUSTER) == 0;
    NCScheduler<CONSTANT_C_LOOP, C, C_PER_CLUSTER, NUM_VIRTUAL_CLUSTERS, PERSISTENT> nc_scheduler(n);
    while (true) {  // TODO: unroll the loop
      if constexpr (PERSISTENT) {
        if (nc_scheduler.at_end(n)) {
          break;
        }
      }
      auto [n_loop, c_loop] = nc_scheduler.get_nc();
      if constexpr (PERSISTENT) {
        nc_scheduler.next(n);
      }
      static_assert(C_PER_BLOCK % VEC_ELEMS == 0, "cannot vectorize");
      static_assert((BLOCK_DIM_X * VEC_ELEMS) % C_PER_BLOCK == 0,
                    "each block should load one or more C_PER_BLOCK at once");
      constexpr int ROWS_PER_IO = BLOCK_DIM_X * VEC_ELEMS / C_PER_BLOCK;
      static_assert(ROWS_PER_BLOCK % ROWS_PER_IO == 0, "cannot determine the IO times per batch");
      int block_channel_start = virtual_block_idx_x * C_PER_BLOCK + c_loop * C_PER_CLUSTER;
      int block_group_start = block_channel_start / CPG;
      int thread_channel_start = block_channel_start + threadIdx.x % (C_PER_BLOCK / VEC_ELEMS) * VEC_ELEMS;
      U frag[ROWS_PER_BLOCK / ROWS_PER_IO];

      // GCD_VEC_CPG is an important constant that determines how many channels can be merged in reduction computation
      //   For example, VEC_ELEMS=4 and CPG=10, then GCD_VEC_CPG=2,
      //   so we need to store only 2 sums on each thread, and compute only 2 mean&var for each thread.
      constexpr int GCD_VEC_CPG = gcd(VEC_ELEMS, CPG);

      // If each block handles only one group, run warpReduce and store the sum to `sum_per_channel_single_group`;
      // otherwise store (VEC_ELEMS / GCD_VEC_CPG) sums to `sum_per_channel_multi_group`, where `relative_prime` is used
      // for swizzle.
      constexpr bool SINGLE_GROUP_PER_BLOCK = CPG % C_PER_BLOCK == 0;
      [[maybe_unused]] __shared__ float2 sum_per_channel_single_group[BLOCK_DIM_X / 32];
      [[maybe_unused]] __shared__ float2 sum_per_channel_multi_group[C_PER_BLOCK / GCD_VEC_CPG][relative_prime(
          128 / (int)sizeof(float2), ROWS_PER_IO)];

      if constexpr (LOAD_TWICE) {
        float2 frag_sum_per_channel[VEC_ELEMS / GCD_VEC_CPG]{};
        for (int j = 0; j < ROWS_PER_BLOCK / ROWS_PER_IO; j++) {
          int64_t input_idx =
              n_loop * HW * C +
              (virtual_block_idx_y * ROWS_PER_BLOCK + j * ROWS_PER_IO + threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)) * C +
              thread_channel_start;
          U val = *reinterpret_cast<U const*>(&x[input_idx]);
          for (int i = 0; i < VEC_ELEMS / GCD_VEC_CPG; i++) {
            float2 sum = frag_sum_per_channel[i];
            for (int k = 0; k < GCD_VEC_CPG; k++) {
              sum.x += (float)val.data[i * GCD_VEC_CPG + k];
              sum.y += (float)val.data[i * GCD_VEC_CPG + k] * (float)val.data[i * GCD_VEC_CPG + k];
            }
            frag_sum_per_channel[i] = sum;
          }
        }
        for (int i = 0; i < VEC_ELEMS / GCD_VEC_CPG; i++) {
          if constexpr (SINGLE_GROUP_PER_BLOCK) {
            for (int mask = 16; mask > 0; mask >>= 1) {
              frag_sum_per_channel[i].x += __shfl_xor_sync(FINAL_MASK, frag_sum_per_channel[i].x, mask, 32);
              frag_sum_per_channel[i].y += __shfl_xor_sync(FINAL_MASK, frag_sum_per_channel[i].y, mask, 32);
            }
            static_assert(VEC_ELEMS / GCD_VEC_CPG == 1, "process only one element for each warp");
            if (threadIdx.x % 32 == 0) {
              sum_per_channel_single_group[threadIdx.x / 32] = frag_sum_per_channel[i];
            }
          } else {
            sum_per_channel_multi_group[i * (C_PER_BLOCK / VEC_ELEMS) + threadIdx.x % (C_PER_BLOCK / VEC_ELEMS)]
                                       [threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)] = frag_sum_per_channel[i];
          }
        }
        __syncthreads();
      } else {
        for (int j = 0; j < ROWS_PER_BLOCK / ROWS_PER_IO; j++) {
          int64_t input_idx =
              n_loop * HW * C +
              (virtual_block_idx_y * ROWS_PER_BLOCK + j * ROWS_PER_IO + threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)) * C +
              thread_channel_start;
          frag[j] = *reinterpret_cast<U const*>(&x[input_idx]);
        }

        for (int i = 0; i < VEC_ELEMS / GCD_VEC_CPG; i++) {
          float2 sum = {0.f, 0.f};
          for (int j = 0; j < ROWS_PER_BLOCK / ROWS_PER_IO; j++) {
            for (int k = 0; k < GCD_VEC_CPG; k++) {
              sum.x += (float)frag[j].data[i * GCD_VEC_CPG + k];
              sum.y += (float)frag[j].data[i * GCD_VEC_CPG + k] * (float)frag[j].data[i * GCD_VEC_CPG + k];
            }
          }
          if constexpr (SINGLE_GROUP_PER_BLOCK) {
            for (int mask = 16; mask > 0; mask >>= 1) {
              sum.x += __shfl_xor_sync(FINAL_MASK, sum.x, mask, 32);
              sum.y += __shfl_xor_sync(FINAL_MASK, sum.y, mask, 32);
            }
            static_assert(VEC_ELEMS / GCD_VEC_CPG == 1, "process only one element for each warp");
            if (threadIdx.x % 32 == 0) {
              sum_per_channel_single_group[threadIdx.x / 32] = sum;
            }
          } else {
            sum_per_channel_multi_group[i * (C_PER_BLOCK / VEC_ELEMS) + threadIdx.x % (C_PER_BLOCK / VEC_ELEMS)]
                                       [threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)] = sum;
          }
        }
        __syncthreads();
      }

      U uw = *reinterpret_cast<U const*>(&w[thread_channel_start]);
      U ub = *reinterpret_cast<U const*>(&b[thread_channel_start]);

      // Three cases for the red_buffer:
      //   - Block sync (VIRTUAL_CLUSTER_SIZE=1): use shared memory
      //   - Virtual cluster sync with HARDWARE_CLUSTER: use distributed shared memory
      //   - Virtual cluster sync without HARDWARE_CLUSTER: use global memory, i.e., `red_buffer`
      constexpr bool USE_SHARED_RED_BUFFER = HARDWARE_CLUSTER || VIRTUAL_CLUSTER_SIZE == 1;

      // Specialize for the case that each group is handled by only one block
      //   For common cases, blockSum produces partial sum and stores it to the red_buffer, and groupSum produces
      //   mean&var For the special case, blockSum produces mean&var directly
      constexpr bool STORE_MEAN_VAR_IN_SHARED_RED_BUFFER =
          VIRTUAL_CLUSTER_SIZE == 1 &&
          MAX_NUM_GROUPS_PER_BLOCK == 1;  // MAX_NUM_GROUPS_PER_BLOCK > 1 is possible but not implemented

      [[maybe_unused]] __align__(16)
          __shared__ float2 shared_red_buffer[MAX_NUM_GROUPS_PER_BLOCK * (STORE_MEAN_VAR_IN_SHARED_RED_BUFFER ? 1 : 2)];

      // Block sum
      if constexpr (SINGLE_GROUP_PER_BLOCK) {
        // block reduce
        if (threadIdx.x < 32) {
          float2 sum_local_group =
              threadIdx.x < BLOCK_DIM_X / 32 ? sum_per_channel_single_group[threadIdx.x] : float2{0.f, 0.f};
          constexpr int warp_num_pow2 = round_up_pow2(BLOCK_DIM_X / 32);
          for (int mask = warp_num_pow2 / 2; mask > 0; mask >>= 1) {
            sum_local_group.x += __shfl_xor_sync(FINAL_MASK, sum_local_group.x, mask, 32);
            sum_local_group.y += __shfl_xor_sync(FINAL_MASK, sum_local_group.y, mask, 32);
          }
          if (threadIdx.x == 0) {
            if constexpr (USE_SHARED_RED_BUFFER) {
              if constexpr (STORE_MEAN_VAR_IN_SHARED_RED_BUFFER) {
                shared_red_buffer[0] = compute_mean_var(sum_local_group);
              } else {
                shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + 0] = sum_local_group;
              }
            } else {
              *reinterpret_cast<float2*>(
                  &red_buffer[((step * gridDim.y + blockIdx.y) * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK +
                               virtual_block_idx_x * virtual_cluster_dim_y * MAX_NUM_GROUPS_PER_BLOCK +
                               // (threadIdx.x / THREADS_PER_GROUP) * virtual_cluster_dim_y +
                               virtual_block_idx_y) *
                              2]) = sum_local_group;
            }
          }
        }
      } else {
        // The number of threads to calculate the sum of each group (should be a power of 2 for warp reduce)
        constexpr int THREADS_PER_GROUP = std::min(std::min(32U, round_up_pow2(ROWS_PER_IO)),
                                                   round_up_pow2(BLOCK_DIM_X / MAX_NUM_GROUPS_PER_BLOCK / 2 + 1));
        static_assert(BLOCK_DIM_X >= MAX_NUM_GROUPS_PER_BLOCK * THREADS_PER_GROUP, "not enough threads");
        float2 sum_local_group = {0.f, 0.f};
        if (threadIdx.x / THREADS_PER_GROUP < MAX_NUM_GROUPS_PER_BLOCK) {
          int local_group_idx = block_group_start + threadIdx.x / THREADS_PER_GROUP;
          // TODO: map threads to both the CPG loop and the ROWS loop
          for (int local_c_loop = 0; local_c_loop < CPG; local_c_loop += GCD_VEC_CPG) {
            int c = local_group_idx * CPG + local_c_loop;
            if (C_PER_BLOCK % CPG == 0 || (c >= block_channel_start && c < block_channel_start + C_PER_BLOCK)) {
              for (int src_thread_tile_y = threadIdx.x % THREADS_PER_GROUP; src_thread_tile_y < ROWS_PER_IO;
                   src_thread_tile_y += THREADS_PER_GROUP) {
                int channel_idx = (c - block_channel_start) / GCD_VEC_CPG;
                channel_idx = channel_idx % (VEC_ELEMS / GCD_VEC_CPG) * (C_PER_BLOCK / VEC_ELEMS) +
                              channel_idx / (VEC_ELEMS / GCD_VEC_CPG);
                sum_local_group.x += sum_per_channel_multi_group[channel_idx][src_thread_tile_y].x;
                sum_local_group.y += sum_per_channel_multi_group[channel_idx][src_thread_tile_y].y;
              }
            }
          }
        }
        static_assert(32 % THREADS_PER_GROUP == 0, "cannot shuffle");
        for (int mask = THREADS_PER_GROUP / 2; mask > 0; mask >>= 1) {
          sum_local_group.x += __shfl_xor_sync(FINAL_MASK, sum_local_group.x, mask, 32);
          sum_local_group.y += __shfl_xor_sync(FINAL_MASK, sum_local_group.y, mask, 32);
        }
        if (threadIdx.x % THREADS_PER_GROUP == 0 && threadIdx.x / THREADS_PER_GROUP < MAX_NUM_GROUPS_PER_BLOCK) {
          if constexpr (USE_SHARED_RED_BUFFER) {
            static_assert(HARDWARE_CLUSTER || VIRTUAL_CLUSTER_SIZE == 1, "no distributed shared memory");
            if constexpr (STORE_MEAN_VAR_IN_SHARED_RED_BUFFER) {
              shared_red_buffer[threadIdx.x / THREADS_PER_GROUP] = compute_mean_var(sum_local_group);
            } else {
              shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + threadIdx.x / THREADS_PER_GROUP] = sum_local_group;
            }
          } else {
            *reinterpret_cast<float2*>(
                &red_buffer[((step * gridDim.y + blockIdx.y) * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK +
                             virtual_block_idx_x * virtual_cluster_dim_y * MAX_NUM_GROUPS_PER_BLOCK +
                             (threadIdx.x / THREADS_PER_GROUP) * virtual_cluster_dim_y + virtual_block_idx_y) *
                            2]) = sum_local_group;
          }
        }
      }

      virtual_cluster_sync<VIRTUAL_CLUSTER_SIZE, PERSISTENT, HARDWARE_CLUSTER>(barrier);

      // Group sum
      __shared__ float2 mean_var[MAX_NUM_GROUPS_PER_BLOCK];
      if constexpr (!STORE_MEAN_VAR_IN_SHARED_RED_BUFFER) {
        // The number of threads to calculate the sum of each group (should be a power of 2 for warp reduce)
        constexpr int THREADS_PER_GROUP = std::min(std::min(32U, round_up_pow2(virtual_cluster_dim_y)),
                                                   round_up_pow2(BLOCK_DIM_X / MAX_NUM_GROUPS_PER_BLOCK / 2 + 1));
        static_assert(BLOCK_DIM_X >= MAX_NUM_GROUPS_PER_BLOCK * THREADS_PER_GROUP, "not enough threads");
        float2 sum_global_group = {0.f, 0.f};
        if (threadIdx.x / THREADS_PER_GROUP < MAX_NUM_GROUPS_PER_BLOCK) {
          if constexpr (C_PER_BLOCK % CPG == 0) {
            // Special case: no cross-virtual_cluster_dim_x reduction
            float2 buffer[up_div(virtual_cluster_dim_y, THREADS_PER_GROUP)];
            for (int i = threadIdx.x % THREADS_PER_GROUP; i < virtual_cluster_dim_y; i += THREADS_PER_GROUP) {
              float2 val;
              if constexpr (USE_SHARED_RED_BUFFER) {
                if constexpr (VIRTUAL_CLUSTER_SIZE == 1) {
                  val = shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + threadIdx.x / THREADS_PER_GROUP];
                } else {
                  static_assert(HARDWARE_CLUSTER, "no distributed shared memory");
                  float2 const* src_shared_red_buffer = cg::this_cluster().map_shared_rank(
                      shared_red_buffer, i * virtual_cluster_dim_x + virtual_block_idx_x);
                  val = src_shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + threadIdx.x / THREADS_PER_GROUP];
                }
              } else {
                val = *reinterpret_cast<float2 const*>(
                    &red_buffer[((step * gridDim.y + blockIdx.y) * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK +
                                 virtual_block_idx_x * virtual_cluster_dim_y * MAX_NUM_GROUPS_PER_BLOCK +
                                 (threadIdx.x / THREADS_PER_GROUP) * virtual_cluster_dim_y + i) *
                                2]);
              }
              buffer[i / THREADS_PER_GROUP] = val;
            }
            for (int i = threadIdx.x % THREADS_PER_GROUP; i < virtual_cluster_dim_y; i += THREADS_PER_GROUP) {
              float2 val = buffer[i / THREADS_PER_GROUP];
              sum_global_group.x += val.x;
              sum_global_group.y += val.y;
            }
          } else {
            // Common case: cross-virtual_cluster_dim_x reduction
            int local_group_idx = block_group_start + threadIdx.x / THREADS_PER_GROUP;
            for (int i = threadIdx.x % THREADS_PER_GROUP; i < VIRTUAL_CLUSTER_SIZE; i += THREADS_PER_GROUP) {
              int src_virtual_block_idx_x = i % virtual_cluster_dim_x;
              int src_block_channel_start = src_virtual_block_idx_x * C_PER_BLOCK + c_loop * C_PER_CLUSTER;
              int src_block_group_start = src_block_channel_start / CPG;
              int relative_group_idx = local_group_idx - src_block_group_start;
              if (0 <= relative_group_idx && relative_group_idx < MAX_NUM_GROUPS_PER_BLOCK) {
                float2 val;
                if constexpr (USE_SHARED_RED_BUFFER) {
                  static_assert(HARDWARE_CLUSTER, "no distributed shared memory");
                  static_assert(VIRTUAL_CLUSTER_SIZE != 1,
                                "layout error: should not add (step * MAX_NUM_GROUPS_PER_BLOCK)");
                  float2 const* src_shared_red_buffer = cg::this_cluster().map_shared_rank(shared_red_buffer, i);
                  val = src_shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + relative_group_idx];
                } else {
                  val = *reinterpret_cast<float2 const*>(
                      &red_buffer[((step * gridDim.y + blockIdx.y) * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK +
                                   src_virtual_block_idx_x * virtual_cluster_dim_y * MAX_NUM_GROUPS_PER_BLOCK +
                                   relative_group_idx * virtual_cluster_dim_y + i / virtual_cluster_dim_x) *
                                  2]);
                }
                sum_global_group.x += val.x;
                sum_global_group.y += val.y;
              }
            }
          }
        }
        if constexpr (USE_SHARED_RED_BUFFER && VIRTUAL_CLUSTER_SIZE > 1) {
          // Need cluster sync after distributed shared memory access, otherwise behavior is undefined
          if constexpr (PERSISTENT) {
            if (nc_scheduler.at_end(n)) {
              cg::this_cluster().barrier_arrive();
            }
          } else {
            cg::this_cluster().barrier_arrive();
          }
        }
        static_assert(32 % THREADS_PER_GROUP == 0, "cannot shuffle");
        for (int mask = THREADS_PER_GROUP / 2; mask > 0; mask >>= 1) {
          sum_global_group.x += __shfl_xor_sync(FINAL_MASK, sum_global_group.x, mask, 32);
          sum_global_group.y += __shfl_xor_sync(FINAL_MASK, sum_global_group.y, mask, 32);
        }
        if (threadIdx.x % THREADS_PER_GROUP == 0 && threadIdx.x / THREADS_PER_GROUP < MAX_NUM_GROUPS_PER_BLOCK) {
          mean_var[threadIdx.x / THREADS_PER_GROUP] = compute_mean_var(sum_global_group);
        }
        __syncthreads();
      }

      auto get_mean_var = [&](int relative_group_idx) {
        return STORE_MEAN_VAR_IN_SHARED_RED_BUFFER ? shared_red_buffer[relative_group_idx]
                                                   : mean_var[relative_group_idx];
      };

      if (mean_var_out) {
        static_assert(MAX_NUM_GROUPS_PER_BLOCK <= BLOCK_DIM_X, "need loop");
        if (virtual_block_idx_y == 0 && threadIdx.x < MAX_NUM_GROUPS_PER_BLOCK) {
          int g = block_group_start + threadIdx.x;
          if (C_PER_BLOCK % CPG == 0 || g < G) {
            *reinterpret_cast<float2*>(&mean_var_out[(n_loop * G + g) * 2]) = get_mean_var(threadIdx.x);
          }
        }
      }

      float frag_mean[VEC_ELEMS / GCD_VEC_CPG];
      float frag_var[VEC_ELEMS / GCD_VEC_CPG];
      for (int k = 0; k < VEC_ELEMS; k += GCD_VEC_CPG) {
        frag_mean[k / GCD_VEC_CPG] = get_mean_var((thread_channel_start + k) / CPG - block_group_start).x;
        frag_var[k / GCD_VEC_CPG] = get_mean_var((thread_channel_start + k) / CPG - block_group_start).y;
      }

      for (int j = 0; j < ROWS_PER_BLOCK / ROWS_PER_IO; j++) {
        int64_t input_idx =
            n_loop * HW * C +
            (virtual_block_idx_y * ROWS_PER_BLOCK + j * ROWS_PER_IO + threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)) * C +
            thread_channel_start;
        U val;
        if constexpr (LOAD_TWICE) {
          val = *reinterpret_cast<U const*>(&x[input_idx]);
        } else {
          val = frag[j];
        }
        for (int k = 0; k < VEC_ELEMS; k++) {
          float f = ((float)val.data[k] - frag_mean[k / GCD_VEC_CPG]) * rsqrtf(frag_var[k / GCD_VEC_CPG] + eps) *
                        (float)uw.data[k] +
                    (float)ub.data[k];
          if constexpr (SILU) f = f / (1.f + expf(-f));
          val.data[k] = f;
        }
        *reinterpret_cast<U*>(&out[input_idx]) = val;
      }

      if constexpr (!STORE_MEAN_VAR_IN_SHARED_RED_BUFFER && USE_SHARED_RED_BUFFER && VIRTUAL_CLUSTER_SIZE > 1) {
        if constexpr (PERSISTENT) {
          if (nc_scheduler.at_end(n)) {
            cg::this_cluster().barrier_wait();
          }
        } else {
          cg::this_cluster().barrier_wait();
        }
      }

      if constexpr (!PERSISTENT) {
        break;
      }
      step ^= 1;
    }
  }
}

enum WgradSyncMethod {
  WGRAD_ARRIVE_AND_WAIT_GRID = 0,  // grid arrive after the last virtual cluster sync
  WGRAD_ARRIVE_AND_WAIT_GROUP,     // group arrive after the last virtual cluster sync (a group sync means synchronizing
                                   // all clusters cooperating on the same groups)
  WGRAD_REUSE_SUM_SYNC_GRID,       // grid sync together with the last virtual cluster sync
  WGRAD_REUSE_SUM_SYNC_GROUP,      // group sync together with the last virtual cluster sync
  WGRAD_SYNC_AT_LAST,              // add a sync at the end of NC loops
  WGRAD_SYNC_UNSPECIFIED,
};

template <typename T, int BLOCK_DIM_X, int BLOCKS_PER_SM, int G, int CPG, int HW, bool SILU, bool REQUIRES_WGRAD,
          int ROWS_PER_BLOCK, int C_PER_BLOCK, int C_PER_CLUSTER, int VEC_ELEMS, bool PERSISTENT,
          int NUM_VIRTUAL_CLUSTERS, bool LOAD_TWICE, bool HARDWARE_CLUSTER, WgradSyncMethod wgrad_sync_method,
          class CompileCondition = CompileConditionAlwaysTrue>
__global__ __launch_bounds__(BLOCK_DIM_X, BLOCKS_PER_SM) void gn_bwd_cuda_kernel(
    T* __restrict__ grad_input, T* __restrict__ grad_weight, T* __restrict__ grad_bias,
    T const* __restrict__ grad_output, T const* __restrict__ x, T const* __restrict__ w, T const* __restrict__ b,
    float const* __restrict__ mean_var, float eps, int64_t n, float* __restrict__ red_buffer,
    unsigned* __restrict__ barrier) {
  // Procedure Overview
  //   1. Thread sum: read from gmem, write partial sum to smem, store input in registers (if no LOAD_TWICE)
  //   2. Block sum: read from smem, write partial sum to gmem (or distributed shared memory if HARDWARE_CLUSTER is
  //   used),
  //        write wgrad to gmem at the last loop (at each loop if not CONSTANT_C_LOOP)
  //   3. Group sum: read from gmem, write mean&var to smem
  //   4. Scale: read mean&var from smem, read input from gmem (if LOAD_TWICE), write output to gmem
  //   5. Wgrad sum: read from gmem, write to gmem

  static_assert(BLOCK_DIM_X % 32 == 0, "warp shuffle error");

  constexpr int C = G * CPG;
  static_assert(C % C_PER_CLUSTER == 0, "cannot divide channels into clusters");
  static_assert(C_PER_CLUSTER % C_PER_BLOCK == 0, "cannot divide a cluster into blocks");
  static_assert(C_PER_CLUSTER % CPG == 0, "no reduce between clusters, would produce incorrect results");
  static_assert(!(C_PER_BLOCK % CPG == 0 && C_PER_CLUSTER != C_PER_BLOCK),
                "inefficient configuration, please reduce C_PER_CLUSTER");

  static_assert(ROWS_PER_BLOCK * C_PER_BLOCK % BLOCK_DIM_X == 0, "cannot divide tile into threads");
  struct alignas(VEC_ELEMS * sizeof(T)) U {
    T data[VEC_ELEMS];
  };

  // This function computes mean_dyw and mean_xdyw.
  // The function name is not changed because it has the same logic as the forward pass.
  auto compute_mean_var = [&](float2 sum) {
    float mean_dyw = sum.x / (HW * CPG);
    float mean_xdyw = sum.y / (HW * CPG);
    return float2{mean_dyw, mean_xdyw};
  };

  static_assert(HW % ROWS_PER_BLOCK == 0,
                "HW must be divisible by ROWS_PER_BLOCK to determine the number of blocks on the HW axis");
  constexpr int MAX_NUM_GROUPS_PER_BLOCK =
      C_PER_BLOCK % CPG == 0 ? C_PER_BLOCK / CPG : up_div(C_PER_BLOCK - gcd(C_PER_BLOCK, CPG), CPG) + 1;
  constexpr int VIRTUAL_CLUSTER_SIZE = (C_PER_CLUSTER / C_PER_BLOCK) * (HW / ROWS_PER_BLOCK);
  constexpr int virtual_cluster_dim_x = C_PER_CLUSTER / C_PER_BLOCK;
  constexpr int virtual_cluster_dim_y = HW / ROWS_PER_BLOCK;
  int virtual_block_idx_x = (blockIdx.x % VIRTUAL_CLUSTER_SIZE) % virtual_cluster_dim_x;
  int virtual_block_idx_y = (blockIdx.x % VIRTUAL_CLUSTER_SIZE) / virtual_cluster_dim_x;

  if constexpr (CompileCondition::matches()) {
    int step = 0;
    constexpr bool CONSTANT_C_LOOP = PERSISTENT && NUM_VIRTUAL_CLUSTERS % (C / C_PER_CLUSTER) == 0;
    if constexpr (!CONSTANT_C_LOOP) {
      static_assert(wgrad_sync_method != WGRAD_ARRIVE_AND_WAIT_GROUP && wgrad_sync_method != WGRAD_REUSE_SUM_SYNC_GROUP,
                    "grid sync is required when each block is responsible for multiple channel ranges");
    }
    NCScheduler<false, C, C_PER_CLUSTER, NUM_VIRTUAL_CLUSTERS, PERSISTENT> nc_scheduler(
        n);  // TODO: I don't know why the template specialization with CONSTANT_C_LOOP=true is slower.

    [[maybe_unused]] int virtual_cluster_idx_c = blockIdx.y % (C / C_PER_CLUSTER);
    [[maybe_unused]] cg::grid_group::arrival_token wgrad_sync_token;
    [[maybe_unused]] float dw_thread[VEC_ELEMS];
    [[maybe_unused]] float db_thread[VEC_ELEMS];
    [[maybe_unused]] __shared__ union {
      float2 dwdb_block_buffer[BLOCK_DIM_X][VEC_ELEMS];
      struct {
        float wgrad_buffer[BLOCK_DIM_X / 32][32];
        float bgrad_buffer[BLOCK_DIM_X / 32][32];
      } transpose_buffer;
    } union_smem;
    if constexpr (REQUIRES_WGRAD && CONSTANT_C_LOOP) {
      for (int i = 0; i < VEC_ELEMS; i++) {
        dw_thread[i] = 0.f;
        db_thread[i] = 0.f;
      }
    }
    float* red_buffer_wgrad =
        &red_buffer[(2 * NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK) * 2];
    unsigned* barrier_wgrad = barrier + NUM_VIRTUAL_CLUSTERS;
    if constexpr (REQUIRES_WGRAD && wgrad_sync_method != WGRAD_SYNC_AT_LAST) {
      if (nc_scheduler.at_end(n)) {
        static_assert(PERSISTENT, "persistent is a must for reducing wgrad");
        if constexpr (wgrad_sync_method == WGRAD_ARRIVE_AND_WAIT_GRID) {
          wgrad_sync_token = group_barrier_arrive<NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE, PERSISTENT>(
              barrier_wgrad, blockIdx.x + blockIdx.y == 0);
        } else if constexpr (wgrad_sync_method == WGRAD_ARRIVE_AND_WAIT_GROUP) {
          wgrad_sync_token =
              group_barrier_arrive<NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE / (C / C_PER_CLUSTER), PERSISTENT>(
                  barrier_wgrad + virtual_cluster_idx_c, blockIdx.x + blockIdx.y / (C / C_PER_CLUSTER) == 0);
        } else if constexpr (wgrad_sync_method == WGRAD_REUSE_SUM_SYNC_GRID) {
          wgrad_sync_token = group_barrier_arrive<NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE, PERSISTENT>(
              barrier_wgrad, blockIdx.x + blockIdx.y == 0);
          group_barrier_wait(barrier_wgrad, wgrad_sync_token);
        } else if constexpr (wgrad_sync_method == WGRAD_REUSE_SUM_SYNC_GROUP) {
          wgrad_sync_token =
              group_barrier_arrive<NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE / (C / C_PER_CLUSTER), PERSISTENT>(
                  barrier_wgrad + virtual_cluster_idx_c, blockIdx.x + blockIdx.y / (C / C_PER_CLUSTER) == 0);
          group_barrier_wait(barrier_wgrad + virtual_cluster_idx_c, wgrad_sync_token);
        }
      }
    }

    while (true) {  // TODO: unroll the loop
      if constexpr (PERSISTENT) {
        if (nc_scheduler.at_end(n)) {
          break;
        }
      }
      auto [n_loop, c_loop] = nc_scheduler.get_nc();
      if constexpr (PERSISTENT) {
        nc_scheduler.next(n);
      }
      static_assert(C_PER_BLOCK % VEC_ELEMS == 0, "cannot vectorize");
      static_assert((BLOCK_DIM_X * VEC_ELEMS) % C_PER_BLOCK == 0,
                    "each block should load one or more C_PER_BLOCK at once");
      constexpr int ROWS_PER_IO = BLOCK_DIM_X * VEC_ELEMS / C_PER_BLOCK;
      static_assert(ROWS_PER_BLOCK % ROWS_PER_IO == 0, "cannot determine the IO times per batch");
      int block_channel_start = virtual_block_idx_x * C_PER_BLOCK + c_loop * C_PER_CLUSTER;
      int block_group_start = block_channel_start / CPG;
      int thread_channel_start = block_channel_start + threadIdx.x % (C_PER_BLOCK / VEC_ELEMS) * VEC_ELEMS;
      U frag_x[ROWS_PER_BLOCK / ROWS_PER_IO];
      U frag_dy[ROWS_PER_BLOCK / ROWS_PER_IO];

      constexpr int GCD_VEC_CPG = gcd(VEC_ELEMS, CPG);

      constexpr bool SINGLE_GROUP_PER_BLOCK = CPG % C_PER_BLOCK == 0;
      [[maybe_unused]] __shared__ float2 sum_per_channel_multi_group[C_PER_BLOCK / GCD_VEC_CPG][relative_prime(
          128 / (int)sizeof(float2), ROWS_PER_IO)];
      [[maybe_unused]] __shared__ float2 sum_per_channel_single_group[BLOCK_DIM_X / 32];

      float frag_mean[VEC_ELEMS / GCD_VEC_CPG];
      float frag_var[VEC_ELEMS / GCD_VEC_CPG];
      for (int k = 0; k < VEC_ELEMS; k += GCD_VEC_CPG) {
        float2 value = *reinterpret_cast<float2 const*>(&mean_var[(n_loop * G + (thread_channel_start + k) / CPG) * 2]);
        frag_mean[k / GCD_VEC_CPG] = value.x;
        frag_var[k / GCD_VEC_CPG] = value.y;
      }

      U uw = *reinterpret_cast<U const*>(&w[thread_channel_start]);
      U ub;
      if constexpr (SILU) {
        ub = *reinterpret_cast<U const*>(&b[thread_channel_start]);
      }
      if constexpr (REQUIRES_WGRAD && !CONSTANT_C_LOOP) {
        for (int i = 0; i < VEC_ELEMS; i++) {
          dw_thread[i] = 0.f;
          db_thread[i] = 0.f;
        }
      }

      if constexpr (LOAD_TWICE) {
        float2 frag_sum_per_channel[VEC_ELEMS / GCD_VEC_CPG]{};
        for (int j = 0; j < ROWS_PER_BLOCK / ROWS_PER_IO; j++) {
          int64_t input_idx =
              n_loop * HW * C +
              (virtual_block_idx_y * ROWS_PER_BLOCK + j * ROWS_PER_IO + threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)) * C +
              thread_channel_start;
          U ux = *reinterpret_cast<U const*>(&x[input_idx]);
          U udy = *reinterpret_cast<U const*>(&grad_output[input_idx]);
          for (int i = 0; i < VEC_ELEMS / GCD_VEC_CPG; i++) {
            float2 sum = frag_sum_per_channel[i];
            for (int k = 0; k < GCD_VEC_CPG; k++) {
              float rnorm = rsqrtf(frag_var[i] + eps);
              float x_norm =
                  ((float)ux.data[i * GCD_VEC_CPG + k] - frag_mean[i]) * rnorm;  // TODO: store rsqrtf in mean_var
              float grad_gn = udy.data[i * GCD_VEC_CPG + k];
              if constexpr (SILU) {
                float x_gn = x_norm * (float)uw.data[i * GCD_VEC_CPG + k] + (float)ub.data[i * GCD_VEC_CPG + k];
                float s = 1.f / (1.f + expf(-x_gn));
                grad_gn *= s * (1.f + x_gn * (1.f - s));
              }
              sum.x += grad_gn * (float)uw.data[i * GCD_VEC_CPG + k];
              sum.y += x_norm * (grad_gn * (float)uw.data[i * GCD_VEC_CPG + k]);
              if constexpr (REQUIRES_WGRAD) {
                dw_thread[i * GCD_VEC_CPG + k] += x_norm * grad_gn;
                db_thread[i * GCD_VEC_CPG + k] += grad_gn;
              }
            }
            frag_sum_per_channel[i] = sum;
          }
        }
        for (int i = 0; i < VEC_ELEMS / GCD_VEC_CPG; i++) {
          if constexpr (SINGLE_GROUP_PER_BLOCK) {
            for (int mask = 16; mask > 0; mask >>= 1) {
              frag_sum_per_channel[i].x += __shfl_xor_sync(FINAL_MASK, frag_sum_per_channel[i].x, mask, 32);
              frag_sum_per_channel[i].y += __shfl_xor_sync(FINAL_MASK, frag_sum_per_channel[i].y, mask, 32);
            }
            static_assert(VEC_ELEMS / GCD_VEC_CPG == 1, "process only one element for each warp");
            if (threadIdx.x % 32 == 0) {
              sum_per_channel_single_group[threadIdx.x / 32] = frag_sum_per_channel[i];
            }
          } else {
            sum_per_channel_multi_group[i * (C_PER_BLOCK / VEC_ELEMS) + threadIdx.x % (C_PER_BLOCK / VEC_ELEMS)]
                                       [threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)] = frag_sum_per_channel[i];
          }
        }
        __syncthreads();
      } else {
        for (int j = 0; j < ROWS_PER_BLOCK / ROWS_PER_IO; j++) {
          int64_t input_idx =
              n_loop * HW * C +
              (virtual_block_idx_y * ROWS_PER_BLOCK + j * ROWS_PER_IO + threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)) * C +
              thread_channel_start;
          frag_x[j] = *reinterpret_cast<U const*>(&x[input_idx]);
          frag_dy[j] = *reinterpret_cast<U const*>(&grad_output[input_idx]);
        }

        for (int i = 0; i < VEC_ELEMS / GCD_VEC_CPG; i++) {
          float2 sum = {0.f, 0.f};
          for (int j = 0; j < ROWS_PER_BLOCK / ROWS_PER_IO; j++) {
            for (int k = 0; k < GCD_VEC_CPG; k++) {
              float rnorm = rsqrtf(frag_var[i] + eps);
              float x_norm = ((float)frag_x[j].data[i * GCD_VEC_CPG + k] - frag_mean[i]) *
                             rnorm;  // TODO: store rsqrtf in mean_var
              float grad_gn = frag_dy[j].data[i * GCD_VEC_CPG + k];
              if constexpr (SILU) {
                float x_gn = x_norm * (float)uw.data[i * GCD_VEC_CPG + k] + (float)ub.data[i * GCD_VEC_CPG + k];
                float s = 1.f / (1.f + expf(-x_gn));
                grad_gn *= s * (1.f + x_gn * (1.f - s));
              }
              sum.x += grad_gn * (float)uw.data[i * GCD_VEC_CPG + k];
              sum.y += x_norm * (grad_gn * (float)uw.data[i * GCD_VEC_CPG + k]);
              if constexpr (REQUIRES_WGRAD) {
                dw_thread[i * GCD_VEC_CPG + k] += x_norm * grad_gn;
                db_thread[i * GCD_VEC_CPG + k] += grad_gn;
              }
            }
          }
          if constexpr (SINGLE_GROUP_PER_BLOCK) {
            for (int mask = 16; mask > 0; mask >>= 1) {
              sum.x += __shfl_xor_sync(FINAL_MASK, sum.x, mask, 32);
              sum.y += __shfl_xor_sync(FINAL_MASK, sum.y, mask, 32);
            }
            static_assert(VEC_ELEMS / GCD_VEC_CPG == 1, "process only one element for each warp");
            if (threadIdx.x % 32 == 0) {
              sum_per_channel_single_group[threadIdx.x / 32] = sum;
            }
          } else {
            sum_per_channel_multi_group[i * (C_PER_BLOCK / VEC_ELEMS) + threadIdx.x % (C_PER_BLOCK / VEC_ELEMS)]
                                       [threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)] = sum;
          }
        }
        __syncthreads();
      }

      if ((CONSTANT_C_LOOP && nc_scheduler.at_end(n)) || !CONSTANT_C_LOOP) {
        constexpr int NT_C = max_divisor(C_PER_BLOCK, BLOCK_DIM_X);  // Number of threads on the C axis
        constexpr int NT_R =
            1;  // std::min(32, (int)round_down_pow2(BLOCK_DIM_X / NT_C));  // Number of threads on the ROWS axis
        // TODO: swizzle for NT_R
        for (int i = 0; i < VEC_ELEMS; i++) {
          union_smem.dwdb_block_buffer[threadIdx.x][i ^ ((threadIdx.x / (16 / VEC_ELEMS)) & (VEC_ELEMS - 1))] =
              float2{dw_thread[i], db_thread[i]};
        }
        __syncthreads();
        static_assert(NT_C * NT_R <= BLOCK_DIM_X, "not enough threads");
        static_assert(C_PER_BLOCK % NT_C == 0, "need to loop once more and check c < C_PER_BLOCK");
        for (int i = 0; i < C_PER_BLOCK / NT_C; i++) {
          int c = i * NT_C + threadIdx.x / NT_R;
          float dw_block = 0.f;
          float db_block = 0.f;
          if (BLOCK_DIM_X == NT_C * NT_R || threadIdx.x < NT_C * NT_R) {
            for (int j = threadIdx.x % NT_R; j < ROWS_PER_IO; j += NT_R) {
              int src_thread = j * (C_PER_BLOCK / VEC_ELEMS) + c / VEC_ELEMS;
              float2 val = union_smem.dwdb_block_buffer[src_thread][(c % VEC_ELEMS) ^ ((src_thread / (16 / VEC_ELEMS)) &
                                                                                       (VEC_ELEMS - 1))];
              dw_block += val.x;
              db_block += val.y;
            }
          }
          static_assert(32 % NT_R == 0, "cannot shuffle");
          for (int mask = NT_R / 2; mask > 0; mask >>= 1) {
            dw_block += __shfl_xor_sync(FINAL_MASK, dw_block, mask, 32);
            db_block += __shfl_xor_sync(FINAL_MASK, db_block, mask, 32);
          }
          if (BLOCK_DIM_X == NT_C * NT_R || threadIdx.x < NT_C * NT_R) {
            if (threadIdx.x % NT_R == 0) {
              if constexpr (CONSTANT_C_LOOP) {
                *reinterpret_cast<float2*>(
                    &red_buffer_wgrad
                        [((blockIdx.y / (C / C_PER_CLUSTER) * virtual_cluster_dim_y + virtual_block_idx_y) * C +
                          c_loop * C_PER_CLUSTER + virtual_block_idx_x * C_PER_BLOCK + c) *
                         2]) = float2{dw_block, db_block};
              } else {
                *reinterpret_cast<float2*>(
                    &red_buffer_wgrad[((n_loop * virtual_cluster_dim_y + virtual_block_idx_y) * C +
                                       c_loop * C_PER_CLUSTER + virtual_block_idx_x * C_PER_BLOCK + c) *
                                      2]) = float2{dw_block, db_block};
              }
            }
          }
        }
      }

      constexpr bool USE_SHARED_RED_BUFFER = HARDWARE_CLUSTER || VIRTUAL_CLUSTER_SIZE == 1;
      constexpr bool STORE_MEAN_VAR_IN_SHARED_RED_BUFFER =
          VIRTUAL_CLUSTER_SIZE == 1 &&
          MAX_NUM_GROUPS_PER_BLOCK == 1;  // MAX_NUM_GROUPS_PER_BLOCK > 1 is possible but not implemented
      [[maybe_unused]] __align__(16)
          __shared__ float2 shared_red_buffer[MAX_NUM_GROUPS_PER_BLOCK * (STORE_MEAN_VAR_IN_SHARED_RED_BUFFER ? 1 : 2)];

      // Block sum
      if constexpr (SINGLE_GROUP_PER_BLOCK) {
        // block reduce
        if (threadIdx.x < 32) {
          float2 sum_local_group =
              threadIdx.x < BLOCK_DIM_X / 32 ? sum_per_channel_single_group[threadIdx.x] : float2{0.f, 0.f};
          constexpr int warp_num_pow2 = round_up_pow2(BLOCK_DIM_X / 32);
          for (int mask = warp_num_pow2 / 2; mask > 0; mask >>= 1) {
            sum_local_group.x += __shfl_xor_sync(FINAL_MASK, sum_local_group.x, mask, 32);
            sum_local_group.y += __shfl_xor_sync(FINAL_MASK, sum_local_group.y, mask, 32);
          }
          if (threadIdx.x == 0) {
            if constexpr (USE_SHARED_RED_BUFFER) {
              if constexpr (STORE_MEAN_VAR_IN_SHARED_RED_BUFFER) {
                shared_red_buffer[0] = compute_mean_var(sum_local_group);
              } else {
                shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + 0] = sum_local_group;
              }
            } else {
              *reinterpret_cast<float2*>(
                  &red_buffer[((step * gridDim.y + blockIdx.y) * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK +
                               virtual_block_idx_x * virtual_cluster_dim_y * MAX_NUM_GROUPS_PER_BLOCK +
                               // (threadIdx.x / THREADS_PER_GROUP) * virtual_cluster_dim_y +
                               virtual_block_idx_y) *
                              2]) = sum_local_group;
            }
          }
        }
      } else {
        // The number of threads to calculate the sum of each group (should be a power of 2 for warp reduce)
        constexpr int THREADS_PER_GROUP = std::min(std::min(32U, round_up_pow2(ROWS_PER_IO)),
                                                   round_up_pow2(BLOCK_DIM_X / MAX_NUM_GROUPS_PER_BLOCK / 2 + 1));
        static_assert(BLOCK_DIM_X >= MAX_NUM_GROUPS_PER_BLOCK * THREADS_PER_GROUP, "not enough threads");
        float2 sum_local_group = {0.f, 0.f};
        if (threadIdx.x / THREADS_PER_GROUP < MAX_NUM_GROUPS_PER_BLOCK) {
          int local_group_idx = block_group_start + threadIdx.x / THREADS_PER_GROUP;
          // TODO: map threads to both the CPG loop and the ROWS loop
          for (int local_c_loop = 0; local_c_loop < CPG; local_c_loop += GCD_VEC_CPG) {
            int c = local_group_idx * CPG + local_c_loop;
            if (C_PER_BLOCK % CPG == 0 || (c >= block_channel_start && c < block_channel_start + C_PER_BLOCK)) {
              for (int src_thread_tile_y = threadIdx.x % THREADS_PER_GROUP; src_thread_tile_y < ROWS_PER_IO;
                   src_thread_tile_y += THREADS_PER_GROUP) {
                int channel_idx = (c - block_channel_start) / GCD_VEC_CPG;
                channel_idx = channel_idx % (VEC_ELEMS / GCD_VEC_CPG) * (C_PER_BLOCK / VEC_ELEMS) +
                              channel_idx / (VEC_ELEMS / GCD_VEC_CPG);
                sum_local_group.x += sum_per_channel_multi_group[channel_idx][src_thread_tile_y].x;
                sum_local_group.y += sum_per_channel_multi_group[channel_idx][src_thread_tile_y].y;
              }
            }
          }
        }
        static_assert(32 % THREADS_PER_GROUP == 0, "cannot shuffle");
        for (int mask = THREADS_PER_GROUP / 2; mask > 0; mask >>= 1) {
          sum_local_group.x += __shfl_xor_sync(FINAL_MASK, sum_local_group.x, mask, 32);
          sum_local_group.y += __shfl_xor_sync(FINAL_MASK, sum_local_group.y, mask, 32);
        }
        if (threadIdx.x % THREADS_PER_GROUP == 0 && threadIdx.x / THREADS_PER_GROUP < MAX_NUM_GROUPS_PER_BLOCK) {
          if constexpr (USE_SHARED_RED_BUFFER) {
            static_assert(HARDWARE_CLUSTER || VIRTUAL_CLUSTER_SIZE == 1, "no distributed shared memory");
            if constexpr (STORE_MEAN_VAR_IN_SHARED_RED_BUFFER) {
              shared_red_buffer[threadIdx.x / THREADS_PER_GROUP] = compute_mean_var(sum_local_group);
            } else {
              shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + threadIdx.x / THREADS_PER_GROUP] = sum_local_group;
            }
          } else {
            *reinterpret_cast<float2*>(
                &red_buffer[((step * gridDim.y + blockIdx.y) * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK +
                             virtual_block_idx_x * virtual_cluster_dim_y * MAX_NUM_GROUPS_PER_BLOCK +
                             (threadIdx.x / THREADS_PER_GROUP) * virtual_cluster_dim_y + virtual_block_idx_y) *
                            2]) = sum_local_group;
          }
        }
      }

      if constexpr (REQUIRES_WGRAD && wgrad_sync_method != WGRAD_SYNC_AT_LAST) {
        if (nc_scheduler.at_end(n)) {
          static_assert(PERSISTENT, "persistent is a must for reducing wgrad");
          if constexpr (wgrad_sync_method == WGRAD_ARRIVE_AND_WAIT_GRID) {
            virtual_cluster_sync<VIRTUAL_CLUSTER_SIZE, PERSISTENT, HARDWARE_CLUSTER>(barrier);
            wgrad_sync_token = group_barrier_arrive<NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE, PERSISTENT>(
                barrier_wgrad, blockIdx.x + blockIdx.y == 0);
          } else if constexpr (wgrad_sync_method == WGRAD_ARRIVE_AND_WAIT_GROUP) {
            virtual_cluster_sync<VIRTUAL_CLUSTER_SIZE, PERSISTENT, HARDWARE_CLUSTER>(barrier);
            wgrad_sync_token =
                group_barrier_arrive<NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE / (C / C_PER_CLUSTER), PERSISTENT>(
                    barrier_wgrad + virtual_cluster_idx_c, blockIdx.x + blockIdx.y / (C / C_PER_CLUSTER) == 0);
          } else if constexpr (wgrad_sync_method == WGRAD_REUSE_SUM_SYNC_GRID) {
            static_assert(!HARDWARE_CLUSTER,
                          "Distributed smem sync cannot reuse gmem sync. Use WGRAD_ARRIVE_AND_WAIT_GRID instead.");
            wgrad_sync_token = group_barrier_arrive<NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE, PERSISTENT>(
                barrier_wgrad, blockIdx.x + blockIdx.y == 0);
            group_barrier_wait(barrier_wgrad, wgrad_sync_token);
          } else if constexpr (wgrad_sync_method == WGRAD_REUSE_SUM_SYNC_GROUP) {
            static_assert(!HARDWARE_CLUSTER,
                          "Distributed smem sync cannot reuse gmem sync. Use WGRAD_ARRIVE_AND_WAIT_GROUP instead.");
            wgrad_sync_token =
                group_barrier_arrive<NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE / (C / C_PER_CLUSTER), PERSISTENT>(
                    barrier_wgrad + virtual_cluster_idx_c, blockIdx.x + blockIdx.y / (C / C_PER_CLUSTER) == 0);
            group_barrier_wait(barrier_wgrad + virtual_cluster_idx_c, wgrad_sync_token);
          }
        } else {
          virtual_cluster_sync<VIRTUAL_CLUSTER_SIZE, PERSISTENT, HARDWARE_CLUSTER>(barrier);
        }
      } else {
        virtual_cluster_sync<VIRTUAL_CLUSTER_SIZE, PERSISTENT, HARDWARE_CLUSTER>(barrier);
      }

      // Group sum
      __shared__ float2 mean_var[MAX_NUM_GROUPS_PER_BLOCK];
      if constexpr (!STORE_MEAN_VAR_IN_SHARED_RED_BUFFER) {
        // The number of threads to calculate the sum of each group (should be a power of 2 for warp reduce)
        constexpr int THREADS_PER_GROUP = std::min(std::min(32U, round_up_pow2(virtual_cluster_dim_y)),
                                                   round_up_pow2(BLOCK_DIM_X / MAX_NUM_GROUPS_PER_BLOCK / 2 + 1));
        static_assert(BLOCK_DIM_X >= MAX_NUM_GROUPS_PER_BLOCK * THREADS_PER_GROUP, "not enough threads");
        float2 sum_global_group = {0.f, 0.f};
        if (threadIdx.x / THREADS_PER_GROUP < MAX_NUM_GROUPS_PER_BLOCK) {
          if constexpr (C_PER_BLOCK % CPG == 0) {
            // Special case: no cross-virtual_cluster_dim_x reduction
            float2 buffer[up_div(virtual_cluster_dim_y, THREADS_PER_GROUP)];
            for (int i = threadIdx.x % THREADS_PER_GROUP; i < virtual_cluster_dim_y; i += THREADS_PER_GROUP) {
              float2 val;
              if constexpr (USE_SHARED_RED_BUFFER) {
                if constexpr (VIRTUAL_CLUSTER_SIZE == 1) {
                  val = shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + threadIdx.x / THREADS_PER_GROUP];
                } else {
                  static_assert(HARDWARE_CLUSTER, "no distributed shared memory");
                  float2 const* src_shared_red_buffer = cg::this_cluster().map_shared_rank(
                      shared_red_buffer, i * virtual_cluster_dim_x + virtual_block_idx_x);
                  val = src_shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + threadIdx.x / THREADS_PER_GROUP];
                }
              } else {
                val = *reinterpret_cast<float2 const*>(
                    &red_buffer[((step * gridDim.y + blockIdx.y) * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK +
                                 virtual_block_idx_x * virtual_cluster_dim_y * MAX_NUM_GROUPS_PER_BLOCK +
                                 (threadIdx.x / THREADS_PER_GROUP) * virtual_cluster_dim_y + i) *
                                2]);
              }
              buffer[i / THREADS_PER_GROUP] = val;
            }
            for (int i = threadIdx.x % THREADS_PER_GROUP; i < virtual_cluster_dim_y; i += THREADS_PER_GROUP) {
              float2 val = buffer[i / THREADS_PER_GROUP];
              sum_global_group.x += val.x;
              sum_global_group.y += val.y;
            }
          } else {
            // Common case: cross-virtual_cluster_dim_x reduction
            int local_group_idx = block_group_start + threadIdx.x / THREADS_PER_GROUP;
            for (int i = threadIdx.x % THREADS_PER_GROUP; i < VIRTUAL_CLUSTER_SIZE; i += THREADS_PER_GROUP) {
              int src_virtual_block_idx_x = i % virtual_cluster_dim_x;
              int src_block_channel_start = src_virtual_block_idx_x * C_PER_BLOCK + c_loop * C_PER_CLUSTER;
              int src_block_group_start = src_block_channel_start / CPG;
              int relative_group_idx = local_group_idx - src_block_group_start;
              if (0 <= relative_group_idx && relative_group_idx < MAX_NUM_GROUPS_PER_BLOCK) {
                float2 val;
                if constexpr (USE_SHARED_RED_BUFFER) {
                  static_assert(HARDWARE_CLUSTER, "no distributed shared memory");
                  static_assert(VIRTUAL_CLUSTER_SIZE != 1,
                                "layout error: should not add (step * MAX_NUM_GROUPS_PER_BLOCK)");
                  float2 const* src_shared_red_buffer = cg::this_cluster().map_shared_rank(shared_red_buffer, i);
                  val = src_shared_red_buffer[step * MAX_NUM_GROUPS_PER_BLOCK + relative_group_idx];
                } else {
                  val = *reinterpret_cast<float2 const*>(
                      &red_buffer[((step * gridDim.y + blockIdx.y) * VIRTUAL_CLUSTER_SIZE * MAX_NUM_GROUPS_PER_BLOCK +
                                   src_virtual_block_idx_x * virtual_cluster_dim_y * MAX_NUM_GROUPS_PER_BLOCK +
                                   relative_group_idx * virtual_cluster_dim_y + i / virtual_cluster_dim_x) *
                                  2]);
                }
                sum_global_group.x += val.x;
                sum_global_group.y += val.y;
              }
            }
          }
        }
        if constexpr (USE_SHARED_RED_BUFFER && VIRTUAL_CLUSTER_SIZE > 1) {
          // Need cluster sync after distributed shared memory access, otherwise behavior is undefined
          if constexpr (PERSISTENT) {
            if (nc_scheduler.at_end(n)) {
              cg::this_cluster().barrier_arrive();
            }
          } else {
            cg::this_cluster().barrier_arrive();
          }
        }
        static_assert(32 % THREADS_PER_GROUP == 0, "cannot shuffle");
        for (int mask = THREADS_PER_GROUP / 2; mask > 0; mask >>= 1) {
          sum_global_group.x += __shfl_xor_sync(FINAL_MASK, sum_global_group.x, mask, 32);
          sum_global_group.y += __shfl_xor_sync(FINAL_MASK, sum_global_group.y, mask, 32);
        }
        if (threadIdx.x % THREADS_PER_GROUP == 0 && threadIdx.x / THREADS_PER_GROUP < MAX_NUM_GROUPS_PER_BLOCK) {
          mean_var[threadIdx.x / THREADS_PER_GROUP] = compute_mean_var(sum_global_group);
        }
        __syncthreads();
      }

      auto get_mean_var = [&](int relative_group_idx) {
        return STORE_MEAN_VAR_IN_SHARED_RED_BUFFER ? shared_red_buffer[relative_group_idx]
                                                   : mean_var[relative_group_idx];
      };

      float frag_dyw[VEC_ELEMS / GCD_VEC_CPG];
      float frag_xdyw[VEC_ELEMS / GCD_VEC_CPG];
      for (int k = 0; k < VEC_ELEMS; k += GCD_VEC_CPG) {
        frag_dyw[k / GCD_VEC_CPG] = get_mean_var((thread_channel_start + k) / CPG - block_group_start).x;
        frag_xdyw[k / GCD_VEC_CPG] = get_mean_var((thread_channel_start + k) / CPG - block_group_start).y;
      }

      for (int j = 0; j < ROWS_PER_BLOCK / ROWS_PER_IO; j++) {
        int64_t input_idx =
            n_loop * HW * C +
            (virtual_block_idx_y * ROWS_PER_BLOCK + j * ROWS_PER_IO + threadIdx.x / (C_PER_BLOCK / VEC_ELEMS)) * C +
            thread_channel_start;
        U ux;
        U udy;
        if constexpr (LOAD_TWICE) {
          ux = *reinterpret_cast<U const*>(&x[input_idx]);
          udy = *reinterpret_cast<U const*>(&grad_output[input_idx]);
        } else {
          ux = frag_x[j];
          udy = frag_dy[j];
        }
        U val;
        for (int k = 0; k < VEC_ELEMS; k++) {
          float rnorm = rsqrtf(frag_var[k / GCD_VEC_CPG] + eps);
          float x_norm = ((float)ux.data[k] - frag_mean[k / GCD_VEC_CPG]) * rnorm;  // TODO: store rsqrtf in mean_var
          float grad_gn = udy.data[k];
          if constexpr (SILU) {
            float x_gn = x_norm * (float)uw.data[k] + (float)ub.data[k];
            float s = 1.f / (1.f + expf(-x_gn));
            grad_gn *= s * (1.f + x_gn * (1.f - s));
          }
          val.data[k] =
              (grad_gn * (float)uw.data[k] - frag_dyw[k / GCD_VEC_CPG] - frag_xdyw[k / GCD_VEC_CPG] * x_norm) * rnorm;
        }
        *reinterpret_cast<U*>(&grad_input[input_idx]) = val;
      }

      if constexpr (!STORE_MEAN_VAR_IN_SHARED_RED_BUFFER && USE_SHARED_RED_BUFFER && VIRTUAL_CLUSTER_SIZE > 1) {
        if constexpr (PERSISTENT) {
          if (nc_scheduler.at_end(n)) {
            cg::this_cluster().barrier_wait();
          }
        } else {
          cg::this_cluster().barrier_wait();
        }
      }

      if constexpr (!PERSISTENT) {
        break;
      }
      step ^= 1;
    }

    // Wgrad sum
    if constexpr (REQUIRES_WGRAD) {
      static_assert(PERSISTENT, "cannot reduce wgrad");
      static_assert(C % 32 == 0, "cannot reduce wgrad");
      if constexpr (wgrad_sync_method == WGRAD_ARRIVE_AND_WAIT_GRID) {
        group_barrier_wait(barrier_wgrad, wgrad_sync_token);
      } else if constexpr (wgrad_sync_method == WGRAD_ARRIVE_AND_WAIT_GROUP) {
        group_barrier_wait(barrier_wgrad + virtual_cluster_idx_c, wgrad_sync_token);
      } else if constexpr (wgrad_sync_method == WGRAD_SYNC_AT_LAST) {
        cg::this_grid().sync();
      }

      // If group sync, map blocks that are responsible for the same range of channels to these channels (named "split
      // channels"); otherwise, map all blocks to all channels.
      constexpr bool split_channels =
          wgrad_sync_method == WGRAD_ARRIVE_AND_WAIT_GROUP || wgrad_sync_method == WGRAD_REUSE_SUM_SYNC_GROUP;

      for (int c = split_channels ? virtual_cluster_idx_c * C_PER_CLUSTER +
                                        32 * (blockIdx.y / (C / C_PER_CLUSTER) * VIRTUAL_CLUSTER_SIZE + blockIdx.x)
                                  : 32 * (blockIdx.y * VIRTUAL_CLUSTER_SIZE + blockIdx.x);
           split_channels ? c < (virtual_cluster_idx_c + 1) * C_PER_CLUSTER : c < C;
           c += split_channels ? 32 * (NUM_VIRTUAL_CLUSTERS / (C / C_PER_CLUSTER) * VIRTUAL_CLUSTER_SIZE)
                               : 32 * (NUM_VIRTUAL_CLUSTERS * VIRTUAL_CLUSTER_SIZE)) {
        int64_t rows = (CONSTANT_C_LOOP ? std::min(n, (int64_t)NUM_VIRTUAL_CLUSTERS / (C / C_PER_CLUSTER)) : n) *
                       virtual_cluster_dim_y;
        float sum_wgrad = 0.f;
        float sum_bgrad = 0.f;
        if ((split_channels &&
             (C_PER_CLUSTER % 32 == 0 || c + threadIdx.x % 32 < (virtual_cluster_idx_c + 1) * C_PER_CLUSTER)) ||
            (!split_channels && (C % 32 == 0 || c + threadIdx.x % 32 < C))) {
          for (int64_t i = threadIdx.x / 32; i < rows; i += BLOCK_DIM_X / 32) {
            float2 val = *reinterpret_cast<float2 const*>(&red_buffer_wgrad[(i * C + c + threadIdx.x % 32) * 2]);
            sum_wgrad += val.x;
            sum_bgrad += val.y;
          }
        }
        constexpr int warp_num_pow2 = round_up_pow2(BLOCK_DIM_X / 32);
        union_smem.transpose_buffer
            .wgrad_buffer[threadIdx.x / 32][(threadIdx.x % 32) ^ ((threadIdx.x / 32) * (32 / warp_num_pow2))] =
            sum_wgrad;
        union_smem.transpose_buffer
            .bgrad_buffer[threadIdx.x / 32][(threadIdx.x % 32) ^ ((threadIdx.x / 32) * (32 / warp_num_pow2))] =
            sum_bgrad;
        __syncthreads();
        for (int i = threadIdx.x / warp_num_pow2;
             i < 32 &&
             ((split_channels && (C_PER_CLUSTER % 32 == 0 || c + i < (virtual_cluster_idx_c + 1) * C_PER_CLUSTER)) ||
              (!split_channels && (C % 32 == 0 || c + i < C)));
             i += BLOCK_DIM_X / warp_num_pow2) {
          int j = threadIdx.x % warp_num_pow2;
          float sum_wgrad =
              j < BLOCK_DIM_X / 32 ? union_smem.transpose_buffer.wgrad_buffer[j][i ^ (j * (32 / warp_num_pow2))] : 0.f;
          float sum_bgrad =
              j < BLOCK_DIM_X / 32 ? union_smem.transpose_buffer.bgrad_buffer[j][i ^ (j * (32 / warp_num_pow2))] : 0.f;
          for (int mask = warp_num_pow2 / 2; mask > 0; mask >>= 1) {
            sum_wgrad += __shfl_xor_sync((uint64_t(1) << warp_num_pow2) - 1, sum_wgrad, mask, warp_num_pow2);
            sum_bgrad += __shfl_xor_sync((uint64_t(1) << warp_num_pow2) - 1, sum_bgrad, mask, warp_num_pow2);
          }
          if (j == 0) {
            grad_weight[c + i] = sum_wgrad;
            grad_bias[c + i] = sum_bgrad;
          }
        }
        __syncthreads();
      }
    }
  }
}

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_dispatch_hw_c.hpp
================================================
#pragma once

#define DISPATCH_HW_C(hw, c, HW, C, ...)                                                          \
  [&] {                                                                                           \
    if (hw == 64 && c == 1280) {                                                                  \
      constexpr int HW = 64, C = 1280;                                                            \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 64 && c == 2560) {                                                                  \
      constexpr int HW = 64, C = 2560;                                                            \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 256 && c == 640) {                                                                  \
      constexpr int HW = 256, C = 640;                                                            \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 256 && c == 1280) {                                                                 \
      constexpr int HW = 256, C = 1280;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 256 && c == 1920) {                                                                 \
      constexpr int HW = 256, C = 1920;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 256 && c == 2560) {                                                                 \
      constexpr int HW = 256, C = 2560;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 1024 && c == 320) {                                                                 \
      constexpr int HW = 1024, C = 320;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 1024 && c == 640) {                                                                 \
      constexpr int HW = 1024, C = 640;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 1024 && c == 960) {                                                                 \
      constexpr int HW = 1024, C = 960;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 1024 && c == 1280) {                                                                \
      constexpr int HW = 1024, C = 1280;                                                          \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 1024 && c == 1920) {                                                                \
      constexpr int HW = 1024, C = 1920;                                                          \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 4096 && c == 320) {                                                                 \
      constexpr int HW = 4096, C = 320;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 4096 && c == 640) {                                                                 \
      constexpr int HW = 4096, C = 640;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    if (hw == 4096 && c == 960) {                                                                 \
      constexpr int HW = 4096, C = 960;                                                           \
      return __VA_ARGS__();                                                                       \
    }                                                                                             \
    throw std::invalid_argument("DISPATCH_HW_C " + std::to_string(hw) + " " + std::to_string(c)); \
  }()


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_utils.cpp
================================================
#include "gn_utils.hpp"

#include <mutex>
#include <vector>

namespace group_norm_v2 {

cudaDeviceProp const& get_device_prop(int device_id) {
  static std::vector<cudaDeviceProp> device_props;
  static std::once_flag flag;
  std::call_once(flag, [&] {
    int count;
    CUDA_CHECK(cudaGetDeviceCount(&count));
    device_props.resize(count);
    for (int i = 0; i < count; i++) {
      CUDA_CHECK(cudaGetDeviceProperties(&device_props[i], i));
    }
  });
  return device_props.at(device_id);
}

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/group_norm_v2/gn_utils.hpp
================================================
#pragma once

#include <cuda_runtime.h>

#include <cassert>
#include <cstdio>
#include <cstdlib>

#include "gn.hpp"

// Definition of CUDA_CHECK macro
#define CUDA_CHECK(call)                                                                                               \
  do {                                                                                                                 \
    cudaError_t err_ = call;                                                                                           \
    if (err_ != cudaSuccess) {                                                                                         \
      fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", __FILE__, __LINE__, err_, cudaGetErrorString(err_), \
              #call);                                                                                                  \
      exit(EXIT_FAILURE);                                                                                              \
    }                                                                                                                  \
  } while (0)

#define GN_CUDA_HOST_PARAMS(T)                                                                                      \
  T *out, T *x, T *w, T *b, float eps, bool silu, int64_t n, int64_t hw, int num_groups, int channels_per_group,    \
      float *mean_var_out, float *red_buffer, unsigned *barrier, int sm_margin, cudaStream_t stream, int device_id, \
      Meta *meta_ptr, bool meta_only

#define GN_BWD_CUDA_HOST_PARAMS(T)                                                                                    \
  T *grad_input, T *grad_weight, T *grad_bias, T *grad_output, T *x, T *w, T *b, float *mean_var, float eps,          \
      bool silu, int64_t n, int64_t hw, int num_groups, int channels_per_group, float *red_buffer, unsigned *barrier, \
      int sm_margin, cudaStream_t stream, int device_id, Meta *meta_ptr, bool meta_only

#define GN_CUDA_HOST_ARGS                                                                                       \
  out, x, w, b, eps, silu, n, hw, num_groups, channels_per_group, mean_var_out, red_buffer, barrier, sm_margin, \
      stream, device_id, meta_ptr, meta_only

#define GN_BWD_CUDA_HOST_ARGS                                                                       \
  grad_input, grad_weight, grad_bias, grad_output, x, w, b, mean_var, eps, silu, n, hw, num_groups, \
      channels_per_group, red_buffer, barrier, sm_margin, stream, device_id, meta_ptr, meta_only

namespace group_norm_v2 {

cudaDeviceProp const& get_device_prop(int device_id);

#ifdef __CUDA_ARCH__

template <class... Ts>
__host__ __device__ inline int print_rank_0(char const* fmt, Ts&&... args) {
  if (threadIdx.x + threadIdx.y + threadIdx.z == 0 && blockIdx.x + blockIdx.y + blockIdx.z == 0) {
    return printf(fmt, std::forward<Ts>(args)...);
  }
  return 0;
}

#endif

}  // namespace group_norm_v2


================================================
FILE: apex/contrib/csrc/groupbn/batch_norm.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <cuda.h>

#include "batch_norm.h"

#define cudaCheckErrors(msg)                                                                                  \
  do {                                                                                                        \
    cudaError_t __err = cudaGetLastError();                                                                   \
    if (__err != cudaSuccess) {                                                                               \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING\n");                                                             \
      exit(1);                                                                                                \
    }                                                                                                         \
  } while (0)

static size_t round_up_to_multiple(size_t x, int multiple) { return ((x + multiple - 1) / multiple) * multiple; }

struct Workspace {
  Workspace(size_t size) : size(size), data(NULL) {
    auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
    dataPtr = allocator.allocate(size);
    data = dataPtr.get();
  }
  Workspace(const Workspace&) = delete;
  Workspace(Workspace&&) = default;
  Workspace& operator=(Workspace&&) = default;
  ~Workspace() = default;

  size_t size;
  void* data;
  c10::DataPtr dataPtr;
};

// Return {y}
at::Tensor nhwc_bn_fwd_train(const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                             const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                             const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                             const at::Tensor& ret_cta, const float momentum, const float epsilon, const bool fuse_relu,
                             void* my_data, void* pair_data, void* pair_data2, void* pair_data3, const int bn_group,
                             const at::Tensor& magic_tensor, const int occupancy, const int grid_dim_x,
                             const bool coop) {
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data_ptr<int>();
  *magic = (*magic + 1) & 0xff;

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNorm* bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data_ptr<at::Half>(), nullptr, y.data_ptr<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data_ptr<float>(), bias.data_ptr<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data_ptr<float>(), running_inv_var.data_ptr<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(minibatch_mean.data_ptr<float>());
  workspace.push_back(minibatch_inv_var.data_ptr<float>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data_ptr<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwd(stream, fuse_relu, my_data, pair_data, pair_data2, pair_data3, bn_group, *magic, occupancy, grid_dim_x, coop);

  return y;
}

at::Tensor nhwc_bn_fwd_eval(const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                            const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                            const at::Tensor& ret_cta, const int bn_group, const float momentum, const float epsilon,
                            const bool fuse_relu) {
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNorm* bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data_ptr<at::Half>(), nullptr, y.data_ptr<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data_ptr<float>(), bias.data_ptr<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data_ptr<float>(), running_inv_var.data_ptr<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data_ptr<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwdInference(stream, fuse_relu);

  return y;
}

std::vector<at::Tensor> nhwc_bn_bwd(const at::Tensor& x, const at::Tensor& dy, const at::Tensor& scale,
                                    const at::Tensor& bias, const at::Tensor& running_mean,
                                    const at::Tensor& running_inv_var, const at::Tensor& minibatch_mean,
                                    const at::Tensor& minibatch_inv_var, const at::Tensor& ret_cta,
                                    const float momentum, const float epsilon, const bool fuse_relu, void* my_data,
                                    void* pair_data, void* pair_data2, void* pair_data3, const int bn_group,
                                    const at::Tensor& magic_tensor, const int occupancy, const int grid_dim_x,
                                    const bool coop) {
  // shape
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data_ptr<int>();
  *magic = (*magic + 1) & 0xff;

  // outputs
  at::Tensor x_grad, scale_grad, bias_grad;

  // Allocate outputs
  x_grad = at::empty_like(x);
  scale_grad = at::empty_like(scale);
  bias_grad = at::empty_like(bias);

  // Create wrapper
  NhwcBatchNorm* bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data_ptr<at::Half>(), x_grad.data_ptr<at::Half>(), nullptr, dy.data_ptr<at::Half>());

  bn->setWeightPointers({scale.data_ptr<float>(), bias.data_ptr<float>()},
                        {scale_grad.data_ptr<float>(), bias_grad.data_ptr<float>()});
  bn->setParameterPointers({running_mean.data_ptr<float>(), running_inv_var.data_ptr<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(minibatch_mean.data_ptr<float>());
  workspace.push_back(minibatch_inv_var.data_ptr<float>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data_ptr<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  bn->dgrad(stream, fuse_relu, my_data, pair_data, pair_data2, pair_data3, bn_group, *magic, occupancy, grid_dim_x,
            coop);

  return std::vector<at::Tensor>{x_grad, scale_grad, bias_grad};
}

int nhwc_bn_fwd_occupancy() {
  int device_id = -1;
  cudaGetDevice(&device_id);

  // max occupancy supported by the code is 2
  return NhwcBatchNorm::smem_driven_fwd_occupancy(device_id, 2);
}

int nhwc_bn_bwd_occupancy() {
  int device_id = -1;
  cudaGetDevice(&device_id);

  // max occupancy supported by the code is 2
  return NhwcBatchNorm::smem_driven_bwd_occupancy(device_id, 2);
}


================================================
FILE: apex/contrib/csrc/groupbn/batch_norm.h
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * Copyright (c) 2018 by Contributors
 * \file nhwc_batch_norm.h
 * \brief CUDA NHWC Batch Normalization code
 * \author Shankara Rao Thejaswi Nanditale, Dick Carter, Evgeni Krimer
 */
#ifndef MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_H_
#define MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_H_

#include <cudnn.h>

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

#include "cuda_utils.h"
#include "nhwc_batch_norm_kernel.h"

#define VERBOSE_DEFAULT false

class NhwcBatchNorm {
 public:
  NhwcBatchNorm() {
    name_ = "nhwc_batchnorm";
    createTensorDescriptor(&X_tensor_desc_);
    createTensorDescriptor(&Y_tensor_desc_);
  }

  ~NhwcBatchNorm() {
    destroyTensorDescriptor(X_tensor_desc_);
    destroyTensorDescriptor(Y_tensor_desc_);
  }

  void die() {
    std::cerr << "batchnorm not initialized" << std::endl;
    exit(-1);
  }

  void fwd(cudaStream_t stream, bool use_relu, void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
           const int bn_group, const int magic, const int occupancy, const int grid_dim_x, const bool coop);
  void dgrad(cudaStream_t stream, bool use_relu, void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
             const int bn_group, const int magic, const int occupancy, const int grid_dim_x, const bool coop);
  void fwdInference(cudaStream_t stream, bool use_relu);
  dim3 calc_fwd_grid(int* loop, const int grid_dim_x);
  dim3 calc_bwd_grid(int* loop, const int grid_dim_x);

  void setInputDescriptor(const cudnnTensorFormat_t format, const cudnnDataType_t data_type, int n, int c, int h, int w,
                          int bn_group) {
    m_ = n * h * w;
    int m_bn_adjusted = m_ * bn_group;
    c_ = c;
    // factor to scale sum of squared errors to get saved variance.  Must be 1/nhw.
    svar_inv_count_ = 1.f / m_bn_adjusted;
    // factor to scale sum of squared errors to get running variance. Should be 1/(nhw-1).
    int divisor = m_bn_adjusted - 1;
    // nhw == 1 is unlikely, but by setting the rvar_inv_count_ == 1.f, we avoid running var infs.
    rvar_inv_count_ = divisor == 0 ? 1.f : 1.f / divisor;
    setTensorDescriptor(X_tensor_desc_, format, data_type, n, c, h, w);
  }

  void setOutputDescriptor(const cudnnTensorFormat_t format, const cudnnDataType_t data_type, int n, int c, int h,
                           int w) {
    setTensorDescriptor(Y_tensor_desc_, format, data_type, n, c, h, w);
  }

  const std::vector<size_t> numWorkspaceBytes() const;

  void setWorkspacePointers(const std::vector<void*>& workspace, const std::vector<size_t>& num_workspace_bytes);

  void setInputOutputPointers(void* X, void* dX, void* Y, void* dY) {
    X_ = X;
    dX_ = dX;
    Y_ = Y;
    dY_ = dY;
  }

  // Sets the pointers for the scale and weight (in that order) data and derivative buffers.
  void setWeightPointers(const std::vector<void*>& weight_pointers, const std::vector<void*>& deriv_pointers) {
    assert(weight_pointers.size() == 2);
    assert(deriv_pointers.size() == 2);
    scale_ = static_cast<float*>(weight_pointers[0]);
    bias_ = static_cast<float*>(weight_pointers[1]);
    dscale_ = static_cast<float*>(deriv_pointers[0]);
    dbias_ = static_cast<float*>(deriv_pointers[1]);
  }

  // Sets the pointers for the population mean and variance buffers, in that order.
  void setParameterPointers(const std::vector<void*>& param_pointers) {
    assert(param_pointers.size() == 2);
    population_mean_ = static_cast<float*>(param_pointers[0]);
    population_variance_ = static_cast<float*>(param_pointers[1]);
  }

  void setConstants(const double exp_avg_factor, const double eps) {
    exp_avg_factor_ = exp_avg_factor;
    eps_ = eps;
  }

  void processCudnnStatus(const cudnnStatus_t& status, const std::string& string = std::string(),
                          bool verbose = VERBOSE_DEFAULT) {
    if (status != CUDNN_STATUS_SUCCESS)
      LOG(FATAL) << string << " " << cudnnGetErrorString(status);
    else if (verbose)
      LOG(INFO) << string << " " << cudnnGetErrorString(status);
  }

  void checkCudaStatus(const std::string& string = std::string(), bool verbose = VERBOSE_DEFAULT) {
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
      LOG(FATAL) << string << " " << cudaGetErrorString(status);
    else if (verbose)
      LOG(INFO) << string << " " << cudaGetErrorString(status);
  }

  size_t size_retired_ctas(int grid_y) const {
    // Note that the value of max_grid_y to handle known GPUs is about 160.
    const int max_grid_y = 1024;
    if (grid_y > max_grid_y) LOG(INFO) << "GPU capabilities exceeds assumptions.";
    const int retired_cta_bytes = max_grid_y * 2 * sizeof(int);
    // Since the region will be initialized once and used for many kernels,
    // the idea is to return an ample size that will cover all uses.
    return retired_cta_bytes;
  }

  cudnnTensorDescriptor_t X_tensor_desc_ = nullptr;
  cudnnTensorDescriptor_t Y_tensor_desc_ = nullptr;

  void* X_ = nullptr;
  void* dX_ = nullptr;
  void* Y_ = nullptr;
  void* dY_ = nullptr;

  // Learned scale and bias weights.
  float* scale_ = nullptr;
  float* dscale_ = nullptr;
  float* bias_ = nullptr;
  float* dbias_ = nullptr;

  // Computed population mean and variance parameters.
  float* population_mean_ = nullptr;
  float* population_variance_ = nullptr;

  // Workspace buffers for minibatch mean and variance (computed in fwd, needed by bwd).
  float* minibatch_mean_ = nullptr;
  float* minibatch_variance_ = nullptr;

  int m_ = 0;  // Number of values per channel that BN is normalizing.
  int c_ = 0;  // Number of channels over which BN is normalizing.

  float svar_inv_count_ = 0.f;  // factor to scale sum of squared errors to get saved variance
  float rvar_inv_count_ = 0.f;  // factor to scale sum of squared errors to get running variance

  double exp_avg_factor_ = 0.;
  double eps_ = 0.;
  std::string name_;

 private:
  void setTensorDescriptor(cudnnTensorDescriptor_t descriptor, cudnnTensorFormat_t format, cudnnDataType_t data_type,
                           int n, int c, int h, int w) {
    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
    status = cudnnSetTensor4dDescriptor(descriptor, format, data_type, n, c, h, w);
    processCudnnStatus(status, "set tensor descriptor");
  }

  void createTensorDescriptor(cudnnTensorDescriptor_t* descriptor) {
    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
    status = cudnnCreateTensorDescriptor(descriptor);
    processCudnnStatus(status, "create tensor_descriptor");
  }

  void destroyTensorDescriptor(cudnnTensorDescriptor_t descriptor) {
    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
    status = cudnnDestroyTensorDescriptor(descriptor);
    processCudnnStatus(status, "destroy tensor_descriptor");
  }

 protected:
  float* partial_sums_ = nullptr;
  int* partial_counts_ = nullptr;
  int* retired_ctas_ = nullptr;

  void _setFwdParams(NhwcBatchNormFwdParams* params) const;
  void _setFwdInferenceParams(NhwcBatchNormFwdInferenceParams* params) const;
  void _setBwdParams(NhwcBatchNormBwdParams* params) const;

  // @todo: ability to configure these?
  // Kernel params
  static const int USE_ONLINE_APPROACH = 1;
  static const int THREADS_PER_CTA = 512;
  static const int THREADS_PER_PIXEL = 16;
  static const int C_ELEMENTS_PER_CTA = 64;
  static const int ELEMENTS_PER_LDG = C_ELEMENTS_PER_CTA / THREADS_PER_PIXEL;
  static const int MAX_SMEM_WITHOUT_OPT_IN = 48 * 1024;

  typedef uint16_t StorageType;
  // typedef float StorageType;
  //  increasing this to 6 causes spills in fwd kernel!
  static const int PIXELS_PER_THREAD_IN_REGISTERS_FWD = 5;
  static const int PIXELS_PER_THREAD_IN_REGISTERS_BWD = 3;
  static const int PIXELS_PER_THREAD_IN_SMEM_FWD = 10;
  static const int PIXELS_PER_THREAD_IN_SMEM_BWD = 5;

  static const int PIXELS_PER_THREAD_FWD = PIXELS_PER_THREAD_IN_REGISTERS_FWD + PIXELS_PER_THREAD_IN_SMEM_FWD;
  static const int PIXELS_PER_THREAD_BWD = PIXELS_PER_THREAD_IN_REGISTERS_BWD + PIXELS_PER_THREAD_IN_SMEM_BWD;
  static const int PIXELS_PER_THREAD_FWD_INFERENCE = 4;

  // Derived params
  static const size_t SMEM_SIZE_FWD =
      PIXELS_PER_THREAD_IN_SMEM_FWD * THREADS_PER_CTA * ELEMENTS_PER_LDG * sizeof(StorageType);
  static const size_t SMEM_SIZE_BWD =
      PIXELS_PER_THREAD_IN_SMEM_BWD * THREADS_PER_CTA * ELEMENTS_PER_LDG * 2 * sizeof(StorageType);
  static const int PIXELS_PER_LDG = THREADS_PER_CTA / THREADS_PER_PIXEL;
  static const int PIXELS_PER_CTA_FWD = THREADS_PER_CTA / THREADS_PER_PIXEL * PIXELS_PER_THREAD_FWD;
  static const int PIXELS_PER_CTA_BWD = THREADS_PER_CTA / THREADS_PER_PIXEL * PIXELS_PER_THREAD_BWD;
  static const int PIXELS_PER_CTA_FWD_INFERENCE = THREADS_PER_CTA / THREADS_PER_PIXEL * PIXELS_PER_THREAD_FWD_INFERENCE;

  // max grid.y in case of group bn is limited by exchange buffer size
  static const int MAX_GBN_BLOCK_Y = 256;

  // Helper function to launch the forward kernel.

  // We calculate (based on smem usage) the achievable occupancy and make sure we run a kernel
  // version that was compiled with that occupancy in its launch bounds.  This way, we avoid
  // needless register spills.
  void _fwdKernelLauncher(cudaStream_t stream, NhwcBatchNormFwdParams params, dim3 grid_dim, int outer_loops,
                          bool use_relu, const int occupancy, const bool coop) {
#define LAUNCH_FWD_KERNEL(OUTER_LOOPS, USE_RELU, USE_ADD_RELU, COMPILED_FOR_OCCUPANCY, COOP)                          \
  do {                                                                                                                \
    CHECK(SMEM_SIZE_FWD <= MAX_SMEM_WITHOUT_OPT_IN) << "Nhwc batchnorm kernel smem too big.";                         \
    auto fwd_func =                                                                                                   \
        nhwc_batch_norm_fwd<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL, PIXELS_PER_THREAD_IN_REGISTERS_FWD,      \
                            PIXELS_PER_THREAD_IN_SMEM_FWD, ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS,        \
                            USE_RELU, USE_ADD_RELU, COMPILED_FOR_OCCUPANCY>;                                          \
    if (COMPILED_FOR_OCCUPANCY > 1) {                                                                                 \
      cudaFuncSetAttribute(fwd_func, cudaFuncAttributePreferredSharedMemoryCarveout, 100);                            \
      checkCudaStatus(name_ + " fwd ser coop kernel (cudaFuncSetAttribute carveout)");                                \
    }                                                                                                                 \
    void* params_ptr = static_cast<void*>(&params);                                                                   \
    using FWD_FUNC = decltype(nhwc_batch_norm_fwd<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL,                    \
                                                  PIXELS_PER_THREAD_IN_REGISTERS_FWD, PIXELS_PER_THREAD_IN_SMEM_FWD,  \
                                                  ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS, USE_RELU,       \
                                                  USE_ADD_RELU, COMPILED_FOR_OCCUPANCY>);                             \
    if (COOP) {                                                                                                       \
      cudaLaunchCooperativeKernel<FWD_FUNC>(fwd_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_FWD, stream); \
    } else {                                                                                                          \
      cudaLaunchKernel<FWD_FUNC>(fwd_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_FWD, stream);            \
    }                                                                                                                 \
    checkCudaStatus(name_ + " fwd ser coop kernel");                                                                  \
  } while (0)

    // Don't try for an occupancy > 2 as this will squeeze register use and create spills.
    if (outer_loops == 1 && use_relu) {
      if (occupancy >= 2)
        LAUNCH_FWD_KERNEL(1, true, false, 2, coop);
      else
        LAUNCH_FWD_KERNEL(1, true, false, 1, coop);
    } else if (outer_loops == 1 && !use_relu) {
      if (occupancy >= 2)
        LAUNCH_FWD_KERNEL(1, false, false, 2, coop);
      else
        LAUNCH_FWD_KERNEL(1, false, false, 1, coop);
    } else if (use_relu) {
      if (occupancy >= 2)
        LAUNCH_FWD_KERNEL(0, true, false, 2, coop);
      else
        LAUNCH_FWD_KERNEL(0, true, false, 1, coop);
    } else {
      if (occupancy >= 2)
        LAUNCH_FWD_KERNEL(0, false, false, 2, coop);
      else
        LAUNCH_FWD_KERNEL(0, false, false, 1, coop);
    }
#undef LAUNCH_FWD_KERNEL
  }

  // Helper function to launch the backward kernel.

  void _bwdKernelLauncher(cudaStream_t stream, NhwcBatchNormBwdParams params, dim3 grid_dim, int outer_loops,
                          bool use_relu, const int occupancy, const bool coop) {
#define LAUNCH_BWD_KERNEL(OUTER_LOOPS, COMPILED_FOR_OCCUPANCY, COOP)                                                  \
  do {                                                                                                                \
    CHECK(SMEM_SIZE_BWD <= MAX_SMEM_WITHOUT_OPT_IN) << "Nhwc batchnorm kernel smem too big.";                         \
    auto bwd_func = nhwc_batch_norm_bwd<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL,                              \
                                        PIXELS_PER_THREAD_IN_REGISTERS_BWD, PIXELS_PER_THREAD_IN_SMEM_BWD,            \
                                        ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS, COMPILED_FOR_OCCUPANCY>;  \
    if (COMPILED_FOR_OCCUPANCY > 1) {                                                                                 \
      cudaFuncSetAttribute(bwd_func, cudaFuncAttributePreferredSharedMemoryCarveout, 100);                            \
      checkCudaStatus(name_ + " bwd coop serial kernel (cudaFuncSetAttribute carveout)");                             \
    }                                                                                                                 \
    void* params_ptr = static_cast<void*>(&params);                                                                   \
    using BWD_FUNC =                                                                                                  \
        decltype(nhwc_batch_norm_bwd<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL,                                 \
                                     PIXELS_PER_THREAD_IN_REGISTERS_BWD, PIXELS_PER_THREAD_IN_SMEM_BWD,               \
                                     ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS, COMPILED_FOR_OCCUPANCY>);    \
    if (COOP) {                                                                                                       \
      cudaLaunchCooperativeKernel<BWD_FUNC>(bwd_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_BWD, stream); \
    } else {                                                                                                          \
      cudaLaunchKernel<BWD_FUNC>(bwd_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_BWD, stream);            \
    }                                                                                                                 \
    checkCudaStatus(name_ + " bwd coop serial kernel");                                                               \
  } while (0)

#define LAUNCH_BWD_RELU_KERNEL(OUTER_LOOPS, COMPILED_FOR_OCCUPANCY, COOP)                                              \
  do {                                                                                                                 \
    CHECK(SMEM_SIZE_BWD <= MAX_SMEM_WITHOUT_OPT_IN) << "Nhwc batchnorm kernel smem too big.";                          \
    auto bwd_relu_func =                                                                                               \
        nhwc_batch_norm_bwd_relu<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL, PIXELS_PER_THREAD_IN_REGISTERS_BWD,  \
                                 PIXELS_PER_THREAD_IN_SMEM_BWD, ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS,    \
                                 COMPILED_FOR_OCCUPANCY>;                                                              \
    if (COMPILED_FOR_OCCUPANCY > 1) {                                                                                  \
      cudaFuncSetAttribute(bwd_relu_func, cudaFuncAttributePreferredSharedMemoryCarveout, 100);                        \
      checkCudaStatus(name_ + " bwd-relu coop serial kernel (cudaFuncSetAttribute carveout)");                         \
    }                                                                                                                  \
    void* params_ptr = static_cast<void*>(&params);                                                                    \
    using BWD_RELU_FUNC =                                                                                              \
        decltype(nhwc_batch_norm_bwd_relu<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL,                             \
                                          PIXELS_PER_THREAD_IN_REGISTERS_BWD, PIXELS_PER_THREAD_IN_SMEM_BWD,           \
                                          ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS,                          \
                                          COMPILED_FOR_OCCUPANCY>);                                                    \
    if (COOP) {                                                                                                        \
      cudaLaunchCooperativeKernel<BWD_RELU_FUNC>(bwd_relu_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_BWD, \
                                                 stream);                                                              \
    } else {                                                                                                           \
      cudaLaunchKernel<BWD_RELU_FUNC>(bwd_relu_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_BWD, stream);   \
    }                                                                                                                  \
    checkCudaStatus(name_ + " bwd-relu coop serial kernel");                                                           \
  } while (0)

    // Don't try for an occupancy > 2 as this will squeeze register use and create spills.
    if (outer_loops == 1 && use_relu) {
      if (occupancy >= 2)
        LAUNCH_BWD_RELU_KERNEL(1, 2, coop);
      else
        LAUNCH_BWD_RELU_KERNEL(1, 1, coop);
    } else if (outer_loops == 1 && !use_relu) {
      if (occupancy >= 2)
        LAUNCH_BWD_KERNEL(1, 2, coop);
      else
        LAUNCH_BWD_KERNEL(1, 1, coop);
    } else if (use_relu) {
      if (occupancy >= 2)
        LAUNCH_BWD_RELU_KERNEL(0, 2, coop);
      else
        LAUNCH_BWD_RELU_KERNEL(0, 1, coop);
    } else {
      if (occupancy >= 2)
        LAUNCH_BWD_KERNEL(0, 2, coop);
      else
        LAUNCH_BWD_KERNEL(0, 1, coop);
    }
#undef LAUNCH_BWD_KERNEL
  }

 public:
  // Calculate the expected fwd kernel occupancy, as dictated by shared memory usage.
  static int smem_driven_fwd_occupancy(int device_id, const int max_cta_per_sm) {
    using namespace at::cuda::utils;
    int fwd_reduction_bytes = THREADS_PER_PIXEL * (THREADS_PER_CTA / 32) * ELEMENTS_PER_LDG * sizeof(float);
    int fwd_smem_bytes = SMEM_SIZE_FWD + fwd_reduction_bytes;
    int occupancy = MaxSharedMemoryPerMultiprocessor(device_id) / fwd_smem_bytes;
    return std::min(max_cta_per_sm, occupancy);
  }

  // Calculate the expected bwd kernel occupancy, as dictated by shared memory usage.
  static int smem_driven_bwd_occupancy(int device_id, const int max_cta_per_sm) {
    using namespace at::cuda::utils;
    int bwd_reduction_bytes = THREADS_PER_PIXEL * (THREADS_PER_CTA / 32) * ELEMENTS_PER_LDG * sizeof(float);
    int bwd_smem_bytes = SMEM_SIZE_BWD + bwd_reduction_bytes;
    int occupancy = MaxSharedMemoryPerMultiprocessor(device_id) / bwd_smem_bytes;
    return std::min(max_cta_per_sm, occupancy);
  }
};

const std::vector<size_t> NhwcBatchNorm::numWorkspaceBytes() const {
  assert(c_ > 0);

  // choose the max memory required between fwd/bwd passes
  int grid_x_fwd = div_up(m_, PIXELS_PER_CTA_FWD);
  int grid_x_bwd = div_up(m_, PIXELS_PER_CTA_BWD);
  int grid_x = max(grid_x_fwd, grid_x_bwd);
  int grid_y = div_up(c_, C_ELEMENTS_PER_CTA);

  const size_t num_mean_bytes = c_ * sizeof(float);
  const size_t num_variance_bytes = num_mean_bytes;
  const size_t size_sums = grid_y * grid_x * THREADS_PER_PIXEL * ELEMENTS_PER_LDG * 2 * sizeof(float);
  const size_t size_counts = grid_y * grid_x * sizeof(int);

  return {num_mean_bytes, num_variance_bytes, size_retired_ctas(grid_y), size_sums, size_counts};
}

void NhwcBatchNorm::setWorkspacePointers(const std::vector<void*>& workspace,
                                         const std::vector<size_t>& num_workspace_bytes) {
  assert(workspace.size() == 5);
  assert(num_workspace_bytes.size() == 5);

  minibatch_mean_ = static_cast<float*>(workspace[0]);
  minibatch_variance_ = static_cast<float*>(workspace[1]);
  retired_ctas_ = static_cast<int*>(workspace[2]);
  partial_sums_ = static_cast<float*>(workspace[3]);
  partial_counts_ = static_cast<int*>(workspace[4]);
}

void NhwcBatchNorm::_setFwdParams(NhwcBatchNormFwdParams* params) const {
  params->gmem_src = static_cast<uint16_t*>(X_);
  params->gmem_dst = static_cast<uint16_t*>(Y_);
  params->gmem_src1 = nullptr;
  params->gmem_bias = bias_;
  params->gmem_scale = scale_;
  params->gmem_running_mean = population_mean_;
  params->gmem_running_var = population_variance_;
  params->gmem_saved_mean = minibatch_mean_;
  params->gmem_saved_var = minibatch_variance_;
  params->gmem_relu_bitmask = nullptr;
  params->nhw = m_;
  params->c = c_;
  params->svar_inv_count = svar_inv_count_;
  params->rvar_inv_count = rvar_inv_count_;
  params->gmem_sums = partial_sums_;
  params->gmem_counts = partial_counts_;
  params->gmem_retired_ctas = retired_ctas_;
  params->var_eps = eps_;
  params->outer_loops = 0;
  params->exp_avg_factor = static_cast<float>(exp_avg_factor_);
  params->c_blks = div_up(c_, C_ELEMENTS_PER_CTA);
}

void NhwcBatchNorm::_setFwdInferenceParams(NhwcBatchNormFwdInferenceParams* params) const {
  params->gmem_src = static_cast<uint16_t*>(X_);
  params->gmem_dst = static_cast<uint16_t*>(Y_);
  params->gmem_src1 = nullptr;
  params->gmem_bias = bias_;
  params->gmem_scale = scale_;
  params->gmem_mean = population_mean_;
  params->gmem_var = population_variance_;
  params->nhw = m_;
  params->c = c_;
  params->var_eps = eps_;
}

void NhwcBatchNorm::_setBwdParams(NhwcBatchNormBwdParams* params) const {
  params->gmem_src = static_cast<uint16_t*>(X_);
  params->gmem_dy = static_cast<uint16_t*>(dY_);
  params->gmem_dst = static_cast<uint16_t*>(dX_);
  params->gmem_dst1 = nullptr;
  params->gmem_relu_bitmask = nullptr;
  params->gmem_dscale = dscale_;
  params->gmem_dbias = dbias_;
  params->gmem_scale = scale_;
  params->gmem_bias = bias_;
  params->gmem_saved_mean = minibatch_mean_;
  params->gmem_saved_var = minibatch_variance_;
  params->nhw = m_;
  params->c = c_;
  params->svar_inv_count = svar_inv_count_;
  params->gmem_sums = partial_sums_;
  params->gmem_retired_ctas = retired_ctas_;
  params->outer_loops = 0;
  params->c_blks = div_up(c_, C_ELEMENTS_PER_CTA);
}

void NhwcBatchNorm::fwdInference(cudaStream_t stream, bool use_relu) {
  bool ptrs_are_set = X_tensor_desc_ != nullptr && Y_tensor_desc_ != nullptr && scale_ != nullptr &&
                      bias_ != nullptr
                      //      && minibatch_mean_ != nullptr
                      //      && minibatch_variance_ != nullptr
                      && population_mean_ != nullptr && population_variance_ != nullptr &&
                      X_ != nullptr
                      //      && dX_ != nullptr
                      && Y_ != nullptr
                      //      && dY_ != nullptr
                      //      && dscale_ != nullptr
                      //      && dbias_ != nullptr
                      && partial_sums_ != nullptr && partial_counts_ != nullptr;

  if (!ptrs_are_set) die();

  dim3 grid_dim;
  grid_dim.x = div_up(m_, PIXELS_PER_CTA_FWD_INFERENCE);
  grid_dim.y = div_up(c_, C_ELEMENTS_PER_CTA);

  // @todo: maybe just move this inside initialize routine?
  NhwcBatchNormFwdInferenceParams params;
  _setFwdInferenceParams(&params);

  if (use_relu) {
    nhwc_batch_norm_fwd_inference<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL, ELEMENTS_PER_LDG, true, false>
        <<<grid_dim, THREADS_PER_CTA, 0, stream>>>(params);
    checkCudaStatus(name_ + " fwd_inference-relu kernel");
  } else {
    nhwc_batch_norm_fwd_inference<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL, ELEMENTS_PER_LDG, false, false>
        <<<grid_dim, THREADS_PER_CTA, 0, stream>>>(params);
    checkCudaStatus(name_ + " fwd_inference kernel");
  }
}

dim3 NhwcBatchNorm::calc_fwd_grid(int* loop, const int grid_dim_x) {
  dim3 grid_dim;
  grid_dim.x = div_up(m_, PIXELS_PER_CTA_FWD);
  int c_blks = div_up(c_, C_ELEMENTS_PER_CTA);
  unsigned int max_grid_x = grid_dim_x;
  if (grid_dim.x <= max_grid_x) {
    *loop = 1;
    if (max_grid_x / grid_dim.x > 1) {
      grid_dim.y = std::min(c_blks, static_cast<int>(max_grid_x / grid_dim.x));
      assert(grid_dim.y < MAX_GBN_BLOCK_Y);  // FIXME: turn into a loop
    } else {
      grid_dim.y = 1;
    }
  } else {
    grid_dim.x = max_grid_x;
    grid_dim.y = 1;
    int nhw_in_regs = m_ - PIXELS_PER_THREAD_IN_SMEM_FWD * PIXELS_PER_LDG * grid_dim.x;
    int pixels_per_iteration = PIXELS_PER_THREAD_IN_REGISTERS_FWD * PIXELS_PER_LDG * grid_dim.x;
    *loop = div_up(nhw_in_regs, pixels_per_iteration);
  }
  return grid_dim;
}

dim3 NhwcBatchNorm::calc_bwd_grid(int* loop, const int grid_dim_x) {
  dim3 grid_dim;
  grid_dim.x = div_up(m_, PIXELS_PER_CTA_BWD);
  int c_blks = div_up(c_, C_ELEMENTS_PER_CTA);
  unsigned int max_grid_x = grid_dim_x;
  if (grid_dim.x <= max_grid_x) {
    *loop = 1;
    if (max_grid_x / grid_dim.x > 1) {
      grid_dim.y = std::min(c_blks, static_cast<int>(max_grid_x / grid_dim.x));
      assert(grid_dim.y < MAX_GBN_BLOCK_Y);  // FIXME: turn into a loop
    } else {
      grid_dim.y = 1;
    }
  } else {
    grid_dim.x = max_grid_x;
    grid_dim.y = 1;
    int nhw_in_regs = m_ - PIXELS_PER_THREAD_IN_SMEM_BWD * PIXELS_PER_LDG * grid_dim.x;
    int pixels_per_iteration = PIXELS_PER_THREAD_IN_REGISTERS_BWD * PIXELS_PER_LDG * grid_dim.x;
    *loop = div_up(nhw_in_regs, pixels_per_iteration);
  }
  return grid_dim;
}

void NhwcBatchNorm::fwd(cudaStream_t stream, bool use_relu, void* my_data, void* pair_data, void* pair_data2,
                        void* pair_data3, const int bn_group, const int magic, const int occupancy,
                        const int grid_dim_x, const bool coop) {
  bool ptrs_are_set = X_tensor_desc_ != nullptr && Y_tensor_desc_ != nullptr && scale_ != nullptr && bias_ != nullptr &&
                      minibatch_mean_ != nullptr && minibatch_variance_ != nullptr && population_mean_ != nullptr &&
                      population_variance_ != nullptr &&
                      X_ != nullptr
                      //      && dX_ != nullptr
                      && Y_ != nullptr
                      //      && dY_ != nullptr
                      //      && dscale_ != nullptr
                      //      && dbias_ != nullptr
                      && partial_sums_ != nullptr && partial_counts_ != nullptr && retired_ctas_ != nullptr;

  if (!ptrs_are_set) die();

  // reset of retired_cta_count no longer needed

  NhwcBatchNormFwdParams params;
  _setFwdParams(&params);
  params.my_data = my_data;
  params.pair_datas[0] = pair_data;
  params.pair_datas[1] = pair_data2;
  params.pair_datas[2] = pair_data3;
  params.magic = magic;
  params.sync_iters = (bn_group == 8) ? 3 : (bn_group >> 1);

  dim3 grid_dim = calc_fwd_grid(&params.outer_loops, grid_dim_x);
  _fwdKernelLauncher(stream, params, grid_dim, params.outer_loops, use_relu, occupancy, coop);
}

void NhwcBatchNorm::dgrad(cudaStream_t stream, bool use_relu, void* my_data, void* pair_data, void* pair_data2,
                          void* pair_data3, const int bn_group, const int magic, const int occupancy,
                          const int grid_dim_x, const bool coop) {
  bool ptrs_are_set = X_tensor_desc_ != nullptr && Y_tensor_desc_ != nullptr && scale_ != nullptr &&
                      (bias_ != nullptr || !use_relu) && minibatch_mean_ != nullptr &&
                      minibatch_variance_ != nullptr
                      //      && population_mean_ != nullptr
                      //      && population_variance_ != nullptr
                      && X_ != nullptr &&
                      dX_ != nullptr
                      //      && Y_ != nullptr
                      && dY_ != nullptr && dscale_ != nullptr && dbias_ != nullptr;

  if (!ptrs_are_set) die();

  // reset of retired_cta_count no longer needed

  NhwcBatchNormBwdParams params;
  _setBwdParams(&params);
  params.my_data = my_data;
  params.pair_datas[0] = pair_data;
  params.pair_datas[1] = pair_data2;
  params.pair_datas[2] = pair_data3;
  params.magic = magic;
  params.sync_iters = (bn_group == 8) ? 3 : (bn_group >> 1);
  params.wgrad_coeff = 1.0 / bn_group;

  dim3 grid_dim = calc_bwd_grid(&params.outer_loops, grid_dim_x);
  _bwdKernelLauncher(stream, params, grid_dim, params.outer_loops, use_relu, occupancy, coop);
}

#endif  // MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_H_


================================================
FILE: apex/contrib/csrc/groupbn/batch_norm_add_relu.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <cuda.h>

#include "batch_norm_add_relu.h"

// FIXME move the common stuff to common h file
#define cudaCheckErrors(msg)                                                                                  \
  do {                                                                                                        \
    cudaError_t __err = cudaGetLastError();                                                                   \
    if (__err != cudaSuccess) {                                                                               \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING\n");                                                             \
      exit(1);                                                                                                \
    }                                                                                                         \
  } while (0)

static size_t round_up_to_multiple(size_t x, int multiple) { return ((x + multiple - 1) / multiple) * multiple; }

struct Workspace {
  Workspace(size_t size) : size(size), data(NULL) {
    auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
    dataPtr = allocator.allocate(size);
    data = dataPtr.get();
  }
  Workspace(const Workspace&) = delete;
  Workspace(Workspace&&) = default;
  Workspace& operator=(Workspace&&) = default;
  ~Workspace() = default;

  size_t size;
  void* data;
  c10::DataPtr dataPtr;
};

// Return {y}
at::Tensor nhwc_bn_addrelu_fwd_train(const at::Tensor& x, const at::Tensor& z, const at::Tensor& scale,
                                     const at::Tensor& bias, const at::Tensor& running_mean,
                                     const at::Tensor& running_inv_var, const at::Tensor& minibatch_mean,
                                     const at::Tensor& minibatch_inv_var, const at::Tensor& bitmask,
                                     const at::Tensor& ret_cta, const float momentum, const float epsilon,
                                     void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                                     const int bn_group, const at::Tensor& magic_tensor, const int occupancy,
                                     const int grid_dim_x, const bool coop) {
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data_ptr<int>();
  *magic = (*magic + 1) & 0xff;

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNormAddRelu* bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data_ptr<at::Half>(), nullptr, y.data_ptr<at::Half>(), nullptr, z.data_ptr<at::Half>(),
                             nullptr);

  bn->setWeightPointers({scale.data_ptr<float>(), bias.data_ptr<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data_ptr<float>(), running_inv_var.data_ptr<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(minibatch_mean.data_ptr<float>());
  workspace.push_back(minibatch_inv_var.data_ptr<float>());
  workspace.push_back(bitmask.data_ptr<int32_t>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data_ptr<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);

  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwd(stream, my_data, pair_data, pair_data2, pair_data3, bn_group, *magic, occupancy, grid_dim_x, coop);

  return y;
}

at::Tensor nhwc_bn_addrelu_fwd_eval(const at::Tensor& x, const at::Tensor& z, const at::Tensor& scale,
                                    const at::Tensor& bias, const at::Tensor& running_mean,
                                    const at::Tensor& running_inv_var, const at::Tensor& ret_cta, const int bn_group,
                                    const float momentum, const float epsilon) {
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNormAddRelu* bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data_ptr<at::Half>(), nullptr, y.data_ptr<at::Half>(), nullptr, z.data_ptr<at::Half>(),
                             nullptr);

  bn->setWeightPointers({scale.data_ptr<float>(), bias.data_ptr<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data_ptr<float>(), running_inv_var.data_ptr<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data_ptr<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwdInference(stream);

  return y;
}

std::vector<at::Tensor> nhwc_bn_addrelu_bwd(const at::Tensor& x, const at::Tensor& dy, const at::Tensor& scale,
                                            const at::Tensor& bias, const at::Tensor& running_mean,
                                            const at::Tensor& running_inv_var, const at::Tensor& minibatch_mean,
                                            const at::Tensor& minibatch_inv_var, const at::Tensor& bitmask,
                                            const at::Tensor& ret_cta, const float momentum, const float epsilon,
                                            void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                                            const int bn_group, const at::Tensor& magic_tensor, const int occupancy,
                                            const int grid_dim_x, const bool coop) {
  // shape
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data_ptr<int>();
  *magic = (*magic + 1) & 0xff;

  // outputs
  at::Tensor x_grad, z_grad, scale_grad, bias_grad;

  // Allocate outputs
  x_grad = at::empty_like(x);
  z_grad = at::empty_like(x);
  scale_grad = at::empty_like(scale);
  bias_grad = at::empty_like(bias);

  // Create wrapper
  NhwcBatchNormAddRelu* bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data_ptr<at::Half>(), x_grad.data_ptr<at::Half>(), nullptr, dy.data_ptr<at::Half>(),
                             nullptr, z_grad.data_ptr<at::Half>());

  bn->setWeightPointers({scale.data_ptr<float>(), bias.data_ptr<float>()},
                        {scale_grad.data_ptr<float>(), bias_grad.data_ptr<float>()});
  bn->setParameterPointers({running_mean.data_ptr<float>(), running_inv_var.data_ptr<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(minibatch_mean.data_ptr<float>());
  workspace.push_back(minibatch_inv_var.data_ptr<float>());
  workspace.push_back(bitmask.data_ptr<int32_t>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data_ptr<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  bn->dgrad(stream, my_data, pair_data, pair_data2, pair_data3, bn_group, *magic, occupancy, grid_dim_x, coop);

  return std::vector<at::Tensor>{x_grad, z_grad, scale_grad, bias_grad};
}

int nhwc_bn_addrelu_fwd_occupancy() {
  int device_id = -1;
  cudaGetDevice(&device_id);

  // max occupancy supported by the code is 2
  return NhwcBatchNormAddRelu::smem_driven_fwd_occupancy(device_id, 2);
}

int nhwc_bn_addrelu_bwd_occupancy() {
  int device_id = -1;
  cudaGetDevice(&device_id);

  // max occupancy supported by the code is 2
  return NhwcBatchNormAddRelu::smem_driven_bwd_occupancy(device_id, 2);
}


================================================
FILE: apex/contrib/csrc/groupbn/batch_norm_add_relu.h
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * Copyright (c) 2018 by Contributors
 * \file nhwc_batch_norm_add_relu.h
 * \brief CUDA NHWC Batch Normalization code with fused addition
 * \author Shankara Rao Thejaswi Nanditale, Dick Carter, Maxim Milakov, Evgeni Krimer
 */
#ifndef MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_ADD_RELU_H_
#define MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_ADD_RELU_H_

#include <cudnn.h>

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

#include "cuda_utils.h"
#include "nhwc_batch_norm_kernel.h"

#define VERBOSE_DEFAULT false

class NhwcBatchNormAddRelu {
 public:
  NhwcBatchNormAddRelu() {
    name_ = "nhwc_batchnormaddrelu";
    createTensorDescriptor(&X_tensor_desc_);
    createTensorDescriptor(&Y_tensor_desc_);
  }

  ~NhwcBatchNormAddRelu() {
    destroyTensorDescriptor(X_tensor_desc_);
    destroyTensorDescriptor(Y_tensor_desc_);
  }

  void die() {
    std::cerr << "batchnormaddrelu not initialized" << std::endl;
    exit(-1);
  }

  void fwd(cudaStream_t stream, void* my_data, void* pair_data, void* pair_data2, void* pair_data3, const int bn_group,
           const int magic, const int occupancy, const int grid_dim_x, const bool coop);
  void dgrad(cudaStream_t stream, void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
             const int bn_group, const int magic, const int occupancy, const int grid_dim_x, const bool coop);
  void fwdInference(cudaStream_t stream);
  dim3 calc_fwd_grid(int* loop, const int grid_dim_x);
  dim3 calc_bwd_grid(int* loop, const int grid_dim_x);

  void setInputDescriptor(const cudnnTensorFormat_t format, const cudnnDataType_t data_type, int n, int c, int h, int w,
                          int bn_group) {
    m_ = n * h * w;
    int m_bn_adjusted = m_ * bn_group;
    c_ = c;
    // factor to scale sum of squared errors to get saved variance.  Must be 1/nhw.
    svar_inv_count_ = 1.f / m_bn_adjusted;
    // factor to scale sum of squared errors to get running variance. Should be 1/(nhw-1).
    int divisor = m_bn_adjusted - 1;
    // nhw == 1 is unlikely, but by setting the rvar_inv_count_ == 1.f, we avoid running var infs.
    rvar_inv_count_ = divisor == 0 ? 1.f : 1.f / divisor;
    setTensorDescriptor(X_tensor_desc_, format, data_type, n, c, h, w);
  }

  void setOutputDescriptor(const cudnnTensorFormat_t format, const cudnnDataType_t data_type, int n, int c, int h,
                           int w) {
    setTensorDescriptor(Y_tensor_desc_, format, data_type, n, c, h, w);
  }

  const std::vector<size_t> numWorkspaceBytes() const;

  void setWorkspacePointers(const std::vector<void*>& workspace, const std::vector<size_t>& num_workspace_bytes);

  void setInputOutputPointers(void* X, void* dX, void* Y, void* dY, void* addend, void* dAddend) {
    X_ = X;
    dX_ = dX;
    Y_ = Y;
    dY_ = dY;
    addend_ = addend;
    dAddend_ = dAddend;
  }

  // Sets the pointers for the scale and weight (in that order) data and derivative buffers.
  void setWeightPointers(const std::vector<void*>& weight_pointers, const std::vector<void*>& deriv_pointers) {
    assert(weight_pointers.size() == 2);
    assert(deriv_pointers.size() == 2);
    scale_ = static_cast<float*>(weight_pointers[0]);
    bias_ = static_cast<float*>(weight_pointers[1]);
    dscale_ = static_cast<float*>(deriv_pointers[0]);
    dbias_ = static_cast<float*>(deriv_pointers[1]);
  }

  // Sets the pointers for the population mean and variance buffers, in that order.
  void setParameterPointers(const std::vector<void*>& param_pointers) {
    assert(param_pointers.size() == 2);
    population_mean_ = static_cast<float*>(param_pointers[0]);
    population_variance_ = static_cast<float*>(param_pointers[1]);
  }

  void setConstants(const double exp_avg_factor, const double eps) {
    exp_avg_factor_ = exp_avg_factor;
    eps_ = eps;
  }

  void processCudnnStatus(const cudnnStatus_t& status, const std::string& string = std::string(),
                          bool verbose = VERBOSE_DEFAULT) {
    if (status != CUDNN_STATUS_SUCCESS)
      LOG(FATAL) << string << " " << cudnnGetErrorString(status);
    else if (verbose)
      LOG(INFO) << string << " " << cudnnGetErrorString(status);
  }

  void checkCudaStatus(const std::string& string = std::string(), bool verbose = VERBOSE_DEFAULT) {
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
      LOG(FATAL) << string << " " << cudaGetErrorString(status);
    else if (verbose)
      LOG(INFO) << string << " " << cudaGetErrorString(status);
  }

  size_t size_retired_ctas(int grid_y) const {
    // Note that the value of max_grid_y to handle known GPUs is about 160.
    const int max_grid_y = 1024;
    if (grid_y > max_grid_y) LOG(INFO) << "GPU capabilities exceeds assumptions.";
    const int retired_cta_bytes = max_grid_y * 2 * sizeof(int);
    // Since the region will be initialized once and used for many kernels,
    // the idea is to return an ample size that will cover all uses.
    return retired_cta_bytes;
  }

  cudnnTensorDescriptor_t X_tensor_desc_ = nullptr;
  cudnnTensorDescriptor_t Y_tensor_desc_ = nullptr;

  void* X_ = nullptr;
  void* dX_ = nullptr;
  void* Y_ = nullptr;
  void* dY_ = nullptr;
  void* addend_ = nullptr;
  void* dAddend_ = nullptr;

  // Learned scale and bias weights.
  float* scale_ = nullptr;
  float* dscale_ = nullptr;
  float* bias_ = nullptr;
  float* dbias_ = nullptr;

  // Computed population mean and variance parameters.
  float* population_mean_ = nullptr;
  float* population_variance_ = nullptr;

  // Workspace buffers for minibatch mean and variance (computed in fwd, needed by bwd).
  float* minibatch_mean_ = nullptr;
  float* minibatch_variance_ = nullptr;

  int m_ = 0;  // Number of values per channel that BN is normalizing.
  int c_ = 0;  // Number of channels over which BN is normalizing.

  float svar_inv_count_ = 0.f;  // factor to scale sum of squared errors to get saved variance
  float rvar_inv_count_ = 0.f;  // factor to scale sum of squared errors to get running variance

  double exp_avg_factor_ = 0.;
  double eps_ = 0.;
  std::string name_;

 private:
  void setTensorDescriptor(cudnnTensorDescriptor_t descriptor, cudnnTensorFormat_t format, cudnnDataType_t data_type,
                           int n, int c, int h, int w) {
    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
    status = cudnnSetTensor4dDescriptor(descriptor, format, data_type, n, c, h, w);
    processCudnnStatus(status, "set tensor descriptor");
  }

  void createTensorDescriptor(cudnnTensorDescriptor_t* descriptor) {
    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
    status = cudnnCreateTensorDescriptor(descriptor);
    processCudnnStatus(status, "create tensor_descriptor");
  }

  void destroyTensorDescriptor(cudnnTensorDescriptor_t descriptor) {
    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
    status = cudnnDestroyTensorDescriptor(descriptor);
    processCudnnStatus(status, "destroy tensor_descriptor");
  }

 protected:
  float* partial_sums_ = nullptr;
  int* partial_counts_ = nullptr;
  int* retired_ctas_ = nullptr;
  unsigned int* relu_bitmask_ = nullptr;

  void _setFwdParams(NhwcBatchNormFwdParams* params) const;
  void _setFwdInferenceParams(NhwcBatchNormFwdInferenceParams* params) const;
  void _setBwdParams(NhwcBatchNormBwdParams* params) const;

  // @todo: ability to configure these?
  // Kernel params
  static const int USE_ONLINE_APPROACH = 1;
  static const int THREADS_PER_CTA = 512;
  static const int THREADS_PER_PIXEL = 16;
  static const int C_ELEMENTS_PER_CTA = 64;
  static const int ELEMENTS_PER_LDG = C_ELEMENTS_PER_CTA / THREADS_PER_PIXEL;
  static const int MAX_SMEM_WITHOUT_OPT_IN = 48 * 1024;

  typedef uint16_t StorageType;
  // increasing this to 6 causes spills in fwd kernel!
  static const int PIXELS_PER_THREAD_IN_REGISTERS_FWD = 5;
  static const int PIXELS_PER_THREAD_IN_REGISTERS_BWD = 3;
  static const int PIXELS_PER_THREAD_IN_SMEM_FWD = 10;
  static const int PIXELS_PER_THREAD_IN_SMEM_BWD = 5;

  static const int PIXELS_PER_THREAD_FWD = PIXELS_PER_THREAD_IN_REGISTERS_FWD + PIXELS_PER_THREAD_IN_SMEM_FWD;
  static const int PIXELS_PER_THREAD_BWD = PIXELS_PER_THREAD_IN_REGISTERS_BWD + PIXELS_PER_THREAD_IN_SMEM_BWD;
  static const int PIXELS_PER_THREAD_FWD_INFERENCE = 4;

  // Derived params
  static const size_t SMEM_SIZE_FWD =
      PIXELS_PER_THREAD_IN_SMEM_FWD * THREADS_PER_CTA * ELEMENTS_PER_LDG * sizeof(StorageType);
  static const size_t SMEM_SIZE_BWD =
      PIXELS_PER_THREAD_IN_SMEM_BWD * THREADS_PER_CTA * ELEMENTS_PER_LDG * 2 * sizeof(StorageType);
  static const int PIXELS_PER_LDG = THREADS_PER_CTA / THREADS_PER_PIXEL;
  static const int PIXELS_PER_CTA_FWD = THREADS_PER_CTA / THREADS_PER_PIXEL * PIXELS_PER_THREAD_FWD;
  static const int PIXELS_PER_CTA_BWD = THREADS_PER_CTA / THREADS_PER_PIXEL * PIXELS_PER_THREAD_BWD;
  static const int PIXELS_PER_CTA_FWD_INFERENCE = THREADS_PER_CTA / THREADS_PER_PIXEL * PIXELS_PER_THREAD_FWD_INFERENCE;

  // max grid.y in case of group bn is limited by exchange buffer size
  static const int MAX_GBN_BLOCK_Y = 256;

  // Helper function to launch the forward kernel.

  // We calculate (based on smem usage) the achievable occupancy and make sure we run a kernel
  // version that was compiled with that occupancy in its launch bounds.  This way, we avoid
  // needless register spills.
  void _fwdKernelLauncher(cudaStream_t stream, NhwcBatchNormFwdParams params, dim3 grid_dim, int outer_loops,
                          const int occupancy, const bool coop) {
#define LAUNCH_FWD_KERNEL(OUTER_LOOPS, USE_RELU, USE_ADD_RELU, COMPILED_FOR_OCCUPANCY, COOP)                          \
  do {                                                                                                                \
    CHECK(SMEM_SIZE_FWD <= MAX_SMEM_WITHOUT_OPT_IN) << "Nhwc batchnormaddrelu kernel smem too big.";                  \
    auto fwd_func =                                                                                                   \
        nhwc_batch_norm_fwd<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL, PIXELS_PER_THREAD_IN_REGISTERS_FWD,      \
                            PIXELS_PER_THREAD_IN_SMEM_FWD, ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS,        \
                            USE_RELU, USE_ADD_RELU, COMPILED_FOR_OCCUPANCY>;                                          \
    if (COMPILED_FOR_OCCUPANCY > 1) {                                                                                 \
      cudaFuncSetAttribute(fwd_func, cudaFuncAttributePreferredSharedMemoryCarveout, 100);                            \
      checkCudaStatus(name_ + " fwd ser coop kernel (cudaFuncSetAttribute carveout)");                                \
    }                                                                                                                 \
    void* params_ptr = static_cast<void*>(&params);                                                                   \
    using FWD_FUNC = decltype(nhwc_batch_norm_fwd<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL,                    \
                                                  PIXELS_PER_THREAD_IN_REGISTERS_FWD, PIXELS_PER_THREAD_IN_SMEM_FWD,  \
                                                  ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS, USE_RELU,       \
                                                  USE_ADD_RELU, COMPILED_FOR_OCCUPANCY>);                             \
    if (COOP) {                                                                                                       \
      cudaLaunchCooperativeKernel<FWD_FUNC>(fwd_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_FWD, stream); \
    } else {                                                                                                          \
      cudaLaunchKernel<FWD_FUNC>(fwd_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_FWD, stream);            \
    }                                                                                                                 \
    checkCudaStatus(name_ + " fwd ser coop kernel");                                                                  \
  } while (0)

    // Don't try for an occupancy > 2 as this will squeeze register use and create spills.
    if (outer_loops == 1) {
      if (occupancy >= 2)
        LAUNCH_FWD_KERNEL(1, false, true, 2, coop);
      else
        LAUNCH_FWD_KERNEL(1, false, true, 1, coop);
    } else {
      if (occupancy >= 2)
        LAUNCH_FWD_KERNEL(0, false, true, 2, coop);
      else
        LAUNCH_FWD_KERNEL(0, false, true, 1, coop);
    }
#undef LAUNCH_FWD_KERNEL
  }

  // Helper function to launch the backward kernel.

  void _bwdKernelLauncher(cudaStream_t stream, NhwcBatchNormBwdParams params, dim3 grid_dim, int outer_loops,
                          const int occupancy, const bool coop) {
#define LAUNCH_BWD_ADD_RELU_KERNEL(OUTER_LOOPS, COMPILED_FOR_OCCUPANCY, COOP)                                       \
  do {                                                                                                              \
    CHECK(SMEM_SIZE_BWD <= MAX_SMEM_WITHOUT_OPT_IN) << "Nhwc batchnormaddrelu kernel smem too big.";                \
    auto bwd_add_relu_func =                                                                                        \
        nhwc_batch_norm_bwd_add_relu<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL,                               \
                                     PIXELS_PER_THREAD_IN_REGISTERS_BWD, PIXELS_PER_THREAD_IN_SMEM_BWD,             \
                                     ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS, COMPILED_FOR_OCCUPANCY>;   \
    if (COMPILED_FOR_OCCUPANCY > 1) {                                                                               \
      cudaFuncSetAttribute(bwd_add_relu_func, cudaFuncAttributePreferredSharedMemoryCarveout, 100);                 \
      checkCudaStatus(name_ + " bwd-add-relu coop serial kernel (cudaFuncSetAttribute carveout)");                  \
    }                                                                                                               \
    void* params_ptr = static_cast<void*>(&params);                                                                 \
    using BWD_ADD_RELU_FUNC =                                                                                       \
        decltype(nhwc_batch_norm_bwd_add_relu<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL,                      \
                                              PIXELS_PER_THREAD_IN_REGISTERS_BWD, PIXELS_PER_THREAD_IN_SMEM_BWD,    \
                                              ELEMENTS_PER_LDG, USE_ONLINE_APPROACH, OUTER_LOOPS,                   \
                                              COMPILED_FOR_OCCUPANCY>);                                             \
    if (COOP) {                                                                                                     \
      cudaLaunchCooperativeKernel<BWD_ADD_RELU_FUNC>(bwd_add_relu_func, grid_dim, THREADS_PER_CTA, &params_ptr,     \
                                                     SMEM_SIZE_BWD, stream);                                        \
    } else {                                                                                                        \
      cudaLaunchKernel<BWD_ADD_RELU_FUNC>(bwd_add_relu_func, grid_dim, THREADS_PER_CTA, &params_ptr, SMEM_SIZE_BWD, \
                                          stream);                                                                  \
    }                                                                                                               \
    checkCudaStatus(name_ + " bwd-add-relu coop serial kernel");                                                    \
  } while (0)

    // Don't try for an occupancy > 2 as this will squeeze register use and create spills.
    if (outer_loops == 1) {
      if (occupancy >= 2)
        LAUNCH_BWD_ADD_RELU_KERNEL(1, 2, coop);
      else
        LAUNCH_BWD_ADD_RELU_KERNEL(1, 1, coop);
    } else {
      if (occupancy >= 2)
        LAUNCH_BWD_ADD_RELU_KERNEL(0, 2, coop);
      else
        LAUNCH_BWD_ADD_RELU_KERNEL(0, 1, coop);
    }
#undef LAUNCH_BWD_KERNEL
  }

 public:
  // Calculate the expected fwd kernel occupancy, as dictated by shared memory usage.
  static int smem_driven_fwd_occupancy(int device_id, const int max_cta_per_sm) {
    using namespace at::cuda::utils;
    int fwd_reduction_bytes = THREADS_PER_PIXEL * (THREADS_PER_CTA / 32) * ELEMENTS_PER_LDG * sizeof(float);
    int fwd_smem_bytes = SMEM_SIZE_FWD + fwd_reduction_bytes;
    int occupancy = MaxSharedMemoryPerMultiprocessor(device_id) / fwd_smem_bytes;
    return std::min(max_cta_per_sm, occupancy);
  }

  // Calculate the expected bwd kernel occupancy, as dictated by shared memory usage.
  static int smem_driven_bwd_occupancy(int device_id, const int max_cta_per_sm) {
    using namespace at::cuda::utils;
    int bwd_reduction_bytes = THREADS_PER_PIXEL * (THREADS_PER_CTA / 32) * ELEMENTS_PER_LDG * sizeof(float);
    int bwd_smem_bytes = SMEM_SIZE_BWD + bwd_reduction_bytes;
    int occupancy = MaxSharedMemoryPerMultiprocessor(device_id) / bwd_smem_bytes;
    return std::min(max_cta_per_sm, occupancy);
  }
};

const std::vector<size_t> NhwcBatchNormAddRelu::numWorkspaceBytes() const {
  assert(c_ > 0);

  // choose the max memory required between fwd/bwd passes
  int grid_x_fwd = div_up(m_, PIXELS_PER_CTA_FWD);
  int grid_x_bwd = div_up(m_, PIXELS_PER_CTA_BWD);
  int grid_x = max(grid_x_fwd, grid_x_bwd);
  int grid_y = div_up(c_, C_ELEMENTS_PER_CTA);

  const size_t num_mean_bytes = c_ * sizeof(float);
  const size_t num_variance_bytes = num_mean_bytes;

  int elems_per_group = ((m_ + 31) & ~31) * 2;
  int group_count = div_up(c_, C_ELEMENTS_PER_CTA);
  const size_t bitmask_bytes = elems_per_group * group_count * sizeof(unsigned int);

  const size_t size_sums = grid_y * grid_x * THREADS_PER_PIXEL * ELEMENTS_PER_LDG * 2 * sizeof(float);
  const size_t size_counts = grid_y * grid_x * sizeof(int);

  return {num_mean_bytes, num_variance_bytes, bitmask_bytes, size_retired_ctas(grid_y), size_sums, size_counts};
}

void NhwcBatchNormAddRelu::setWorkspacePointers(const std::vector<void*>& workspace,
                                                const std::vector<size_t>& num_workspace_bytes) {
  assert(workspace.size() == 6);
  assert(num_workspace_bytes.size() == 6);

  minibatch_mean_ = static_cast<float*>(workspace[0]);
  minibatch_variance_ = static_cast<float*>(workspace[1]);
  relu_bitmask_ = static_cast<unsigned int*>(workspace[2]);
  retired_ctas_ = static_cast<int*>(workspace[3]);
  partial_sums_ = static_cast<float*>(workspace[4]);
  partial_counts_ = static_cast<int*>(workspace[5]);
}

void NhwcBatchNormAddRelu::_setFwdParams(NhwcBatchNormFwdParams* params) const {
  params->gmem_src = static_cast<uint16_t*>(X_);
  params->gmem_dst = static_cast<uint16_t*>(Y_);
  params->gmem_src1 = static_cast<uint16_t*>(addend_);
  params->gmem_bias = bias_;
  params->gmem_scale = scale_;
  params->gmem_running_mean = population_mean_;
  params->gmem_running_var = population_variance_;
  params->gmem_saved_mean = minibatch_mean_;
  params->gmem_saved_var = minibatch_variance_;
  params->gmem_relu_bitmask = relu_bitmask_;
  params->nhw = m_;
  params->c = c_;
  params->svar_inv_count = svar_inv_count_;
  params->rvar_inv_count = rvar_inv_count_;
  params->gmem_sums = partial_sums_;
  params->gmem_counts = partial_counts_;
  params->gmem_retired_ctas = retired_ctas_;
  params->var_eps = eps_;
  params->outer_loops = 0;
  params->exp_avg_factor = static_cast<float>(exp_avg_factor_);
  params->c_blks = div_up(c_, C_ELEMENTS_PER_CTA);
}

void NhwcBatchNormAddRelu::_setFwdInferenceParams(NhwcBatchNormFwdInferenceParams* params) const {
  params->gmem_src = static_cast<uint16_t*>(X_);
  params->gmem_dst = static_cast<uint16_t*>(Y_);
  params->gmem_src1 = static_cast<uint16_t*>(addend_);
  params->gmem_bias = bias_;
  params->gmem_scale = scale_;
  params->gmem_mean = population_mean_;
  params->gmem_var = population_variance_;
  params->nhw = m_;
  params->c = c_;
  params->var_eps = eps_;
}

void NhwcBatchNormAddRelu::_setBwdParams(NhwcBatchNormBwdParams* params) const {
  params->gmem_src = static_cast<uint16_t*>(X_);
  params->gmem_dy = static_cast<uint16_t*>(dY_);
  params->gmem_dst = static_cast<uint16_t*>(dX_);
  params->gmem_dst1 = static_cast<uint16_t*>(dAddend_);
  params->gmem_relu_bitmask = relu_bitmask_;
  params->gmem_dscale = dscale_;
  params->gmem_dbias = dbias_;
  params->gmem_scale = scale_;
  params->gmem_bias = bias_;
  params->gmem_saved_mean = minibatch_mean_;
  params->gmem_saved_var = minibatch_variance_;
  params->nhw = m_;
  params->c = c_;
  params->svar_inv_count = svar_inv_count_;
  params->gmem_sums = partial_sums_;
  params->gmem_retired_ctas = retired_ctas_;
  params->outer_loops = 0;
  params->c_blks = div_up(c_, C_ELEMENTS_PER_CTA);
}

void NhwcBatchNormAddRelu::fwdInference(cudaStream_t stream) {
  bool ptrs_are_set = X_tensor_desc_ != nullptr && Y_tensor_desc_ != nullptr && scale_ != nullptr &&
                      bias_ != nullptr
                      //      && minibatch_mean_ != nullptr
                      //      && minibatch_variance_ != nullptr
                      && population_mean_ != nullptr && population_variance_ != nullptr &&
                      X_ != nullptr
                      //      && dX_ != nullptr
                      && Y_ != nullptr &&
                      addend_ != nullptr
                      //      && dY_ != nullptr
                      //      && dscale_ != nullptr
                      //      && dbias_ != nullptr
                      && partial_sums_ != nullptr && partial_counts_ != nullptr;

  if (!ptrs_are_set) die();

  dim3 grid_dim;
  grid_dim.x = div_up(m_, PIXELS_PER_CTA_FWD_INFERENCE);
  grid_dim.y = div_up(c_, C_ELEMENTS_PER_CTA);

  // @todo: maybe just move this inside initialize routine?
  NhwcBatchNormFwdInferenceParams params;
  _setFwdInferenceParams(&params);

  nhwc_batch_norm_fwd_inference<StorageType, THREADS_PER_CTA, THREADS_PER_PIXEL, ELEMENTS_PER_LDG, false, true>
      <<<grid_dim, THREADS_PER_CTA, 0, stream>>>(params);
  checkCudaStatus(name_ + " fwd_inference-relu kernel");
}

dim3 NhwcBatchNormAddRelu::calc_fwd_grid(int* loop, const int grid_dim_x) {
  dim3 grid_dim;
  grid_dim.x = div_up(m_, PIXELS_PER_CTA_FWD);
  int c_blks = div_up(c_, C_ELEMENTS_PER_CTA);
  unsigned int max_grid_x = grid_dim_x;
  if (grid_dim.x <= max_grid_x) {
    *loop = 1;
    if (max_grid_x / grid_dim.x > 1) {
      grid_dim.y = std::min(c_blks, static_cast<int>(max_grid_x / grid_dim.x));
      assert(grid_dim.y < MAX_GBN_BLOCK_Y);  // FIXME: turn into a loop
    } else {
      grid_dim.y = 1;
    }
  } else {
    grid_dim.x = max_grid_x;
    grid_dim.y = 1;
    int nhw_in_regs = m_ - PIXELS_PER_THREAD_IN_SMEM_FWD * PIXELS_PER_LDG * grid_dim.x;
    int pixels_per_iteration = PIXELS_PER_THREAD_IN_REGISTERS_FWD * PIXELS_PER_LDG * grid_dim.x;
    *loop = div_up(nhw_in_regs, pixels_per_iteration);
  }
  return grid_dim;
}

dim3 NhwcBatchNormAddRelu::calc_bwd_grid(int* loop, const int grid_dim_x) {
  dim3 grid_dim;
  grid_dim.x = div_up(m_, PIXELS_PER_CTA_BWD);
  int c_blks = div_up(c_, C_ELEMENTS_PER_CTA);
  unsigned int max_grid_x = grid_dim_x;
  if (grid_dim.x <= max_grid_x) {
    *loop = 1;
    if (max_grid_x / grid_dim.x > 1) {
      grid_dim.y = std::min(c_blks, static_cast<int>(max_grid_x / grid_dim.x));
      assert(grid_dim.y < MAX_GBN_BLOCK_Y);  // FIXME: turn into a loop
    } else {
      grid_dim.y = 1;
    }
  } else {
    grid_dim.x = max_grid_x;
    grid_dim.y = 1;
    int nhw_in_regs = m_ - PIXELS_PER_THREAD_IN_SMEM_BWD * PIXELS_PER_LDG * grid_dim.x;
    int pixels_per_iteration = PIXELS_PER_THREAD_IN_REGISTERS_BWD * PIXELS_PER_LDG * grid_dim.x;
    *loop = div_up(nhw_in_regs, pixels_per_iteration);
  }
  return grid_dim;
}

void NhwcBatchNormAddRelu::fwd(cudaStream_t stream, void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                               const int bn_group, const int magic, const int occupancy, const int grid_dim_x,
                               const bool coop) {
  bool ptrs_are_set = X_tensor_desc_ != nullptr && Y_tensor_desc_ != nullptr && scale_ != nullptr && bias_ != nullptr &&
                      minibatch_mean_ != nullptr && minibatch_variance_ != nullptr && relu_bitmask_ != nullptr &&
                      population_mean_ != nullptr && population_variance_ != nullptr &&
                      X_ != nullptr
                      //      && dX_ != nullptr
                      && Y_ != nullptr &&
                      addend_ != nullptr
                      //      && dY_ != nullptr
                      //      && dscale_ != nullptr
                      //      && dbias_ != nullptr
                      && partial_sums_ != nullptr && partial_counts_ != nullptr && retired_ctas_ != nullptr;

  if (!ptrs_are_set) die();

  // reset of retired_cta_count no longer needed

  NhwcBatchNormFwdParams params;
  _setFwdParams(&params);

  params.my_data = my_data;
  params.pair_datas[0] = pair_data;
  params.pair_datas[1] = pair_data2;
  params.pair_datas[2] = pair_data3;
  params.magic = magic;
  params.sync_iters = (bn_group == 8) ? 3 : (bn_group >> 1);

  dim3 grid_dim = calc_fwd_grid(&params.outer_loops, grid_dim_x);
  _fwdKernelLauncher(stream, params, grid_dim, params.outer_loops, occupancy, coop);
}

void NhwcBatchNormAddRelu::dgrad(cudaStream_t stream, void* my_data, void* pair_data, void* pair_data2,
                                 void* pair_data3, const int bn_group, const int magic, const int occupancy,
                                 const int grid_dim_x, const bool coop) {
  bool ptrs_are_set = X_tensor_desc_ != nullptr && Y_tensor_desc_ != nullptr && scale_ != nullptr && bias_ != nullptr &&
                      minibatch_mean_ != nullptr && minibatch_variance_ != nullptr &&
                      relu_bitmask_ != nullptr
                      //      && population_mean_ != nullptr
                      //      && population_variance_ != nullptr
                      && X_ != nullptr &&
                      dX_ != nullptr
                      //      && Y_ != nullptr
                      && dY_ != nullptr && dAddend_ != nullptr && dscale_ != nullptr && dbias_ != nullptr &&
                      retired_ctas_ != nullptr;

  if (!ptrs_are_set) die();

  // reset of retired_cta_count no longer needed

  NhwcBatchNormBwdParams params;
  _setBwdParams(&params);

  params.my_data = my_data;
  params.pair_datas[0] = pair_data;
  params.pair_datas[1] = pair_data2;
  params.pair_datas[2] = pair_data3;
  params.magic = magic;
  params.sync_iters = (bn_group == 8) ? 3 : (bn_group >> 1);
  params.wgrad_coeff = 1.0 / bn_group;

  dim3 grid_dim = calc_bwd_grid(&params.outer_loops, grid_dim_x);
  _bwdKernelLauncher(stream, params, grid_dim, params.outer_loops, occupancy, coop);
}

#endif  // MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_ADD_RELU_H_


================================================
FILE: apex/contrib/csrc/groupbn/cuda_utils.h
================================================
#include <ATen/cuda/CUDAContext.h>
#ifndef CUDA_UTILS_H
#define CUDA_UTILS_H

namespace at {
namespace cuda {

namespace utils {

static inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
  return getDeviceProperties(device_id)->sharedMemPerMultiprocessor;
}

}  // namespace utils
}  // namespace cuda
}  // namespace at

#endif


================================================
FILE: apex/contrib/csrc/groupbn/interface.cpp
================================================
#include <ATen/ATen.h>
#include <ATen/ArrayRef.h>
#include <ATen/ScalarType.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/extension.h>

#include "ATen/Generator.h"
#include "ATen/Scalar.h"
#include "ATen/Storage.h"
#include "ATen/Tensor.h"

namespace py = pybind11;

int64_t get_buffer_size(const int bn_sync_steps);

void* get_data_ptr(const at::Tensor& data);

void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset);

void close_remote_data(const at::Tensor& handle);

at::Tensor nhwc_bn_fwd_train(const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                             const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                             const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                             const at::Tensor& ret_cta, const float momentum, const float epsilon, const bool fuse_relu,
                             void* my_data, void* pair_data, void* pair_data2, void* pair_data3, const int bn_group,
                             const at::Tensor& magic_tensor, const int occupancy, const int grid_dim_x,
                             const bool coop);

at::Tensor nhwc_bn_fwd_eval(const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                            const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                            const at::Tensor& ret_cta, const int bn_group, const float momentum, const float epsilon,
                            const bool fuse_relu);

std::vector<at::Tensor> nhwc_bn_bwd(const at::Tensor& x, const at::Tensor& dy, const at::Tensor& scale,
                                    const at::Tensor& bias, const at::Tensor& running_mean,
                                    const at::Tensor& running_inv_var, const at::Tensor& minibatch_mean,
                                    const at::Tensor& minibatch_inv_var, const at::Tensor& ret_cta,
                                    const float momentum, const float epsilon, const bool fuse_relu, void* my_data,
                                    void* pair_data, void* pair_data2, void* pair_data3, const int bn_group,
                                    const at::Tensor& magic_tensor, const int occupancy, const int grid_dim_x,
                                    const bool coop);

at::Tensor nhwc_bn_addrelu_fwd_train(const at::Tensor& x, const at::Tensor& z, const at::Tensor& scale,
                                     const at::Tensor& bias, const at::Tensor& running_mean,
                                     const at::Tensor& running_inv_var, const at::Tensor& minibatch_mean,
                                     const at::Tensor& minibatch_inv_var, const at::Tensor& bitmask,
                                     const at::Tensor& ret_cta, const float momentum, const float epsilon,
                                     void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                                     const int bn_group, const at::Tensor& magic_tensor, const int occupancy,
                                     const int grid_dim_x, const bool coop);

at::Tensor nhwc_bn_addrelu_fwd_eval(const at::Tensor& x, const at::Tensor& z, const at::Tensor& scale,
                                    const at::Tensor& bias, const at::Tensor& running_mean,
                                    const at::Tensor& running_inv_var, const at::Tensor& ret_cta, const int bn_group,
                                    const float momentum, const float epsilon);

std::vector<at::Tensor> nhwc_bn_addrelu_bwd(const at::Tensor& x, const at::Tensor& dy, const at::Tensor& scale,
                                            const at::Tensor& bias, const at::Tensor& running_mean,
                                            const at::Tensor& running_inv_var, const at::Tensor& minibatch_mean,
                                            const at::Tensor& minibatch_inv_var, const at::Tensor& bitmask,
                                            const at::Tensor& ret_cta, const float momentum, const float epsilon,
                                            void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                                            const int bn_group, const at::Tensor& magic_tensor, const int occupancy,
                                            const int grid_dim_x, const bool coop);

int nhwc_bn_fwd_occupancy();
int nhwc_bn_bwd_occupancy();

int nhwc_bn_addrelu_fwd_occupancy();
int nhwc_bn_addrelu_bwd_occupancy();

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_buffer_size", &get_buffer_size, "get_buffer_size", py::call_guard<py::gil_scoped_release>());
  m.def("get_data_ptr", &get_data_ptr, "get_data_ptr", py::call_guard<py::gil_scoped_release>());
  m.def("get_remote_data_ptr", &get_remote_data_ptr, "get_remote_data_ptr", py::call_guard<py::gil_scoped_release>());
  m.def("close_remote_data", &close_remote_data, "close_remote_data", py::call_guard<py::gil_scoped_release>());

  m.def("bn_fwd_nhwc", &nhwc_bn_fwd_train, "bn_fwd_nhwc", py::call_guard<py::gil_scoped_release>());
  m.def("bn_fwd_eval_nhwc", &nhwc_bn_fwd_eval, "bn_fwd_eval_nhwc", py::call_guard<py::gil_scoped_release>());
  m.def("bn_bwd_nhwc", &nhwc_bn_bwd, "bn_bwd_nhwc", py::call_guard<py::gil_scoped_release>());

  m.def("bn_fwd_nhwc_occupancy", &nhwc_bn_fwd_occupancy, "bn_fwd_nhwc_occupancy",
        py::call_guard<py::gil_scoped_release>());
  m.def("bn_bwd_nhwc_occupancy", &nhwc_bn_bwd_occupancy, "bn_bwd_nhwc_occupancy",
        py::call_guard<py::gil_scoped_release>());

  m.def("bn_addrelu_fwd_nhwc", &nhwc_bn_addrelu_fwd_train, "bn_addrelu_fwd_nhwc",
        py::call_guard<py::gil_scoped_release>());
  m.def("bn_addrelu_fwd_eval_nhwc", &nhwc_bn_addrelu_fwd_eval, "bn_addrelu_fwd_eval_nhwc",
        py::call_guard<py::gil_scoped_release>());
  m.def("bn_addrelu_bwd_nhwc", &nhwc_bn_addrelu_bwd, "bn_addrelu_bwd_nhwc", py::call_guard<py::gil_scoped_release>());

  m.def("bn_addrelu_fwd_nhwc_occupancy", &nhwc_bn_addrelu_fwd_occupancy, "bn_addrelu_fwd_nhwc_occupancy",
        py::call_guard<py::gil_scoped_release>());
  m.def("bn_addrelu_bwd_nhwc_occupancy", &nhwc_bn_addrelu_bwd_occupancy, "bn_addrelu_bwd_nhwc_occupancy",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/groupbn/ipc.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>

#define cudaCheckErrors(msg)                                                                                  \
  do {                                                                                                        \
    cudaError_t __err = cudaGetLastError();                                                                   \
    if (__err != cudaSuccess) {                                                                               \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING\n");                                                             \
      exit(1);                                                                                                \
    }                                                                                                         \
  } while (0)

template <>
struct std::hash<cudaIpcMemHandle_t> {
  size_t operator()(const cudaIpcMemHandle_t& handle) const {
    size_t hash = 0;
    uint8_t* ptr = (uint8_t*)&handle;
    assert(sizeof(uint8_t) == 1);
    for (int i = 0; i < sizeof(cudaIpcMemHandle_t); i++) {
      hash += *ptr;
      ptr++;
    }
    return hash;
  }
};

template <>
struct std::equal_to<cudaIpcMemHandle_t> {
  bool operator()(const cudaIpcMemHandle_t& lhs, const cudaIpcMemHandle_t& rhs) const {
    return (std::memcmp((void*)&lhs, (void*)&rhs, sizeof(cudaIpcMemHandle_t)) == 0);
  }
};

namespace {

namespace gpuipc {
// from: src/operator/nn/cudnn/nhwc_batch_norm_kernel.h
//  The number of threads per pixel.
const int THREADS_PER_PIXEL = 16;
// The number of elements per ldg.
const int ELEMENTS_PER_LDG = 4;
// The number of reducing ops, each uses its own space : mean, var, dscale, dbias
const int REDUCE_OPS = 4;
// Maximum block.y supported - limited due to buffer allocation
const int MAX_BLOCK_Y = 256;
const int MAX_OFFSET = REDUCE_OPS * MAX_BLOCK_Y;
const int BYTES_PER_ELEM = 4;
// Buffer size per sync step
const int SINGLE_SYNC_BUFFER_BYTES = MAX_OFFSET * THREADS_PER_PIXEL * 2 * ELEMENTS_PER_LDG * BYTES_PER_ELEM;
};  // namespace gpuipc

class IpcMemHandleRegistry {
 public:
  void* getPtr(const cudaIpcMemHandle_t& handle, int64_t offset) {
    if (registry_.count(handle) == 0) {
      registry_.insert(std::make_pair(handle, RegistryEntry()));
      registry_[handle].dev_ptr = ipcOpenMem(handle);
    }
    registry_[handle].ref_count++;
    return (((uint8_t*)registry_[handle].dev_ptr) + offset);
  }

  void releasePtr(const cudaIpcMemHandle_t& handle) {
    if (registry_.count(handle) == 0) {
    }
    if (--registry_[handle].ref_count == 0) {
      ipcCloseMem(registry_[handle].dev_ptr);
      registry_.erase(handle);
    }
  }

  struct RegistryEntry {
    void* dev_ptr;
    int ref_count;
    RegistryEntry() : dev_ptr(NULL), ref_count(0) {}
  };

 protected:
  std::unordered_map<cudaIpcMemHandle_t, RegistryEntry> registry_;

  void* ipcOpenMem(const cudaIpcMemHandle_t& handle) {
    void* data;
    cudaIpcOpenMemHandle(&data, handle, cudaIpcMemLazyEnablePeerAccess);
    cudaCheckErrors("ipc init");
    return data;
  }

  void ipcCloseMem(void* dev_ptr) {
    cudaIpcCloseMemHandle(dev_ptr);
    cudaCheckErrors("ipc close");
  }
};

}  // namespace

static IpcMemHandleRegistry ipc_mem_registry;

int64_t get_buffer_size(const int bn_sync_steps) { return bn_sync_steps * gpuipc::SINGLE_SYNC_BUFFER_BYTES; }

void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) {
  cudaIpcMemHandle_t my_handle;
  memcpy((unsigned char*)(&my_handle), handle.data_ptr<uint8_t>(), sizeof(my_handle));
  return ipc_mem_registry.getPtr(my_handle, offset);
}

void close_remote_data(const at::Tensor& handle) {
  cudaIpcMemHandle_t my_handle;
  memcpy((unsigned char*)(&my_handle), handle.data_ptr<uint8_t>(), sizeof(my_handle));
  ipc_mem_registry.releasePtr(my_handle);
}

void* get_data_ptr(const at::Tensor& data) { return data.data_ptr<uint8_t>(); }


================================================
FILE: apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * Copyright (c) 2018 by Contributors
 * \file nhwc_batch_norm_kernel.h
 * \brief CUDA NHWC Batch Normalization code
 * \author Shankara Rao Thejaswi Nanditale, Dick Carter, Maxim Milakov, Evgeni Krimer
 */
#ifndef MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_KERNEL_H_
#define MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_KERNEL_H_

#include <stdint.h>

#include <algorithm>

#define DEVICE_FUNCTION static inline __device__

// CTA margin used by cooperative launch. Can be overridden by env var NHWC_BATCHNORM_LAUNCH_MARGIN.
#define NHWC_BATCHNORM_LAUNCH_MARGIN_MIN 3
#define NHWC_BATCHNORM_LAUNCH_MARGIN_DEFAULT NHWC_BATCHNORM_LAUNCH_MARGIN_MIN

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T, int ELEMENTS_PER_LDG>
struct PackedStorage {
  enum { PACKED_ELEMENTS_PER_LDG = ELEMENTS_PER_LDG };
  typedef T Type;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int ELEMENTS_PER_LDG>
struct PackedStorage<uint16_t, ELEMENTS_PER_LDG> {
  enum { PACKED_ELEMENTS_PER_LDG = ELEMENTS_PER_LDG / 2 };
  typedef int Type;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void from_float(int (&dst)[N], const float (&src)[2 * N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    uint16_t lo, hi;
    asm volatile("cvt.rn.f16.f32 %0, %1;" : "=h"(lo) : "f"(src[2 * i + 0]));
    asm volatile("cvt.rn.f16.f32 %0, %1;" : "=h"(hi) : "f"(src[2 * i + 1]));
    asm volatile("mov.b32 %0, {%1, %2};" : "=r"(dst[i]) : "h"(lo), "h"(hi));
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void from_float(float (&dst)[N], const float (&src)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    dst[i] = src[i];
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void to_float(float (&dst)[2 * N], int (&src)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    uint16_t lo, hi;
    asm volatile("mov.b32 {%0, %1}, %2;" : "=h"(lo), "=h"(hi) : "r"(src[i]));
    asm volatile("cvt.f32.f16 %0, %1;" : "=f"(dst[2 * i + 0]) : "h"(lo));
    asm volatile("cvt.f32.f16 %0, %1;" : "=f"(dst[2 * i + 1]) : "h"(hi));
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void to_float(float (&dst)[N], float (&src)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    dst[i] = src[i];
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void ldg(int (&dst)[1], const uint16_t* gmem) { dst[0] = __ldg((const int*)gmem); }

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void ldg_stream(int (&dst)[1], const uint16_t* gmem) {
  unsigned int tmp;
  asm volatile("ld.global.cs.nc.s32 %0, [%1];" : "=r"(tmp) : "l"((const uint*)gmem));
  dst[0] = tmp;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void ldg(int (&dst)[2], const uint16_t* gmem) {
  int2 tmp = __ldg((const int2*)gmem);
  dst[0] = tmp.x;
  dst[1] = tmp.y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void ldg_stream(int (&dst)[2], const uint16_t* gmem) {
  int2 tmp;
  asm volatile("ld.global.cs.nc.v2.s32 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : "l"((const int2*)gmem));
  dst[0] = tmp.x;
  dst[1] = tmp.y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void ldg(float (&dst)[N], const uint16_t* gmem) {
  int tmp[N / 2];
  ldg(tmp, gmem);
  to_float(dst, tmp);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void ldg_stream(float (&dst)[N], const uint16_t* gmem) {
  int tmp[N / 2];
  ldg_stream(tmp, gmem);
  to_float(dst, tmp);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void stg(uint16_t* gmem, int (&src)[1]) { reinterpret_cast<int*>(gmem)[0] = src[0]; }

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void stg_stream(uint16_t* gmem, int (&src)[1]) {
  unsigned int tmp = src[0];
  asm volatile("st.global.cs.s32 [%0], %1;" ::"l"((uint*)gmem), "r"(tmp));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void stg(uint16_t* gmem, int (&src)[2]) {
  reinterpret_cast<int2*>(gmem)[0] = make_int2(src[0], src[1]);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void stg_stream(uint16_t* gmem, int (&src)[2]) {
  asm volatile("st.global.cs.v2.s32 [%0], {%1,%2};" ::"l"((uint*)gmem), "r"(src[0]), "r"(src[1]));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void stg(uint16_t* gmem, float (&src)[N]) {
  int tmp[N / 2];
  from_float(tmp, src);
  stg(gmem, tmp);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void stg_stream(uint16_t* gmem, float (&src)[N]) {
  int tmp[N / 2];
  from_float(tmp, src);
  stg_stream(gmem, tmp);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void read_from_gmem(float (&dst)[2], const float* gmem, int idx) {
  float2 tmp = __ldg(reinterpret_cast<const float2*>(&gmem[2 * idx]));
  dst[0] = tmp.x;
  dst[1] = tmp.y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void read_from_gmem(float (&dst)[4], const float* gmem, int idx) {
  float4 tmp = __ldg(reinterpret_cast<const float4*>(&gmem[4 * idx]));
  dst[0] = tmp.x;
  dst[1] = tmp.y;
  dst[2] = tmp.z;
  dst[3] = tmp.w;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void read_from_smem(float (&x)[2], const float* smem, int idx) {
  float2 tmp = *(const float2*)&smem[2 * idx];
  x[0] = tmp.x;
  x[1] = tmp.y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void read_from_smem(int (&x)[1], const int* smem, int idx) { x[0] = smem[idx]; }

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void read_from_smem(float (&x)[4], const float* smem, int idx) {
  float4 tmp = *(const float4*)&smem[4 * idx];
  x[0] = tmp.x;
  x[1] = tmp.y;
  x[2] = tmp.z;
  x[3] = tmp.w;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void read_from_smem(int (&x)[2], const int* smem, int idx) {
  int2 tmp = *(const int2*)&smem[2 * idx];
  x[0] = tmp.x;
  x[1] = tmp.y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void write_to_gmem(float* gmem, int idx, const float (&src)[2]) {
  reinterpret_cast<float2*>(&gmem[2 * idx])[0] = make_float2(src[0], src[1]);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void write_to_gmem(float* gmem, int idx, const float (&src)[4]) {
  reinterpret_cast<float4*>(&gmem[4 * idx])[0] = make_float4(src[0], src[1], src[2], src[3]);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void scaled_write_to_gmem(float* gmem, int idx, const float (&src)[4], const float coeff) {
  reinterpret_cast<float4*>(&gmem[4 * idx])[0] =
      make_float4(src[0] * coeff, src[1] * coeff, src[2] * coeff, src[3] * coeff);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void write_to_smem(float* smem, int idx, const float (&x)[2]) {
  reinterpret_cast<float2*>(&smem[2 * idx])[0] = make_float2(x[0], x[1]);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void write_to_smem(int* smem, int idx, const int (&x)[1]) { smem[idx] = x[0]; }

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void write_to_smem(float* smem, int idx, const float (&x)[4]) {
  reinterpret_cast<float4*>(&smem[4 * idx])[0] = make_float4(x[0], x[1], x[2], x[3]);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

DEVICE_FUNCTION void write_to_smem(int* smem, int idx, const int (&x)[2]) {
  reinterpret_cast<int2*>(&smem[2 * idx])[0] = make_int2(x[0], x[1]);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void zero_array(int (&dst)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    dst[i] = 0;
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void zero_array(float (&dst)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    dst[i] = 0.f;
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void add(float (&x)[N], const float (&y)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    x[i] += y[i];
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void multiply(float (&x)[N], const float (&y)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    x[i] *= y[i];
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void scale_(float (&x)[N], float scalar) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    x[i] *= scalar;
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void normalize(float (&x)[N], const float (&bias)[N], const float (&scale)[N], const float (&m1)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    x[i] = bias[i] + scale[i] * (x[i] - m1[i]);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Storage>
DEVICE_FUNCTION Storage relu(Storage in) {
  Storage zero = (Storage)0.f;
  return (in < zero) ? zero : in;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void relu_activation(float (&x)[N]) {
#pragma unroll
  for (int i = 0; i < N; ++i) {
    x[i] = relu(x[i]);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////
template <int THREADS_PER_CTA>
DEVICE_FUNCTION void parallel_sums_16x2(float* smem, float (&x)[4], int nhw, void* params_my_data,
                                        void** params_pair_datas, int off, const int magic, const int sync_iters) {
  // The size of a warp.
  const int THREADS_PER_WARP = 32;
  // The number of warps in a CTA.
  const int WARPS_PER_CTA = THREADS_PER_CTA / THREADS_PER_WARP;
  // The number of threads per pixel.
  const int THREADS_PER_PIXEL = 16;
  // The number of elements per ldg.
  const int ELEMENTS_PER_LDG = 4;
  // The number of reducing ops, each uses its own space : mean, var, dscale, dbias
  const int REDUCE_OPS = 4;
  // Maximum block.y supported - limited due to buffer allocation
  const int MAX_BLOCK_Y = 256;
  const int MAX_OFFSET = REDUCE_OPS * MAX_BLOCK_Y;
  // The warp decomposition.
  const int warp_id = threadIdx.x / THREADS_PER_WARP;
  const int lane_id = threadIdx.x % THREADS_PER_WARP;
  // total size of data per sync iter
  const int data_total = MAX_OFFSET * THREADS_PER_PIXEL * ELEMENTS_PER_LDG * 2;

#pragma unroll
  for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
    x[i] += __shfl_sync(0xffffffffU, x[i], THREADS_PER_PIXEL + lane_id);
  }

  // The warp leaders, write to SMEM.
  if (lane_id < THREADS_PER_PIXEL) {
    write_to_smem(smem, warp_id * THREADS_PER_PIXEL + lane_id, x);
  }

  // The data is in SMEM. Do the final reduction.
  __syncthreads();

  // The 1st warp does all the work.
  // We do the final reduction each half-warp sequentially reduces the final values.
  if (warp_id == 0) {
    read_from_smem(x, smem, threadIdx.x);

#pragma unroll
    for (int offset = 1; offset < WARPS_PER_CTA / (THREADS_PER_WARP / THREADS_PER_PIXEL); ++offset) {
      float y[ELEMENTS_PER_LDG];
      // Read the mean and variance from the other pixel.
      read_from_smem(y, smem, threadIdx.x + offset * THREADS_PER_WARP);
      // Compute the updated sum.
      add(x, y);
    }

    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      x[i] += __shfl_sync(0xffffffffU, x[i], THREADS_PER_PIXEL + lane_id);
    }

    // Make sure the data was read from SMEM.
    __syncwarp();

    // Store the final values.
    if (threadIdx.x < THREADS_PER_PIXEL) {
      // probably could do it earlier, before sync

      for (int sync_iter = 0; sync_iter < sync_iters; ++sync_iter) {
        // float* params_pair_data = (reinterpret_cast<float**>(params_pair_datas))[sync_iter];
        void* params_pair_data = params_pair_datas[sync_iter];

        // skip the space consumed by previous sync iterations
        const int xbuf_offset = sync_iter * data_total;
        // data starts after flags, but have to skip previous
        const int data_offset =
            xbuf_offset + off * ELEMENTS_PER_LDG * THREADS_PER_PIXEL * 2 + ELEMENTS_PER_LDG * threadIdx.x * 2;

        // after sums for this GPU were computed, let CTA0 broadcast the sum to over GPU
        if (blockIdx.x == 0) {
          volatile float* write_data = &((reinterpret_cast<float*>(params_pair_data))[data_offset]);

          // write the data to memory region to be reflected to other GPU
          asm volatile("st.global.wt.v4.b32 [%0], {%1,%2,%3,%4};" ::"l"(write_data), "f"(x[0]), "r"(magic), "f"(x[2]),
                       "r"(magic));

          asm volatile("st.global.wt.v4.b32 [%0], {%1,%2,%3,%4};" ::"l"(write_data + 4), "f"(x[1]), "r"(magic),
                       "f"(x[3]), "r"(magic));
        }

        // now each CTA (on each GPU) reads the data written by CTA 0 of the other GPU
        volatile float* read_data = &((reinterpret_cast<float*>(params_my_data))[data_offset]);

        float other[4];
        uint32_t other_flag_a, other_flag_b;
        do {
          asm volatile("ld.volatile.global.v4.b32 {%0, %1, %2, %3}, [%4];"
                       : "=f"(other[0]), "=r"(other_flag_a), "=f"(other[2]), "=r"(other_flag_b)
                       : "l"(read_data));
        } while ((other_flag_a != magic) || (other_flag_b != magic));

        do {
          asm volatile("ld.volatile.global.v4.b32 {%0, %1, %2, %3}, [%4];"
                       : "=f"(other[1]), "=r"(other_flag_a), "=f"(other[3]), "=r"(other_flag_b)
                       : "l"(read_data + 4));
        } while ((other_flag_a != magic) || (other_flag_b != magic));

        add(x, other);
      }
      // finally, after syncing up and accounting for partial sums from
      // other GPUs as required, write the result

      write_to_smem(smem, threadIdx.x, x);
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int THREADS_PER_CTA>
DEVICE_FUNCTION void parallel_sums_8x4(float* smem, float (&x)[4], int nhw) {
  // The size of a warp.
  const int THREADS_PER_WARP = 32;
  // The number of warps in a CTA.
  const int WARPS_PER_CTA = THREADS_PER_CTA / THREADS_PER_WARP;
  // The number of threads per pixel.
  const int THREADS_PER_PIXEL = 8;
  // The number of elements per ldg.
  const int ELEMENTS_PER_LDG = 4;
  // The warp decomposition.
  const int warp_id = threadIdx.x / THREADS_PER_WARP;
  const int lane_id = threadIdx.x % THREADS_PER_WARP;

#pragma unroll
  for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
    x[i] += __shfl_sync(0xffffffffU, x[i], THREADS_PER_PIXEL + lane_id);
    x[i] += __shfl_sync(0xffffffffU, x[i], THREADS_PER_PIXEL * 2 + lane_id);
  }

  // The warp leaders, write to SMEM.
  if (lane_id < THREADS_PER_PIXEL) {
    write_to_smem(smem, warp_id * THREADS_PER_PIXEL + lane_id, x);
  }

  // The data is in SMEM. Do the final reduction.
  __syncthreads();

  // The 1st warp does all the work.
  // We do the final reduction each half-warp sequentially reduces the final values.
  if (warp_id == 0) {
    read_from_smem(x, smem, threadIdx.x);

#pragma unroll
    for (int offset = 1; offset < WARPS_PER_CTA / (THREADS_PER_WARP / THREADS_PER_PIXEL); ++offset) {
      float y[ELEMENTS_PER_LDG];
      // Read the mean and variance from the other pixel.
      read_from_smem(y, smem, threadIdx.x + offset * THREADS_PER_WARP);
      // Compute the updated sum.
      add(x, y);
    }

    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      x[i] += __shfl_sync(0xffffffffU, x[i], THREADS_PER_PIXEL + lane_id);
      x[i] += __shfl_sync(0xffffffffU, x[i], THREADS_PER_PIXEL * 2 + lane_id);
    }

    // Make sure the data was read from SMEM.
    __syncwarp();

    // Store the final values.
    if (threadIdx.x < THREADS_PER_PIXEL) {
      write_to_smem(smem, threadIdx.x, x);
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int THREADS_PER_CTA, int THREADS_PER_PIXEL, int ELEMENTS_PER_LDG>
DEVICE_FUNCTION void parallel_sums(float* smem, float (&x)[ELEMENTS_PER_LDG], int nhw) {
  // The size of a warp.
  const int THREADS_PER_WARP = 32;
  // The number of warps in a CTA.
  const int WARPS_PER_CTA = THREADS_PER_CTA / THREADS_PER_WARP;
  // The number of pixels computed by a single warp.
  const int PIXELS_PER_WARP = THREADS_PER_WARP / THREADS_PER_PIXEL;

  // The position in the warp.
  const int nhw_in_warp = nhw % PIXELS_PER_WARP;
  // The C in the warp.
  const int c_in_warp = threadIdx.x % THREADS_PER_PIXEL;

  // Store the values to shared memory.
  write_to_smem(smem, threadIdx.x, x);

  // Compute the parallel sums.
  for (int offset = PIXELS_PER_WARP / 2; offset > 0; offset /= 2) {
    // NOP.
    __syncwarp();

    // Read the running sum from the other thread.
    float y[ELEMENTS_PER_LDG];
    if (nhw_in_warp < offset) {
      read_from_smem(y, smem, threadIdx.x + offset * THREADS_PER_PIXEL);
    }

    // Compute the updated sum.
    add(x, y);

    // NOP.
    __syncwarp();

    // Update the sum in SMEM.
    if (offset > 1 && nhw_in_warp < offset) {
      write_to_smem(smem, threadIdx.x, x);
    }
  }

  // The warps are done. Do the final reduction at the CTA level.
  __syncthreads();

  // The warp leaders, write to SMEM.
  const int idx = (threadIdx.x / THREADS_PER_WARP) * THREADS_PER_PIXEL + c_in_warp;
  if (nhw_in_warp == 0) {
    write_to_smem(smem, idx, x);
  }

  // The data is in SMEM. Do the final reduction.
  __syncthreads();

  // Read the 1st element to prepare the work.
  if (nhw < WARPS_PER_CTA / 2) {
    read_from_smem(x, smem, threadIdx.x);
  }

  // We have the running mean and running m2. Let's build the mean/var of the CTA.
  for (int offset = WARPS_PER_CTA / 2; offset > 0; offset /= 2) {
    // NOP.
    __syncwarp();

    // Read the mean and variance from the other pixel.
    float y[ELEMENTS_PER_LDG];
    if (nhw < offset) {
      read_from_smem(y, smem, threadIdx.x + offset * THREADS_PER_PIXEL);
    }

    // Compute the updated sum.
    add(x, y);

    // NOP.
    __syncwarp();

    // Store the mean/var for the different pixels.
    if (nhw < offset) {
      write_to_smem(smem, threadIdx.x, x);
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int THREADS_PER_PIXEL, int ELEMENTS_PER_LDG>
struct ParallelSums {
  template <int THREADS_PER_CTA>
  DEVICE_FUNCTION void dispatch(float* smem, float (&x)[ELEMENTS_PER_LDG], int nhw) {
    parallel_sums<THREADS_PER_CTA, THREADS_PER_PIXEL, ELEMENTS_PER_LDG>(smem, x, nhw);
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <>
struct ParallelSums<16, 4> {
  template <int THREADS_PER_CTA>
  DEVICE_FUNCTION void dispatch(float* smem, float (&x)[4], int nhw) {
    parallel_sums_16x2<THREADS_PER_CTA>(smem, x, nhw, 0, 0, 0, 0, 0);
  }

  template <int THREADS_PER_CTA>
  DEVICE_FUNCTION void dispatchX(float* smem, float (&x)[4], int nhw, void* params_my_data, void** params_pair_datas,
                                 int off, const int magic, const unsigned int& sync_iters) {
    parallel_sums_16x2<THREADS_PER_CTA>(smem, x, nhw, params_my_data, params_pair_datas, off, magic, sync_iters);
  }
};

template <>
struct ParallelSums<8, 4> {
  template <int THREADS_PER_CTA>
  DEVICE_FUNCTION void dispatch(float* smem, float (&x)[4], int nhw) {
    parallel_sums_8x4<THREADS_PER_CTA>(smem, x, nhw);
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////////////////////////

static inline int div_up(int m, int n) { return (m + n - 1) / n; }

////////////////////////////////////////////////////////////////////////////////////////////////////

// It is expected that all threads in the CTA enter this function!
DEVICE_FUNCTION void inter_block_sync(int* gmem_retired_ctas, int expected_count, bool master) {
  // Register the CTA.
  if (threadIdx.x == 0) {
    // Issue the membar.
    __threadfence();
    // Notify that the CTA is done.
    int val_to_add = 1;
    if (master) {
      val_to_add = -(expected_count - 1);
    }
    atomicAdd(gmem_retired_ctas, val_to_add);
  }

  // Are all CTAs done?
  if (threadIdx.x == 0) {
    int retired_ctas = -1;
    do {
      __threadfence();
      asm volatile("ld.global.cg.b32 %0, [%1];" : "=r"(retired_ctas) : "l"(gmem_retired_ctas));
    } while (retired_ctas != 0);
  }
  __syncthreads();
}

////////////////////////////////////////////////////////////////////////////////////////////////////

struct NhwcBatchNormFwdInferenceParams {
  // The input/output tensors.
  uint16_t *gmem_src, *gmem_dst, *gmem_src1;
  // the final mean and variance as calculated during the training process
  float *gmem_mean, *gmem_var;
  // The bias/scale.
  float *gmem_bias, *gmem_scale;
  // The dimensions.
  int nhw, c;
  // epsilon
  float var_eps;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

// No DESIRED_OCCUPANCY launch bounds needed, as this is not launched cooperatively
template <typename Storage, int THREADS_PER_CTA, int THREADS_PER_PIXEL, int ELEMENTS_PER_LDG, bool USE_RELU,
          bool USE_ADD_RELU>
__global__ __launch_bounds__(THREADS_PER_CTA) void nhwc_batch_norm_fwd_inference(
    NhwcBatchNormFwdInferenceParams params) {
  // The number of pixels loaded in a single LDG.
  const int PIXELS_PER_LDG = THREADS_PER_CTA / THREADS_PER_PIXEL;
  // The number of C elements per CTA.
  const int C_ELEMENTS_PER_CTA = THREADS_PER_PIXEL * ELEMENTS_PER_LDG;

  // The start position in the NHW dimension where the CTA starts.
  const int cta_nhw_stride = gridDim.x * PIXELS_PER_LDG;
  // Compute the NHW coordinate of the thread in the CTA.
  const int thread_in_cta_nhw = threadIdx.x / THREADS_PER_PIXEL;
  // thread's starting point in NHW
  const int thread_nhw = thread_in_cta_nhw + blockIdx.x * PIXELS_PER_LDG;

  // The position in the C dimension where the CTA starts.
  const int cta_c = blockIdx.y * C_ELEMENTS_PER_CTA;
  // Compute the C coordinate of the thread in the CTA.
  const int thread_in_cta_c = threadIdx.x % THREADS_PER_PIXEL;
  // Compute the C coordinate of the thread.
  const int thread_c = cta_c + thread_in_cta_c * ELEMENTS_PER_LDG;

  // Is the thread working on a valid C dimension?
  const int is_valid_c = thread_c < params.c;

  float mean[ELEMENTS_PER_LDG], var[ELEMENTS_PER_LDG];
  float scale[ELEMENTS_PER_LDG], bias[ELEMENTS_PER_LDG];
  zero_array(mean);
  zero_array(var);
  zero_array(scale);
  zero_array(bias);
  if (is_valid_c) {
    read_from_gmem(var, &params.gmem_var[cta_c], thread_in_cta_c);
    read_from_gmem(scale, &params.gmem_scale[cta_c], thread_in_cta_c);
    read_from_gmem(mean, &params.gmem_mean[cta_c], thread_in_cta_c);
    read_from_gmem(bias, &params.gmem_bias[cta_c], thread_in_cta_c);
  }

// Update the scale with the stddev and eps.
#pragma unroll
  for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
    scale[i] *= rsqrtf(var[i] + params.var_eps);
  }

  // The base pointers for reading/writing
  uint16_t* const gmem_src = &params.gmem_src[thread_c];
  uint16_t* const gmem_dst = &params.gmem_dst[thread_c];
  const uint16_t* gmem_src1 = nullptr;
  if (USE_ADD_RELU) {
    gmem_src1 = &params.gmem_src1[thread_c];
  }

  // apply BN
  for (int nhw = thread_nhw; nhw < params.nhw; nhw += cta_nhw_stride) {
    float x_math[ELEMENTS_PER_LDG];
    zero_array(x_math);
    if (is_valid_c) {
      ldg(x_math, &gmem_src[nhw * params.c]);
    }

    // Normalize and apply activation function
    normalize(x_math, bias, scale, mean);
    if (USE_ADD_RELU) {
      float x1_math[ELEMENTS_PER_LDG];
      ldg(x1_math, &gmem_src1[nhw * params.c]);
      add(x_math, x1_math);
      relu_activation(x_math);
    } else if (USE_RELU) {
      relu_activation(x_math);
    }

    if (is_valid_c) {
      stg(&gmem_dst[nhw * params.c], x_math);
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

struct NhwcBatchNormFwdParams {
  // The input/output tensors.
  uint16_t *gmem_src, *gmem_dst, *gmem_src1;
  // The bias/scale.
  float *gmem_bias, *gmem_scale;
  // running mean/var (refer BN API from cudnn doc)
  float *gmem_running_mean, *gmem_running_var;
  // saved mean/var (refer BN API from cudnn doc)
  float *gmem_saved_mean, *gmem_saved_var;
  // ReLU bitmask
  unsigned int* gmem_relu_bitmask;
  // The dimensions.
  int nhw, c;
  // factor to scale sum of squared errors to get saved variance.  Must be 1/nhw.
  float svar_inv_count;
  // factor to scale sum of squared errors to get running variance. Should be 1/nhw or 1/(nhw-1).
  float rvar_inv_count;
  // The buffer to do the reduction for mean, stddev and count.
  float* gmem_sums;
  // The buffer to count items in the different CTAs.
  int* gmem_counts;
  // The counters of retired CTAs.
  int* gmem_retired_ctas;
  // The epsilon to apply to the computation of the variance.
  float var_eps;
  // outer loop count
  int outer_loops;
  // exponential average factor
  float exp_avg_factor;
  // number of CTAs along .x dimension
  int c_blks;

  void* my_data;
  void* pair_datas[4];
  int magic;
  int sync_iters;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Storage, int THREADS_PER_CTA, int THREADS_PER_PIXEL, int PIXELS_PER_THREAD_IN_REGISTERS,
          int PIXELS_PER_THREAD_IN_SMEM, int ELEMENTS_PER_LDG, int USE_ONLINE_APPROACH, int OUTER_LOOPS_, bool USE_RELU,
          bool USE_ADD_RELU, int DESIRED_OCCUPANCY>
__global__ __launch_bounds__(THREADS_PER_CTA,
                             DESIRED_OCCUPANCY) void nhwc_batch_norm_fwd(NhwcBatchNormFwdParams params) {
  // The number of pixels loaded in a single LDG.
  const int PIXELS_PER_LDG = THREADS_PER_CTA / THREADS_PER_PIXEL;
  // The number of pixels computed per CTA stored in registers.
  const int PIXELS_PER_CTA_IN_REGISTERS = PIXELS_PER_THREAD_IN_REGISTERS * PIXELS_PER_LDG;
  // The number of pixels computed per CTA stored in SMEM.
  const int PIXELS_PER_CTA_IN_SMEM = PIXELS_PER_THREAD_IN_SMEM * PIXELS_PER_LDG;
  // The number of C elements per CTA.
  const int C_ELEMENTS_PER_CTA = THREADS_PER_PIXEL * ELEMENTS_PER_LDG;

  // Shared memory to do CTA-wide parallel sums.
  __shared__ float smem[THREADS_PER_PIXEL * (THREADS_PER_CTA / 32) * ELEMENTS_PER_LDG];

  // Compute the NHW coordinate of the thread in the CTA.
  const int thread_in_cta_nhw = threadIdx.x / THREADS_PER_PIXEL;

  // The adapter for the storage.
  typedef PackedStorage<Storage, ELEMENTS_PER_LDG> PackedStorage_;
  // The data type for packed storage in SMEM.
  typedef typename PackedStorage_::Type PackedStorageType;
  // The number of elements in the packed storage.
  const int PACKED_ELEMENTS_PER_LDG = PackedStorage_::PACKED_ELEMENTS_PER_LDG;
  // Registers to keep the data live for the persistent approach.
  PackedStorageType x_storage[PIXELS_PER_THREAD_IN_REGISTERS][PACKED_ELEMENTS_PER_LDG];

  // Shared memory buffer to store the extra pixels.
  extern __shared__ PackedStorageType smem_storage_packed[];

  for (int c_blk_index = blockIdx.y; c_blk_index < params.c_blks; c_blk_index += gridDim.y) {
    // The position in the NHW dimension where the CTA starts.
    int cta_nhw_regs = blockIdx.x * PIXELS_PER_CTA_IN_REGISTERS;
    // The position in the NHW dimension where the CTA starts for the portion in SMEM.
    int cta_nhw_smem = blockIdx.x * PIXELS_PER_CTA_IN_SMEM;

    // The position in the C dimension where the CTA starts.
    const int cta_c = c_blk_index * C_ELEMENTS_PER_CTA;
    // Compute the C coordinate of the thread in the CTA.
    const int thread_in_cta_c = threadIdx.x % THREADS_PER_PIXEL;
    // Compute the C coordinate of the thread.
    int thread_c = cta_c + thread_in_cta_c * ELEMENTS_PER_LDG;

    // Is the thread working on a valid C dimension?
    const int is_valid_c = thread_c < params.c;

    // Clamp thread_c so that we load from valid locations even if we don't use the value
    if (!is_valid_c) thread_c = params.c - 4;

    // Single pass numerically stable algorithm, see:
    // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    //
    // n = 0, mean = 0.0, M2 = 0.0
    //
    // for x in data:
    //     n += 1
    //     delta = x - mean
    //     mean += delta/n
    //     delta2 = x - mean
    //     M2 += delta*delta2
    //
    // if n < 2:
    //     return float('nan')
    // else:
    //     return M2 / (n - 1)

    // Register to store the number of elements read so far.
    float count = 0.f, mean[ELEMENTS_PER_LDG], m2[ELEMENTS_PER_LDG];
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      mean[i] = 0.f;
      m2[i] = 0.f;
    }

    // The number of elements loaded by this CTA.
    int cta_count = 0;
    // The base pointer to load from.
    const uint16_t* gmem_src = &params.gmem_src[thread_c];

    // outer loops
    int OUTER_LOOPS = OUTER_LOOPS_ == 1 ? 1 : params.outer_loops;
    // Load the batch of elements. Compute the mean/var across those elements.
    const int pixels_per_iteration = PIXELS_PER_CTA_IN_REGISTERS * gridDim.x;

    if (OUTER_LOOPS_ != 1) {
      // We cannot load everything to store persistently, so let's makes sure registers and
      // smem are fully utilized, offset is evenly divisible by 32
      int offset = (pixels_per_iteration * OUTER_LOOPS + PIXELS_PER_CTA_IN_SMEM * gridDim.x - params.nhw) & ~31;
      cta_nhw_regs -= offset;
      cta_nhw_smem -= offset;
    }

#pragma unroll 1
    for (int loop_i = 0; loop_i < OUTER_LOOPS; ++loop_i) {
      // The nhw position.
      int nhw_regs = cta_nhw_regs + loop_i * pixels_per_iteration;
      // Update the number of elements loaded by this CTA. TODO: Skip if <= 0!!!
      cta_count += max(min(nhw_regs + PIXELS_PER_CTA_IN_REGISTERS, params.nhw) - max(nhw_regs, 0), 0);

      // Load the data and compute the local mean/sum and the variance.
      if (USE_ONLINE_APPROACH) {
        // Read the elements from memory.
        float is_valid[PIXELS_PER_THREAD_IN_REGISTERS];
#pragma unroll
        for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
          const int idx = nhw_regs + thread_in_cta_nhw + i * PIXELS_PER_LDG;
          zero_array(x_storage[i]);
          is_valid[i] = 0.f;
          if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
            if (loop_i == OUTER_LOOPS - 1) {
              ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
            } else {
              ldg(x_storage[i], &gmem_src[idx * params.c]);
            }
            is_valid[i] = 1.f;
          }
        }

// Do the math.
#pragma unroll
        for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
          // Convert to float.
          float x_math[ELEMENTS_PER_LDG];
          to_float(x_math, x_storage[i]);

          // Update the count.
          count += is_valid[i];
          // Invert the count.
          float inv_count = is_valid[i] ? 1.f / count : 0.f;

// Update the mean and m2 using deltas.
#pragma unroll
          for (int j = 0; j < ELEMENTS_PER_LDG; ++j) {
            float delta0 = x_math[j] - mean[j];
            mean[j] += delta0 * inv_count;
            float delta1 = x_math[j] - mean[j];
            m2[j] += delta0 * delta1 * is_valid[i];
          }
        }
      } else {
// Read the elements from memory.
#pragma unroll
        for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
          const int idx = nhw_regs + thread_in_cta_nhw + i * PIXELS_PER_LDG;
          zero_array(x_storage[i]);
          if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
            if (loop_i == OUTER_LOOPS - 1) {
              ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
            } else {
              ldg(x_storage[i], &gmem_src[idx * params.c]);
            }
            count += 1.f;
          }
        }

// Sum the elements in registers.
#pragma unroll
        for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
          // Convert to float.
          float x_math[ELEMENTS_PER_LDG];
          to_float(x_math, x_storage[i]);

// Update the mean and m2 using deltas.
#pragma unroll
          for (int j = 0; j < ELEMENTS_PER_LDG; ++j) {
            mean[j] += x_math[j];
          }
        }

        // Compute the mean.
        float inv_count = 1.f / count;
#pragma unroll
        for (int j = 0; j < ELEMENTS_PER_LDG; ++j) {
          mean[j] *= inv_count;
        }

// Compute the variance.
#pragma unroll
        for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
          // Convert to float.
          float x_math[ELEMENTS_PER_LDG];
          to_float(x_math, x_storage[i]);

          // Is it a valid pixel?
          float is_valid = i < static_cast<int>(count) ? 1.f : 0.f;
// Update the mean and m2 using deltas.
#pragma unroll
          for (int j = 0; j < ELEMENTS_PER_LDG; ++j) {
            m2[j] += (x_math[j] - mean[j]) * (x_math[j] - mean[j]) * is_valid;
          }
        }
      }
    }

    // The elements to load and store in SMEM.
    int smem_nhw = OUTER_LOOPS * pixels_per_iteration + cta_nhw_smem;
    // Load elements from SMEM, update the CTA count.
    int pixels_in_smem = min(smem_nhw + PIXELS_PER_CTA_IN_SMEM, params.nhw) - max(smem_nhw, 0);
    if (pixels_in_smem > 0) {
      cta_count += pixels_in_smem;
      for (int i = 0; i < PIXELS_PER_THREAD_IN_SMEM; ++i) {
        const int idx = smem_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        float is_pixel_valid = (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) ? 1.f : 0.f;

        PackedStorageType x_storage_local[PACKED_ELEMENTS_PER_LDG];
        ldg_stream(x_storage_local, &gmem_src[(is_pixel_valid ? idx : 0) * params.c]);

        // The offset to store in SMEM.
        const int offset = i * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
        // Store in SMEM.
        write_to_smem(&smem_storage_packed[offset], threadIdx.x, x_storage_local);
        // Update the count.
        count += is_pixel_valid;
        // Invert the count.
        float inv_count = is_pixel_valid ? 1.f / count : 0.f;

        float x_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage_local);
// Update the mean and m2 using deltas.
#pragma unroll
        for (int j = 0; j < ELEMENTS_PER_LDG; ++j) {
          float delta0 = x_math[j] - mean[j];
          mean[j] += delta0 * inv_count;
          float delta1 = x_math[j] - mean[j];
          m2[j] += delta0 * delta1 * is_pixel_valid;
        }
      }
    }

    // We scale the mean by the number of elements. It brings more stability.
    float m1[ELEMENTS_PER_LDG];
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      m1[i] = mean[i] * count;
    }

    // Run the parallel sum accross the CTA to get the local sum.
    ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, m1, thread_in_cta_nhw);
    __syncthreads();

    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(m1, smem, thread_in_cta_c);
    __syncthreads();

    // Adjust the variance.
    float inv_cta_count = 1.f / static_cast<float>(cta_count);
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      float mean_diff = m1[i] * inv_cta_count - mean[i];
      m2[i] = m2[i] + mean_diff * mean_diff * count;
    }

    // Run the parallel sum accross the CTA to get the local adjusted variance.
    ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, m2, thread_in_cta_nhw);

    // The workspace in global memory is distributed across the different CTA.
    int gmem_sums_offset = c_blk_index * gridDim.x * C_ELEMENTS_PER_CTA * 2;

    // Write the data for the CTA to global memory.
    float* gmem_sums = &params.gmem_sums[gmem_sums_offset];
    if (threadIdx.x < THREADS_PER_PIXEL) {
      const int idx = blockIdx.x * THREADS_PER_PIXEL + threadIdx.x;
      write_to_gmem(&gmem_sums[0], idx, m1);
      write_to_gmem(&gmem_sums[C_ELEMENTS_PER_CTA * gridDim.x], idx, m2);
    }

    // The memory location to store the number of pixels per CTA.
    int* gmem_counts = &params.gmem_counts[c_blk_index * gridDim.x];
    if (threadIdx.x == 0) {
      gmem_counts[blockIdx.x] = cta_count;
    }

    // Read the bias and scale.
    float bias[ELEMENTS_PER_LDG], scale[ELEMENTS_PER_LDG];
    if (is_valid_c) {
      read_from_gmem(bias, &params.gmem_bias[cta_c], thread_in_cta_c);
      read_from_gmem(scale, &params.gmem_scale[cta_c], thread_in_cta_c);
    }

    // The counters to count how many CTAs have retired at this point.
    // A given cta uses the same counter every other time through the outer loop.
    int* gmem_retired_ctas = &params.gmem_retired_ctas[c_blk_index % (2 * gridDim.y)];
    inter_block_sync(gmem_retired_ctas, gridDim.x, blockIdx.x == 0);

// Reset the mean to compute the global mean.
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      m1[i] = 0.f;
    }

// Build the global mean.
#pragma unroll 1
    for (int idx = threadIdx.x; idx < THREADS_PER_PIXEL * gridDim.x; idx += THREADS_PER_CTA) {
      float tmp[ELEMENTS_PER_LDG];
      read_from_gmem(tmp, gmem_sums, idx);
      add(m1, tmp);
    }

    if (params.sync_iters > 0) {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatchX<THREADS_PER_CTA>(
          smem, m1, thread_in_cta_nhw, params.my_data, params.pair_datas, 4 * c_blk_index + 3, params.magic,
          params.sync_iters);
    } else {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, m1, thread_in_cta_nhw);
    }
    __syncthreads();

    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(m1, smem, thread_in_cta_c);
    __syncthreads();

// Normalize the mean.
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      m1[i] = m1[i] * params.svar_inv_count;
    }

// Reset the variance.
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      m2[i] = 0.f;
    }

    // for add+relu fusion
    const uint16_t* gmem_src1 = nullptr;
    if (USE_ADD_RELU) {
      gmem_src1 = &params.gmem_src1[thread_c];
    }

// Build the global variance.
#pragma unroll 1
    for (int idx = threadIdx.x; idx < THREADS_PER_PIXEL * gridDim.x; idx += THREADS_PER_CTA) {
      // Read the means computed by different CTAs (again). Reuse tmp if we have 1 iteration.
      float tmp_mean[ELEMENTS_PER_LDG], tmp_var[ELEMENTS_PER_LDG];
      read_from_gmem(tmp_mean, &gmem_sums[0], idx);
      read_from_gmem(tmp_var, &gmem_sums[C_ELEMENTS_PER_CTA * gridDim.x], idx);

      // Read the number of pixels visited by a given CTA.
      cta_count = __ldg(&gmem_counts[idx / THREADS_PER_PIXEL]);

      // Compute the diff to update the variance.
      float mean_diff[ELEMENTS_PER_LDG], inv_cta_count = 1.f / static_cast<float>(cta_count);
#pragma unroll
      for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
        mean_diff[i] = m1[i] - tmp_mean[i] * inv_cta_count;
      }

// Update the variance.
#pragma unroll
      for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
        m2[i] += tmp_var[i] + mean_diff[i] * mean_diff[i] * static_cast<float>(cta_count);
      }
    }

    if (params.sync_iters > 0) {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatchX<THREADS_PER_CTA>(
          smem, m2, thread_in_cta_nhw, params.my_data, params.pair_datas, 4 * c_blk_index + 2, params.magic,
          params.sync_iters);
    } else {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, m2, thread_in_cta_nhw);
    }
    __syncthreads();

    read_from_smem(m2, smem, thread_in_cta_c);

    // Finalize the stddev.
    // becasue saved var and running var may have different denominator, we don't do it here
    // scale_(m2, inv_count);

    // store the saved mean/var
    float svarinv[ELEMENTS_PER_LDG];
    bool is_valid_for_saving = is_valid_c && blockIdx.x == 0 && thread_in_cta_nhw == 0;
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      svarinv[i] = rsqrtf(m2[i] * params.svar_inv_count + params.var_eps);
    }
    if (is_valid_for_saving) {
      write_to_gmem(params.gmem_saved_mean, thread_c / ELEMENTS_PER_LDG, m1);
      write_to_gmem(params.gmem_saved_var, thread_c / ELEMENTS_PER_LDG, svarinv);
    }

    // store the running mean/var
    float rmean[ELEMENTS_PER_LDG], rvar[ELEMENTS_PER_LDG];
    zero_array(rmean);
    zero_array(rvar);
    if (params.exp_avg_factor != 1.f && is_valid_for_saving) {
      read_from_gmem(rmean, params.gmem_running_mean, thread_c / ELEMENTS_PER_LDG);
      read_from_gmem(rvar, params.gmem_running_var, thread_c / ELEMENTS_PER_LDG);
    }
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      rmean[i] = (1.f - params.exp_avg_factor) * rmean[i] + params.exp_avg_factor * m1[i];
      rvar[i] = (1.f - params.exp_avg_factor) * rvar[i] + params.exp_avg_factor * (m2[i] * params.rvar_inv_count);
    }
    if (is_valid_for_saving) {
      write_to_gmem(params.gmem_running_mean, thread_c / ELEMENTS_PER_LDG, rmean);
      write_to_gmem(params.gmem_running_var, thread_c / ELEMENTS_PER_LDG, rvar);
    }

    // Update the scale with the stddev and eps.
    multiply(scale, svarinv);

    // The base pointer to write to.
    uint16_t* const gmem_dst = &params.gmem_dst[thread_c];

    unsigned int* const gmem_relu_bitmask = params.gmem_relu_bitmask + ((params.nhw + 31) & ~31) * 2 * c_blk_index;

// Store the elements in registers.
#pragma unroll 1
    for (int loop_i = OUTER_LOOPS - 1; loop_i >= 0; --loop_i) {
      // The value for nhw.
      int out_nhw = cta_nhw_regs + loop_i * pixels_per_iteration;

// Normalize the elements and write to memory.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = out_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        const bool is_valid_nhw = static_cast<unsigned int>(idx) < static_cast<unsigned int>(params.nhw);
        const bool is_valid = is_valid_nhw && is_valid_c;
        // Convert to float.
        float x_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage[i]);

        // Normalize and apply activation function
        normalize(x_math, bias, scale, m1);
        if (USE_ADD_RELU) {
          float x1_math[ELEMENTS_PER_LDG];
          ldg_stream(x1_math, &gmem_src1[(is_valid ? idx : 0) * params.c]);
          add(x_math, x1_math);
          unsigned int relu_mask;
          int lane_id = threadIdx.x & 31;
#pragma unroll
          for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
            bool rectified = x_math[i] < 0.0F;
            unsigned int local_relu_mask = __ballot_sync(0xFFFFFFFFU, rectified);
            if (lane_id == i) {
              // Thread 0 remembers the relu_mask from the first time through this
              // loop, Thread 1 the next, Thread 2 the next, and Thread 3 the last.
              relu_mask = local_relu_mask;
            }
            if (rectified) {
              x_math[i] = 0.0F;
            }
          }
          if (is_valid_nhw && (lane_id < ELEMENTS_PER_LDG)) {
            gmem_relu_bitmask[idx * 2 + lane_id] = relu_mask;
          }
        } else if (USE_RELU) {
          relu_activation(x_math);
        }

        // Write back.
        if (is_valid) {
          stg_stream(&gmem_dst[idx * params.c], x_math);
        }
      }

      // The next value of nhw.
      out_nhw -= pixels_per_iteration;

// Read the next elements from memory.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = out_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
          ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
        }
      }
    }

    // Normalize the elements from SMEM and write them out.
    if (pixels_in_smem > 0) {
#pragma unroll 2
      for (int i = 0; i < PIXELS_PER_THREAD_IN_SMEM; ++i) {
        const int idx = smem_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        const bool is_valid_nhw = static_cast<unsigned int>(idx) < static_cast<unsigned int>(params.nhw);
        const bool is_valid = is_valid_nhw && is_valid_c;

        // Read from SMEM.
        const int offset = i * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
        PackedStorageType x_storage_local[PACKED_ELEMENTS_PER_LDG];
        read_from_smem(x_storage_local, &smem_storage_packed[offset], threadIdx.x);
        float x_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage_local);

        // Normalize and apply activation function
        normalize(x_math, bias, scale, m1);
        if (USE_ADD_RELU) {
          float x1_math[ELEMENTS_PER_LDG];
          ldg_stream(x1_math, &gmem_src1[(is_valid ? idx : 0) * params.c]);
          add(x_math, x1_math);
          unsigned int relu_mask;
          int lane_id = threadIdx.x & 31;
#pragma unroll
          for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
            bool rectified = x_math[i] < 0.0F;
            unsigned int local_relu_mask = __ballot_sync(0xFFFFFFFFU, rectified);
            if (lane_id == i) {
              relu_mask = local_relu_mask;
            }
            if (rectified) {
              x_math[i] = 0.0F;
            }
          }
          if (is_valid_nhw && (lane_id < ELEMENTS_PER_LDG)) {
            gmem_relu_bitmask[idx * 2 + lane_id] = relu_mask;
          }
        } else if (USE_RELU) {
          relu_activation(x_math);
        }

        // Write back.
        if (is_valid) {
          stg_stream(&gmem_dst[idx * params.c], x_math);
        }
      }
    }
    // We're about to start on the next c-blk.  Needed?
    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

struct NhwcBatchNormBwdParams {
  // The input/output tensors.
  uint16_t *gmem_src, *gmem_dy, *gmem_dst, *gmem_dst1;
  // dscale/dbias
  float *gmem_dscale, *gmem_dbias;
  // The scale and bias.
  float *gmem_scale, *gmem_bias;
  // The mean/inv-var saved from fwd pass
  float *gmem_saved_mean, *gmem_saved_var;
  // ReLU bitmask
  unsigned int* gmem_relu_bitmask;
  // The dimensions.
  int nhw, c;
  // factor to scale sum of squared errors to get saved variance.  Must be 1/nhw.
  float svar_inv_count;
  // The buffer to do the reduction for dscale and dbias
  float* gmem_sums;
  // The counters of retired CTAs.
  int* gmem_retired_ctas;
  // outer loop count
  int outer_loops;
  // number of CTAs along .x dimension
  int c_blks;

  void* my_data;
  void* pair_datas[4];
  int magic;
  int sync_iters;
  float wgrad_coeff;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void relu_bwd(float (&dy)[N], const float (&x)[N], const float (&mean_var_scale_bias)[N],
                              const float (&var_scale)[N], bool valid_data) {
#pragma unroll
  for (int j = 0; j < N; ++j) {
    float y = (x[j] * var_scale[j]) + mean_var_scale_bias[j];
    if ((y <= 0.f) && valid_data) {
      dy[j] = 0.f;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void relu_bwd(float (&dy)[N], const float (&y)[N], bool valid_data) {
#pragma unroll
  for (int j = 0; j < N; ++j) {
    if ((y[j] <= 0.f) && valid_data) {
      dy[j] = 0.f;
    }
  }
}

template <int N>
DEVICE_FUNCTION void relu_bwd(float (&dy)[N], const bool (&rectified)[N], bool valid_data) {
#pragma unroll
  for (int j = 0; j < N; ++j) {
    if (rectified[j] && valid_data) {
      dy[j] = 0.f;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void relu_bwd_for_dx(float (&dy)[N], const float (&x)[N], const float (&mean_var_scale_bias)[N],
                                     const float (&var_scale)[N]) {
#pragma unroll
  for (int j = 0; j < N; ++j) {
    float y = (x[j] * var_scale[j]) + mean_var_scale_bias[j];
    if (y <= 0.f) {
      dy[j] = 0.f;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void relu_bwd_for_dx(float (&dy)[N], const float (&y)[N]) {
#pragma unroll
  for (int j = 0; j < N; ++j) {
    if (y[j] <= 0.f) {
      dy[j] = 0.f;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void bwd_update(float (&dscale)[N], float (&dbias)[N], const float (&dy)[N], const float (&x)[N],
                                const float (&mean)[N], float inv_count) {
#pragma unroll
  for (int j = 0; j < N; ++j) {
    float delta0 = dy[j] - dbias[j];
    dbias[j] += delta0 * inv_count;
    delta0 = (dy[j] * (x[j] - mean[j])) - dscale[j];
    dscale[j] += delta0 * inv_count;
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int N>
DEVICE_FUNCTION void bwd_dx(float (&dx)[N], const float (&dy)[N], const float (&var)[N], const float (&x)[N],
                            const float (&mean)[N], const float (&dscale)[N], const float (&dbias)[N],
                            float inv_count) {
#pragma unroll
  for (int j = 0; j < N; ++j) {
    float tmp1 = dy[j] - (dbias[j] * inv_count);
    float tmp2 = dscale[j] * inv_count;
    float tmp3 = x[j] - mean[j];
    dx[j] = var[j] * (tmp1 - (tmp2 * tmp3));
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Storage, int THREADS_PER_CTA, int THREADS_PER_PIXEL, int PIXELS_PER_THREAD_IN_REGISTERS,
          int PIXELS_PER_THREAD_IN_SMEM, int ELEMENTS_PER_LDG, int USE_ONLINE_APPROACH, int OUTER_LOOPS_,
          int DESIRED_OCCUPANCY>
__global__ __launch_bounds__(THREADS_PER_CTA,
                             DESIRED_OCCUPANCY) void nhwc_batch_norm_bwd(NhwcBatchNormBwdParams params) {
  // The number of pixels loaded in a single LDG.
  const int PIXELS_PER_LDG = THREADS_PER_CTA / THREADS_PER_PIXEL;
  // The number of pixels computed per CTA stored in registers.
  const int PIXELS_PER_CTA_IN_REGISTERS = PIXELS_PER_THREAD_IN_REGISTERS * PIXELS_PER_LDG;
  // The number of pixels computed per CTA stored in SMEM.
  const int PIXELS_PER_CTA_IN_SMEM = PIXELS_PER_THREAD_IN_SMEM * PIXELS_PER_LDG;
  // The number of C elements per CTA.
  const int C_ELEMENTS_PER_CTA = THREADS_PER_PIXEL * ELEMENTS_PER_LDG;

  // Shared memory to do CTA-wide parallel sums.
  __shared__ float smem[THREADS_PER_PIXEL * (THREADS_PER_CTA / 32) * ELEMENTS_PER_LDG];

  // The adapter for the storage.
  typedef PackedStorage<Storage, ELEMENTS_PER_LDG> PackedStorage_;
  // The data type for packed storage in SMEM.
  typedef typename PackedStorage_::Type PackedStorageType;
  // The number of elements in the packed storage.
  const int PACKED_ELEMENTS_PER_LDG = PackedStorage_::PACKED_ELEMENTS_PER_LDG;
  // Registers to keep the data live for the persistent approach.
  PackedStorageType x_storage[PIXELS_PER_THREAD_IN_REGISTERS][PACKED_ELEMENTS_PER_LDG];
  PackedStorageType dy_storage[PIXELS_PER_THREAD_IN_REGISTERS][PACKED_ELEMENTS_PER_LDG];

  // Shared memory buffer to store the extra pixels.
  extern __shared__ PackedStorageType smem_storage_packed[];

  for (int c_blk_index = blockIdx.y; c_blk_index < params.c_blks; c_blk_index += gridDim.y) {
    // The position in the NHW dimension where the CTA starts.
    int cta_nhw_regs = blockIdx.x * PIXELS_PER_CTA_IN_REGISTERS;
    // The position in the NHW dimension where the CTA starts for the portion in SMEM.
    int cta_nhw_smem = blockIdx.x * PIXELS_PER_CTA_IN_SMEM;
    // Compute the NHW coordinate of the thread in the CTA.
    const int thread_in_cta_nhw = threadIdx.x / THREADS_PER_PIXEL;

    // The position in the C dimension where the CTA starts.
    const int cta_c = c_blk_index * C_ELEMENTS_PER_CTA;
    // Compute the C coordinate of the thread in the CTA.
    const int thread_in_cta_c = threadIdx.x % THREADS_PER_PIXEL;
    // Compute the C coordinate of the thread.
    const int thread_c = cta_c + thread_in_cta_c * ELEMENTS_PER_LDG;

    // Is the thread working on a valid C dimension?
    const int is_valid_c = thread_c < params.c;

    // Registers to store the mean used for entire duration
    float mean[ELEMENTS_PER_LDG];
    zero_array(mean);
    if (is_valid_c) {
      read_from_gmem(mean, params.gmem_saved_mean, thread_c / ELEMENTS_PER_LDG);
    }

    // accumulation related registers
    float count = 0.f, dscale[ELEMENTS_PER_LDG], dbias[ELEMENTS_PER_LDG];
    zero_array(dscale);
    zero_array(dbias);

    // The number of elements loaded by this CTA.
    int cta_count = 0;
    // The base pointers to load from.
    const uint16_t* gmem_src = &params.gmem_src[thread_c];
    const uint16_t* gmem_dy = &params.gmem_dy[thread_c];

    // outer loops
    int OUTER_LOOPS = OUTER_LOOPS_ == 1 ? 1 : params.outer_loops;
    // Load the batch of elements. Compute sum across them
    const int pixels_per_iteration = PIXELS_PER_CTA_IN_REGISTERS * gridDim.x;

    if (OUTER_LOOPS_ != 1) {
      // We cannot load everything to store persistently, so let's makes sure registers and
      // smem are fully utilized
      int offset = params.nhw - pixels_per_iteration * OUTER_LOOPS - PIXELS_PER_CTA_IN_SMEM * gridDim.x;
      cta_nhw_regs += offset;
      cta_nhw_smem += offset;
    }

#pragma unroll 1
    for (int loop_i = 0; loop_i < OUTER_LOOPS; ++loop_i) {
      // The nhw position.
      int nhw_regs = cta_nhw_regs + loop_i * pixels_per_iteration;
      // Update the number of elements loaded by this CTA. TODO: Skip if <= 0!!!
      cta_count += max(0, min(PIXELS_PER_CTA_IN_REGISTERS, params.nhw - nhw_regs));

      // Read the elements from memory.
      float is_valid[PIXELS_PER_THREAD_IN_REGISTERS];
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = nhw_regs + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        zero_array(x_storage[i]);
        zero_array(dy_storage[i]);
        is_valid[i] = 0.f;
        if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
          if (loop_i == OUTER_LOOPS - 1) {
            ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
            ldg_stream(dy_storage[i], &gmem_dy[idx * params.c]);
          } else {
            ldg(x_storage[i], &gmem_src[idx * params.c]);
            ldg(dy_storage[i], &gmem_dy[idx * params.c]);
          }
          is_valid[i] = 1.f;
        }
      }

// Do the math.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        // Convert to float and update
        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage[i]);
        to_float(dy_math, dy_storage[i]);

        // Update the count.
        count += is_valid[i];
        // Invert the count.
        float inv_count = is_valid[i] ? 1.f / count : 0.f;

        bwd_update(dscale, dbias, dy_math, x_math, mean, inv_count);
      }
    }

    // The elements to load and store in SMEM.
    int smem_nhw = OUTER_LOOPS * pixels_per_iteration + cta_nhw_smem;
    // Load elements from SMEM, update the CTA count.
    int pixels_in_smem = min(PIXELS_PER_CTA_IN_SMEM, params.nhw - smem_nhw);
    if (pixels_in_smem > 0) {
      cta_count += pixels_in_smem;
      for (int i = 0; i < PIXELS_PER_THREAD_IN_SMEM; ++i) {
        const int idx = smem_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        bool is_pixel_valid = (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c);
        PackedStorageType x_storage_local[PACKED_ELEMENTS_PER_LDG], dy_storage_local[PACKED_ELEMENTS_PER_LDG];
        zero_array(x_storage_local);
        zero_array(dy_storage_local);
        if (is_pixel_valid) {
          ldg_stream(x_storage_local, &gmem_src[idx * params.c]);
          ldg_stream(dy_storage_local, &gmem_dy[idx * params.c]);
        }

        // The offset to store in SMEM.
        int offset = i * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
        // Store in SMEM.
        write_to_smem(&smem_storage_packed[offset], threadIdx.x, x_storage_local);
        offset += PIXELS_PER_THREAD_IN_SMEM * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
        write_to_smem(&smem_storage_packed[offset], threadIdx.x, dy_storage_local);
        // Update the count.
        count += is_pixel_valid;
        // Invert the count.
        float inv_count = is_pixel_valid ? 1.f / count : 0.f;

        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage_local);
        to_float(dy_math, dy_storage_local);

        bwd_update(dscale, dbias, dy_math, x_math, mean, inv_count);
      }
    }

// We scale the mean by the number of elements. It brings more stability.
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      dbias[i] *= count;
      dscale[i] *= count;
    }

    // dscale parallel sum
    ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dscale, thread_in_cta_nhw);
    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dscale, smem, thread_in_cta_c);
    __syncthreads();

    // dbias parallel sum
    ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dbias, thread_in_cta_nhw);
    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dbias, smem, thread_in_cta_c);
    __syncthreads();

    // The workspace in global memory is distributed across the different CTA.
    int gmem_sums_offset = c_blk_index * gridDim.x * C_ELEMENTS_PER_CTA * 2;
    // Write the data for the CTA to global memory.
    float* gmem_sums = &params.gmem_sums[gmem_sums_offset];
    if (threadIdx.x < THREADS_PER_PIXEL) {
      const int idx = blockIdx.x * THREADS_PER_PIXEL + threadIdx.x;
      write_to_gmem(&gmem_sums[0], idx, dscale);
      write_to_gmem(&gmem_sums[C_ELEMENTS_PER_CTA * gridDim.x], idx, dbias);
    }

    // The counters to count how many CTAs have retired at this point.
    // A given cta uses the same counter every other time through the outer loop.
    int* gmem_retired_ctas = &params.gmem_retired_ctas[c_blk_index % (2 * gridDim.y)];
    inter_block_sync(gmem_retired_ctas, gridDim.x, blockIdx.x == 0);

    // Reset the accumulators for global summation
    zero_array(dscale);
    zero_array(dbias);

// Build the global accumulation
#pragma unroll 1
    for (int idx = threadIdx.x; idx < THREADS_PER_PIXEL * gridDim.x; idx += THREADS_PER_CTA) {
      float tmp1[ELEMENTS_PER_LDG], tmp2[ELEMENTS_PER_LDG];
      read_from_gmem(tmp1, gmem_sums, idx);
      read_from_gmem(tmp2, gmem_sums + C_ELEMENTS_PER_CTA * gridDim.x, idx);

#pragma unroll
      for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
        dscale[i] += tmp1[i];
        dbias[i] += tmp2[i];
      }
    }

    // dscale parallel sum
    if (params.sync_iters > 0) {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatchX<THREADS_PER_CTA>(
          smem, dscale, thread_in_cta_nhw, params.my_data, params.pair_datas, 4 * c_blk_index + 1, params.magic,
          params.sync_iters);
    } else {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dscale, thread_in_cta_nhw);
    }

    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dscale, smem, thread_in_cta_c);
    __syncthreads();

    // dbias parallel sum
    if (params.sync_iters > 0) {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatchX<THREADS_PER_CTA>(
          smem, dbias, thread_in_cta_nhw, params.my_data, params.pair_datas, 4 * c_blk_index + 0, params.magic,
          params.sync_iters);
    } else {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dbias, thread_in_cta_nhw);
    }

    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dbias, smem, thread_in_cta_c);

    // inv-var
    float var[ELEMENTS_PER_LDG];
    zero_array(var);
    if (is_valid_c) {
      read_from_gmem(var, params.gmem_saved_var, thread_c / ELEMENTS_PER_LDG);
    }

    // Normalize the dscale.
    multiply(dscale, var);

    // store dscale/dbias
    bool is_valid_for_saving = is_valid_c && blockIdx.x == 0 && thread_in_cta_nhw == 0;
    if (is_valid_for_saving) {
      if (params.sync_iters > 0) {
        scaled_write_to_gmem(params.gmem_dscale, thread_c / ELEMENTS_PER_LDG, dscale, params.wgrad_coeff);
        scaled_write_to_gmem(params.gmem_dbias, thread_c / ELEMENTS_PER_LDG, dbias, params.wgrad_coeff);
      } else {
        write_to_gmem(params.gmem_dscale, thread_c / ELEMENTS_PER_LDG, dscale);
        write_to_gmem(params.gmem_dbias, thread_c / ELEMENTS_PER_LDG, dbias);
      }
    }

    // scale
    float scale[ELEMENTS_PER_LDG];
    zero_array(scale);
    if (is_valid_c) {
      read_from_gmem(scale, params.gmem_scale, thread_c / ELEMENTS_PER_LDG);
    }

    // Further normalize the dscale to be used in dx calculation
    multiply(dscale, var);
    // scale the inv-var as well, afterwards
    multiply(var, scale);

    // inverse count
    float inv_count = params.svar_inv_count;

    // The base pointer to write to.
    uint16_t* const gmem_dst = &params.gmem_dst[thread_c];

// Store the elements in registers.
#pragma unroll 1
    for (int loop_i = OUTER_LOOPS - 1; loop_i >= 0; --loop_i) {
      // The value for nhw.
      int out_nhw = cta_nhw_regs + loop_i * pixels_per_iteration;

// Normalize the elements and write to memory.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        // Convert to float.
        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage[i]);
        to_float(dy_math, dy_storage[i]);

        float dx[ELEMENTS_PER_LDG];
        bwd_dx(dx, dy_math, var, x_math, mean, dscale, dbias, inv_count);

        // Write back.
        const int idx = out_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
          stg_stream(&gmem_dst[idx * params.c], dx);
        }
      }

      // The next value of nhw.
      out_nhw -= pixels_per_iteration;

// Read the next elements from memory.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = out_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
          ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
          ldg_stream(dy_storage[i], &gmem_dy[idx * params.c]);
        }
      }
    }

    // Normalize the elements from SMEM and write them out.
    if (pixels_in_smem > 0) {
      for (int i = 0; i < PIXELS_PER_THREAD_IN_SMEM; ++i) {
        const int idx = smem_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        const bool is_valid = ((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c;
        if (is_valid) {
          // Read from SMEM.
          int offset = i * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
          PackedStorageType x_storage_local[PACKED_ELEMENTS_PER_LDG], dy_storage_local[PACKED_ELEMENTS_PER_LDG];
          read_from_smem(x_storage_local, &smem_storage_packed[offset], threadIdx.x);
          offset += PIXELS_PER_THREAD_IN_SMEM * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
          read_from_smem(dy_storage_local, &smem_storage_packed[offset], threadIdx.x);
          float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
          to_float(x_math, x_storage_local);
          to_float(dy_math, dy_storage_local);

          float dx[ELEMENTS_PER_LDG];
          bwd_dx(dx, dy_math, var, x_math, mean, dscale, dbias, inv_count);

          // Write back.
          stg_stream(&gmem_dst[idx * params.c], dx);
        }
      }
    }
    // We're about to start on the next c-blk.  Needed?
    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Storage, int THREADS_PER_CTA, int THREADS_PER_PIXEL, int PIXELS_PER_THREAD_IN_REGISTERS,
          int PIXELS_PER_THREAD_IN_SMEM, int ELEMENTS_PER_LDG, int USE_ONLINE_APPROACH, int OUTER_LOOPS_,
          int DESIRED_OCCUPANCY>
__global__ __launch_bounds__(THREADS_PER_CTA,
                             DESIRED_OCCUPANCY) void nhwc_batch_norm_bwd_relu(NhwcBatchNormBwdParams params) {
  // The number of pixels loaded in a single LDG.
  const int PIXELS_PER_LDG = THREADS_PER_CTA / THREADS_PER_PIXEL;
  // The number of pixels computed per CTA stored in registers.
  const int PIXELS_PER_CTA_IN_REGISTERS = PIXELS_PER_THREAD_IN_REGISTERS * PIXELS_PER_LDG;
  // The number of pixels computed per CTA stored in SMEM.
  const int PIXELS_PER_CTA_IN_SMEM = PIXELS_PER_THREAD_IN_SMEM * PIXELS_PER_LDG;
  // The number of C elements per CTA.
  const int C_ELEMENTS_PER_CTA = THREADS_PER_PIXEL * ELEMENTS_PER_LDG;

  // Shared memory to do CTA-wide parallel sums.
  __shared__ float smem[THREADS_PER_PIXEL * (THREADS_PER_CTA / 32) * ELEMENTS_PER_LDG];

  // The adapter for the storage.
  typedef PackedStorage<Storage, ELEMENTS_PER_LDG> PackedStorage_;
  // The data type for packed storage in SMEM.
  typedef typename PackedStorage_::Type PackedStorageType;
  // The number of elements in the packed storage.
  const int PACKED_ELEMENTS_PER_LDG = PackedStorage_::PACKED_ELEMENTS_PER_LDG;
  // Registers to keep the data live for the persistent approach.
  PackedStorageType x_storage[PIXELS_PER_THREAD_IN_REGISTERS][PACKED_ELEMENTS_PER_LDG];
  PackedStorageType dy_storage[PIXELS_PER_THREAD_IN_REGISTERS][PACKED_ELEMENTS_PER_LDG];

  // Shared memory buffer to store the extra pixels.
  extern __shared__ PackedStorageType smem_storage_packed[];

  for (int c_blk_index = blockIdx.y; c_blk_index < params.c_blks; c_blk_index += gridDim.y) {
    // The position in the NHW dimension where the CTA starts.
    int cta_nhw_regs = blockIdx.x * PIXELS_PER_CTA_IN_REGISTERS;
    // The position in the NHW dimension where the CTA starts for the portion in SMEM.
    int cta_nhw_smem = blockIdx.x * PIXELS_PER_CTA_IN_SMEM;
    // Compute the NHW coordinate of the thread in the CTA.
    const int thread_in_cta_nhw = threadIdx.x / THREADS_PER_PIXEL;

    // The position in the C dimension where the CTA starts.
    const int cta_c = c_blk_index * C_ELEMENTS_PER_CTA;
    // Compute the C coordinate of the thread in the CTA.
    const int thread_in_cta_c = threadIdx.x % THREADS_PER_PIXEL;
    // Compute the C coordinate of the thread.
    const int thread_c = cta_c + thread_in_cta_c * ELEMENTS_PER_LDG;

    // Is the thread working on a valid C dimension?
    const int is_valid_c = thread_c < params.c;

    // Registers to store the mean/var/scale/bias used for the entire duration
    // Register usage optimizations:
    // 1. Can combine bias - (mean * var * scale) into a single register
    // 2. Can combine var * scale into a single register
    float varscale[ELEMENTS_PER_LDG];
    zero_array(varscale);
    if (is_valid_c) {
      read_from_gmem(varscale, params.gmem_saved_var, thread_c / ELEMENTS_PER_LDG);
    }
    float tmp[ELEMENTS_PER_LDG];
    zero_array(tmp);
    if (is_valid_c) {
      read_from_gmem(tmp, params.gmem_scale, thread_c / ELEMENTS_PER_LDG);
    }
    multiply(varscale, tmp);
    float mean[ELEMENTS_PER_LDG];
    zero_array(mean);
    if (is_valid_c) {
      read_from_gmem(mean, params.gmem_saved_mean, thread_c / ELEMENTS_PER_LDG);
    }
    zero_array(tmp);
    if (is_valid_c) {
      read_from_gmem(tmp, params.gmem_bias, thread_c / ELEMENTS_PER_LDG);
    }
    float mean_var_scale_bias[ELEMENTS_PER_LDG];
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      mean_var_scale_bias[i] = tmp[i] - (mean[i] * varscale[i]);
    }

    // accumulation related registers
    float count = 0.f, dscale[ELEMENTS_PER_LDG], dbias[ELEMENTS_PER_LDG];
    zero_array(dscale);
    zero_array(dbias);

    // The number of elements loaded by this CTA.
    int cta_count = 0;
    // The base pointers to load from.
    const uint16_t* gmem_src = &params.gmem_src[thread_c];
    const uint16_t* gmem_dy = &params.gmem_dy[thread_c];

    // outer loops
    int OUTER_LOOPS = OUTER_LOOPS_ == 1 ? 1 : params.outer_loops;
    // Load the batch of elements. Compute sum across them
    const int pixels_per_iteration = PIXELS_PER_CTA_IN_REGISTERS * gridDim.x;

    if (OUTER_LOOPS_ != 1) {
      // We cannot load everything to store persistently, so let's makes sure registers and
      // smem are fully utilized
      int offset = params.nhw - pixels_per_iteration * OUTER_LOOPS - PIXELS_PER_CTA_IN_SMEM * gridDim.x;
      cta_nhw_regs += offset;
      cta_nhw_smem += offset;
    }

#pragma unroll 1
    for (int loop_i = 0; loop_i < OUTER_LOOPS; ++loop_i) {
      // The nhw position.
      int nhw_regs = cta_nhw_regs + loop_i * pixels_per_iteration;
      // Update the number of elements loaded by this CTA. TODO: Skip if <= 0!!!
      cta_count += max(0, min(PIXELS_PER_CTA_IN_REGISTERS, params.nhw - nhw_regs));

      // Read the elements from memory.
      float is_valid[PIXELS_PER_THREAD_IN_REGISTERS];
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = nhw_regs + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        zero_array(x_storage[i]);
        zero_array(dy_storage[i]);
        is_valid[i] = 0.f;
        if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
          if (loop_i == OUTER_LOOPS - 1) {
            ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
            ldg_stream(dy_storage[i], &gmem_dy[idx * params.c]);
          } else {
            ldg(x_storage[i], &gmem_src[idx * params.c]);
            ldg(dy_storage[i], &gmem_dy[idx * params.c]);
          }
          is_valid[i] = 1.f;
        }
      }

// Do the math.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        // Convert to float and update
        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage[i]);
        to_float(dy_math, dy_storage[i]);

        // Update the count.
        count += is_valid[i];
        // Invert the count.
        float inv_count = is_valid[i] ? 1.f / count : 0.f;

        relu_bwd(dy_math, x_math, mean_var_scale_bias, varscale, is_valid[i]);
        bwd_update(dscale, dbias, dy_math, x_math, mean, inv_count);
      }
    }

    // The elements to load and store in SMEM.
    int smem_nhw = OUTER_LOOPS * pixels_per_iteration + cta_nhw_smem;
    // Load elements from SMEM, update the CTA count.
    int pixels_in_smem = min(PIXELS_PER_CTA_IN_SMEM, params.nhw - smem_nhw);
    if (pixels_in_smem > 0) {
      cta_count += pixels_in_smem;
      for (int i = 0; i < PIXELS_PER_THREAD_IN_SMEM; ++i) {
        const int idx = smem_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        bool is_pixel_valid = (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c);
        PackedStorageType x_storage_local[PACKED_ELEMENTS_PER_LDG], dy_storage_local[PACKED_ELEMENTS_PER_LDG];
        zero_array(x_storage_local);
        zero_array(dy_storage_local);
        if (is_pixel_valid) {
          ldg_stream(x_storage_local, &gmem_src[idx * params.c]);
          ldg_stream(dy_storage_local, &gmem_dy[idx * params.c]);
        }

        // The offset to store in SMEM.
        int offset = i * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
        // Store in SMEM.
        write_to_smem(&smem_storage_packed[offset], threadIdx.x, x_storage_local);
        offset += PIXELS_PER_THREAD_IN_SMEM * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
        write_to_smem(&smem_storage_packed[offset], threadIdx.x, dy_storage_local);
        // Update the count.
        count += is_pixel_valid;
        // Invert the count.
        float inv_count = is_pixel_valid ? 1.f / count : 0.f;

        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage_local);
        to_float(dy_math, dy_storage_local);

        relu_bwd(dy_math, x_math, mean_var_scale_bias, varscale, is_pixel_valid);
        bwd_update(dscale, dbias, dy_math, x_math, mean, inv_count);
      }
    }

// We scale the mean by the number of elements. It brings more stability.
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      dbias[i] *= count;
      dscale[i] *= count;
    }

    // dscale parallel sum
    ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dscale, thread_in_cta_nhw);
    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dscale, smem, thread_in_cta_c);
    __syncthreads();

    // dbias parallel sum
    ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dbias, thread_in_cta_nhw);
    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dbias, smem, thread_in_cta_c);
    __syncthreads();

    // The workspace in global memory is distributed across the different CTA.
    int gmem_sums_offset = c_blk_index * gridDim.x * C_ELEMENTS_PER_CTA * 2;
    // Write the data for the CTA to global memory.
    float* gmem_sums = &params.gmem_sums[gmem_sums_offset];
    if (threadIdx.x < THREADS_PER_PIXEL) {
      const int idx = blockIdx.x * THREADS_PER_PIXEL + threadIdx.x;
      write_to_gmem(&gmem_sums[0], idx, dscale);
      write_to_gmem(&gmem_sums[C_ELEMENTS_PER_CTA * gridDim.x], idx, dbias);
    }

    // The counters to count how many CTAs have retired at this point.
    // A given cta uses the same counter every other time through the outer loop.
    int* gmem_retired_ctas = &params.gmem_retired_ctas[c_blk_index % (2 * gridDim.y)];
    inter_block_sync(gmem_retired_ctas, gridDim.x, blockIdx.x == 0);

    // Reset the accumulators for global summation
    zero_array(dscale);
    zero_array(dbias);

// Build the global accumulation
#pragma unroll 1
    for (int idx = threadIdx.x; idx < THREADS_PER_PIXEL * gridDim.x; idx += THREADS_PER_CTA) {
      float tmp1[ELEMENTS_PER_LDG], tmp2[ELEMENTS_PER_LDG];
      read_from_gmem(tmp1, gmem_sums, idx);
      read_from_gmem(tmp2, gmem_sums + C_ELEMENTS_PER_CTA * gridDim.x, idx);

#pragma unroll
      for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
        dscale[i] += tmp1[i];
        dbias[i] += tmp2[i];
      }
    }

    // dscale parallel sum
    if (params.sync_iters > 0) {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatchX<THREADS_PER_CTA>(
          smem, dscale, thread_in_cta_nhw, params.my_data, params.pair_datas, 4 * c_blk_index + 1, params.magic,
          params.sync_iters);
    } else {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dscale, thread_in_cta_nhw);
    }

    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dscale, smem, thread_in_cta_c);
    __syncthreads();

    // dbias parallel sum
    if (params.sync_iters > 0) {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatchX<THREADS_PER_CTA>(
          smem, dbias, thread_in_cta_nhw, params.my_data, params.pair_datas, 4 * c_blk_index + 0, params.magic,
          params.sync_iters);
    } else {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dbias, thread_in_cta_nhw);
    }

    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dbias, smem, thread_in_cta_c);

    // Normalize the dscale.
    float var[ELEMENTS_PER_LDG];
    zero_array(var);
    if (is_valid_c) {
      read_from_gmem(var, params.gmem_saved_var, thread_c / ELEMENTS_PER_LDG);
    }
    multiply(dscale, var);

    // store dscale/dbias
    bool is_valid_for_saving = is_valid_c && blockIdx.x == 0 && thread_in_cta_nhw == 0;
    if (is_valid_for_saving) {
      if (params.sync_iters > 0) {
        scaled_write_to_gmem(params.gmem_dscale, thread_c / ELEMENTS_PER_LDG, dscale, params.wgrad_coeff);
        scaled_write_to_gmem(params.gmem_dbias, thread_c / ELEMENTS_PER_LDG, dbias, params.wgrad_coeff);
      } else {
        write_to_gmem(params.gmem_dscale, thread_c / ELEMENTS_PER_LDG, dscale);
        write_to_gmem(params.gmem_dbias, thread_c / ELEMENTS_PER_LDG, dbias);
      }
    }

    // Further normalize the dscale to be used in dx calculation
    float scale[ELEMENTS_PER_LDG];
    zero_array(scale);
    if (is_valid_c) {
      read_from_gmem(scale, params.gmem_scale, thread_c / ELEMENTS_PER_LDG);
    }
    multiply(dscale, var);
    // scale the inv-var as well, afterwards
    multiply(var, scale);

    // inverse count
    float inv_count = params.svar_inv_count;

    // The base pointer to write to.
    uint16_t* const gmem_dst = &params.gmem_dst[thread_c];

// Store the elements in registers.
#pragma unroll 1
    for (int loop_i = OUTER_LOOPS - 1; loop_i >= 0; --loop_i) {
      // The value for nhw.
      int out_nhw = cta_nhw_regs + loop_i * pixels_per_iteration;

// Normalize the elements and write to memory.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        // Convert to float.
        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage[i]);
        to_float(dy_math, dy_storage[i]);
        relu_bwd_for_dx(dy_math, x_math, mean_var_scale_bias, var);

        float dx[ELEMENTS_PER_LDG];
        bwd_dx(dx, dy_math, var, x_math, mean, dscale, dbias, inv_count);

        // Write back.
        const int idx = out_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
          stg_stream(&gmem_dst[idx * params.c], dx);
        }
      }

      // The next value of nhw.
      out_nhw -= pixels_per_iteration;

// Read the next elements from memory.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = out_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
          ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
          ldg_stream(dy_storage[i], &gmem_dy[idx * params.c]);
        }
      }
    }

    // Normalize the elements from SMEM and write them out.
    if (pixels_in_smem > 0) {
      for (int i = 0; i < PIXELS_PER_THREAD_IN_SMEM; ++i) {
        const int idx = smem_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        const bool is_valid = ((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c;
        if (is_valid) {
          // Read from SMEM.
          int offset = i * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
          PackedStorageType x_storage_local[PACKED_ELEMENTS_PER_LDG], dy_storage_local[PACKED_ELEMENTS_PER_LDG];
          read_from_smem(x_storage_local, &smem_storage_packed[offset], threadIdx.x);
          offset += PIXELS_PER_THREAD_IN_SMEM * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
          read_from_smem(dy_storage_local, &smem_storage_packed[offset], threadIdx.x);
          float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
          to_float(x_math, x_storage_local);
          to_float(dy_math, dy_storage_local);
          relu_bwd_for_dx(dy_math, x_math, mean_var_scale_bias, var);

          float dx[ELEMENTS_PER_LDG];
          bwd_dx(dx, dy_math, var, x_math, mean, dscale, dbias, inv_count);

          // Write back.
          stg_stream(&gmem_dst[idx * params.c], dx);
        }
      }
    }
    // We're about to start on the next c-blk.  Needed?
    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Storage, int THREADS_PER_CTA, int THREADS_PER_PIXEL, int PIXELS_PER_THREAD_IN_REGISTERS,
          int PIXELS_PER_THREAD_IN_SMEM, int ELEMENTS_PER_LDG, int USE_ONLINE_APPROACH, int OUTER_LOOPS_,
          int DESIRED_OCCUPANCY>
__global__ __launch_bounds__(THREADS_PER_CTA,
                             DESIRED_OCCUPANCY) void nhwc_batch_norm_bwd_add_relu(NhwcBatchNormBwdParams params) {
  // The number of pixels loaded in a single LDG.
  const int PIXELS_PER_LDG = THREADS_PER_CTA / THREADS_PER_PIXEL;
  // The number of pixels computed per CTA stored in registers.
  const int PIXELS_PER_CTA_IN_REGISTERS = PIXELS_PER_THREAD_IN_REGISTERS * PIXELS_PER_LDG;
  // The number of pixels computed per CTA stored in SMEM.
  const int PIXELS_PER_CTA_IN_SMEM = PIXELS_PER_THREAD_IN_SMEM * PIXELS_PER_LDG;
  // The number of C elements per CTA.
  const int C_ELEMENTS_PER_CTA = THREADS_PER_PIXEL * ELEMENTS_PER_LDG;

  // Shared memory to do CTA-wide parallel sums.
  __shared__ float smem[THREADS_PER_PIXEL * (THREADS_PER_CTA / 32) * ELEMENTS_PER_LDG];

  // The adapter for the storage.
  typedef PackedStorage<Storage, ELEMENTS_PER_LDG> PackedStorage_;
  // The data type for packed storage in SMEM.
  typedef typename PackedStorage_::Type PackedStorageType;
  // The number of elements in the packed storage.
  const int PACKED_ELEMENTS_PER_LDG = PackedStorage_::PACKED_ELEMENTS_PER_LDG;
  // Registers to keep the data live for the persistent approach.
  PackedStorageType x_storage[PIXELS_PER_THREAD_IN_REGISTERS][PACKED_ELEMENTS_PER_LDG];
  PackedStorageType dy_storage[PIXELS_PER_THREAD_IN_REGISTERS][PACKED_ELEMENTS_PER_LDG];

  // Shared memory buffer to store the extra pixels.
  extern __shared__ PackedStorageType smem_storage_packed[];

  for (int c_blk_index = blockIdx.y; c_blk_index < params.c_blks; c_blk_index += gridDim.y) {
    // The position in the NHW dimension where the CTA starts.
    int cta_nhw_regs = blockIdx.x * PIXELS_PER_CTA_IN_REGISTERS;
    // The position in the NHW dimension where the CTA starts for the portion in SMEM.
    int cta_nhw_smem = blockIdx.x * PIXELS_PER_CTA_IN_SMEM;
    // Compute the NHW coordinate of the thread in the CTA.
    const int thread_in_cta_nhw = threadIdx.x / THREADS_PER_PIXEL;

    // The position in the C dimension where the CTA starts.
    const int cta_c = c_blk_index * C_ELEMENTS_PER_CTA;
    // Compute the C coordinate of the thread in the CTA.
    const int thread_in_cta_c = threadIdx.x % THREADS_PER_PIXEL;
    // Compute the C coordinate of the thread.
    const int thread_c = cta_c + thread_in_cta_c * ELEMENTS_PER_LDG;

    // Is the thread working on a valid C dimension?
    const int is_valid_c = thread_c < params.c;

    float mean[ELEMENTS_PER_LDG];
    zero_array(mean);
    if (is_valid_c) {
      read_from_gmem(mean, params.gmem_saved_mean, thread_c / ELEMENTS_PER_LDG);
    }

    // accumulation related registers
    float count = 0.f, dscale[ELEMENTS_PER_LDG], dbias[ELEMENTS_PER_LDG];
    zero_array(dscale);
    zero_array(dbias);

    // The number of elements loaded by this CTA.
    int cta_count = 0;
    // The base pointers to load from.
    const uint16_t* gmem_src = &params.gmem_src[thread_c];
    const uint16_t* gmem_dy = &params.gmem_dy[thread_c];
    uint16_t* gmem_dst1 = &params.gmem_dst1[thread_c];

    // outer loops
    int OUTER_LOOPS = OUTER_LOOPS_ == 1 ? 1 : params.outer_loops;
    // Load the batch of elements. Compute sum across them
    const int pixels_per_iteration = PIXELS_PER_CTA_IN_REGISTERS * gridDim.x;

    if (OUTER_LOOPS_ != 1) {
      // We cannot load everything to store persistently, so let's makes sure registers and
      // smem are fully utilized, offset is evenly divisible by 32
      int offset = (pixels_per_iteration * OUTER_LOOPS + PIXELS_PER_CTA_IN_SMEM * gridDim.x - params.nhw) & ~31;
      cta_nhw_regs -= offset;
      cta_nhw_smem -= offset;
    }

    const unsigned int* const gmem_relu_bitmask =
        params.gmem_relu_bitmask + ((params.nhw + 31) & ~31) * 2 * c_blk_index;

#pragma unroll 1
    for (int loop_i = 0; loop_i < OUTER_LOOPS; ++loop_i) {
      // The nhw position.
      int nhw_regs = cta_nhw_regs + loop_i * pixels_per_iteration;
      // Update the number of elements loaded by this CTA. TODO: Skip if <= 0!!!
      cta_count += max(0, min(PIXELS_PER_CTA_IN_REGISTERS, params.nhw - nhw_regs));

      int lane_id = threadIdx.x & 31;

      // Read the elements from memory.
      float is_valid[PIXELS_PER_THREAD_IN_REGISTERS];
      unsigned int relu_mask[PIXELS_PER_THREAD_IN_REGISTERS];
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = nhw_regs + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        zero_array(x_storage[i]);
        zero_array(dy_storage[i]);
        is_valid[i] = 0.f;
        const bool is_valid_nhw = static_cast<unsigned int>(idx) < static_cast<unsigned int>(params.nhw);
        if (is_valid_nhw) {
          if (is_valid_c) {
            if (loop_i == OUTER_LOOPS - 1) {
              ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
              ldg_stream(dy_storage[i], &gmem_dy[idx * params.c]);
            } else {
              ldg(x_storage[i], &gmem_src[idx * params.c]);
              ldg(dy_storage[i], &gmem_dy[idx * params.c]);
            }
            is_valid[i] = 1.f;
          }

          if (lane_id < ELEMENTS_PER_LDG) {
            relu_mask[i] = gmem_relu_bitmask[idx * 2 + lane_id];
          }
        }
      }

// Do the math.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = nhw_regs + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        // Convert to float and update
        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        bool rectified[ELEMENTS_PER_LDG];
#pragma unroll
        for (int j = 0; j < ELEMENTS_PER_LDG; ++j) {
          rectified[j] = ((__shfl_sync(0xFFFFFFFFU, relu_mask[i], j) & (1U << lane_id)) != 0);
        }
        to_float(x_math, x_storage[i]);
        to_float(dy_math, dy_storage[i]);

        // Update the count.
        count += is_valid[i];
        // Invert the count.
        float inv_count = is_valid[i] ? 1.f / count : 0.f;

        relu_bwd(dy_math, rectified, is_valid[i]);
        bwd_update(dscale, dbias, dy_math, x_math, mean, inv_count);

        // Lastly we need 'dy' only for BN, so store the 'relu-dgrad'ed version
        from_float(dy_storage[i], dy_math);

        // dZ for elementwise add
        if (is_valid[i]) {
          if (loop_i == OUTER_LOOPS - 1) {
            stg_stream(&gmem_dst1[idx * params.c], dy_storage[i]);
          } else {
            stg(&gmem_dst1[idx * params.c], dy_storage[i]);
          }
        }
      }
    }

    // The elements to load and store in SMEM.
    int smem_nhw = OUTER_LOOPS * pixels_per_iteration + cta_nhw_smem;
    // Load elements from SMEM, update the CTA count.
    int pixels_in_smem = min(PIXELS_PER_CTA_IN_SMEM, params.nhw - smem_nhw);
    if (pixels_in_smem > 0) {
      cta_count += pixels_in_smem;
      for (int i = 0; i < PIXELS_PER_THREAD_IN_SMEM; ++i) {
        const int idx = smem_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        const bool is_pixel_valid_nhw = static_cast<unsigned int>(idx) < static_cast<unsigned int>(params.nhw);
        const bool is_pixel_valid = is_pixel_valid_nhw && is_valid_c;
        PackedStorageType x_storage_local[PACKED_ELEMENTS_PER_LDG], dy_storage_local[PACKED_ELEMENTS_PER_LDG];
        unsigned int relu_mask;
        int lane_id = threadIdx.x & 31;
        zero_array(x_storage_local);
        zero_array(dy_storage_local);
        if (is_pixel_valid_nhw) {
          if (is_valid_c) {
            ldg_stream(x_storage_local, &gmem_src[idx * params.c]);
            ldg_stream(dy_storage_local, &gmem_dy[idx * params.c]);
          }
          if (lane_id < ELEMENTS_PER_LDG) {
            relu_mask = gmem_relu_bitmask[idx * 2 + lane_id];
          }
        }
        bool rectified[ELEMENTS_PER_LDG];
#pragma unroll
        for (int j = 0; j < ELEMENTS_PER_LDG; ++j) {
          rectified[j] = ((__shfl_sync(0xFFFFFFFFU, relu_mask, j) & (1U << lane_id)) != 0);
        }

        // The offset to store in SMEM.
        int offset = i * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
        // Store in SMEM.
        write_to_smem(&smem_storage_packed[offset], threadIdx.x, x_storage_local);
        offset += PIXELS_PER_THREAD_IN_SMEM * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
        // Update the count.
        count += is_pixel_valid;
        // Invert the count.
        float inv_count = is_pixel_valid ? 1.f / count : 0.f;

        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage_local);
        to_float(dy_math, dy_storage_local);

        relu_bwd(dy_math, rectified, is_pixel_valid);
        bwd_update(dscale, dbias, dy_math, x_math, mean, inv_count);

        from_float(dy_storage_local, dy_math);
        // dZ for elementwise add
        if (is_pixel_valid) {
          stg_stream(&gmem_dst1[idx * params.c], dy_storage_local);
        }
        // only store the 'relu-dgrad'ed version!
        write_to_smem(&smem_storage_packed[offset], threadIdx.x, dy_storage_local);
      }
    }

// We scale the mean by the number of elements. It brings more stability.
#pragma unroll
    for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
      dbias[i] *= count;
      dscale[i] *= count;
    }

    // dscale parallel sum
    ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dscale, thread_in_cta_nhw);
    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dscale, smem, thread_in_cta_c);
    __syncthreads();

    // dbias parallel sum
    ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dbias, thread_in_cta_nhw);
    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dbias, smem, thread_in_cta_c);
    __syncthreads();

    // The workspace in global memory is distributed across the different CTA.
    int gmem_sums_offset = c_blk_index * gridDim.x * C_ELEMENTS_PER_CTA * 2;
    // Write the data for the CTA to global memory.
    float* gmem_sums = &params.gmem_sums[gmem_sums_offset];
    if (threadIdx.x < THREADS_PER_PIXEL) {
      const int idx = blockIdx.x * THREADS_PER_PIXEL + threadIdx.x;
      write_to_gmem(&gmem_sums[0], idx, dscale);
      write_to_gmem(&gmem_sums[C_ELEMENTS_PER_CTA * gridDim.x], idx, dbias);
    }

    // The counters to count how many CTAs have retired at this point.
    // A given cta uses the same counter every other time through the outer loop.
    int* gmem_retired_ctas = &params.gmem_retired_ctas[c_blk_index % (2 * gridDim.y)];
    inter_block_sync(gmem_retired_ctas, gridDim.x, blockIdx.x == 0);

    // Reset the accumulators for global summation
    zero_array(dscale);
    zero_array(dbias);

// Build the global accumulation
#pragma unroll 1
    for (int idx = threadIdx.x; idx < THREADS_PER_PIXEL * gridDim.x; idx += THREADS_PER_CTA) {
      float tmp1[ELEMENTS_PER_LDG], tmp2[ELEMENTS_PER_LDG];
      read_from_gmem(tmp1, gmem_sums, idx);
      read_from_gmem(tmp2, gmem_sums + C_ELEMENTS_PER_CTA * gridDim.x, idx);

#pragma unroll
      for (int i = 0; i < ELEMENTS_PER_LDG; ++i) {
        dscale[i] += tmp1[i];
        dbias[i] += tmp2[i];
      }
    }

    // dscale parallel sum
    if (params.sync_iters > 0) {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatchX<THREADS_PER_CTA>(
          smem, dscale, thread_in_cta_nhw, params.my_data, params.pair_datas, 4 * c_blk_index + 1, params.magic,
          params.sync_iters);
    } else {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dscale, thread_in_cta_nhw);
    }

    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dscale, smem, thread_in_cta_c);
    __syncthreads();

    // dbias parallel sum
    if (params.sync_iters > 0) {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatchX<THREADS_PER_CTA>(
          smem, dbias, thread_in_cta_nhw, params.my_data, params.pair_datas, 4 * c_blk_index + 0, params.magic,
          params.sync_iters);
    } else {
      ParallelSums<THREADS_PER_PIXEL, ELEMENTS_PER_LDG>::dispatch<THREADS_PER_CTA>(smem, dbias, thread_in_cta_nhw);
    }

    __syncthreads();
    // The values in shared memory correspond to the CTA-wide sums.
    read_from_smem(dbias, smem, thread_in_cta_c);

    // Normalize the dscale.
    float var[ELEMENTS_PER_LDG];
    zero_array(var);
    if (is_valid_c) {
      read_from_gmem(var, params.gmem_saved_var, thread_c / ELEMENTS_PER_LDG);
    }
    multiply(dscale, var);

    // store dscale/dbias
    bool is_valid_for_saving = is_valid_c && blockIdx.x == 0 && thread_in_cta_nhw == 0;
    if (is_valid_for_saving) {
      if (params.sync_iters > 0) {
        scaled_write_to_gmem(params.gmem_dscale, thread_c / ELEMENTS_PER_LDG, dscale, params.wgrad_coeff);
        scaled_write_to_gmem(params.gmem_dbias, thread_c / ELEMENTS_PER_LDG, dbias, params.wgrad_coeff);
      } else {
        write_to_gmem(params.gmem_dscale, thread_c / ELEMENTS_PER_LDG, dscale);
        write_to_gmem(params.gmem_dbias, thread_c / ELEMENTS_PER_LDG, dbias);
      }
    }

    // Further normalize the dscale to be used in dx calculation
    float scale[ELEMENTS_PER_LDG];
    zero_array(scale);
    if (is_valid_c) {
      read_from_gmem(scale, params.gmem_scale, thread_c / ELEMENTS_PER_LDG);
    }
    multiply(dscale, var);
    // scale the inv-var as well, afterwards
    multiply(var, scale);

    // inverse count
    float inv_count = params.svar_inv_count;

    // The base pointer to write to.
    uint16_t* const gmem_dst = &params.gmem_dst[thread_c];

// Store the elements in registers.
#pragma unroll 1
    for (int loop_i = OUTER_LOOPS - 1; loop_i >= 0; --loop_i) {
      // The value for nhw.
      int out_nhw = cta_nhw_regs + loop_i * pixels_per_iteration;

// Normalize the elements and write to memory.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = out_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        const bool is_valid = ((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c;
        // Convert to float.
        float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
        to_float(x_math, x_storage[i]);
        to_float(dy_math, dy_storage[i]);

        float dx[ELEMENTS_PER_LDG];
        bwd_dx(dx, dy_math, var, x_math, mean, dscale, dbias, inv_count);

        // Write back.
        if (is_valid) {
          stg_stream(&gmem_dst[idx * params.c], dx);
        }
      }

      // The next value of nhw.
      out_nhw -= pixels_per_iteration;

// Read the next elements from memory.
#pragma unroll
      for (int i = 0; i < PIXELS_PER_THREAD_IN_REGISTERS; ++i) {
        const int idx = out_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        float y[ELEMENTS_PER_LDG];
        zero_array(y);
        if (((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c) {
          ldg_stream(x_storage[i], &gmem_src[idx * params.c]);
          ldg_stream(dy_storage[i], &gmem_dst1[idx * params.c]);
        }
      }
    }

    // Normalize the elements from SMEM and write them out.
    if (pixels_in_smem > 0) {
      for (int i = 0; i < PIXELS_PER_THREAD_IN_SMEM; ++i) {
        const int idx = smem_nhw + thread_in_cta_nhw + i * PIXELS_PER_LDG;
        const bool is_valid = ((unsigned int)idx < (unsigned int)params.nhw) && is_valid_c;
        if (is_valid) {
          // Read from SMEM.
          int offset = i * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
          PackedStorageType x_storage_local[PACKED_ELEMENTS_PER_LDG], dy_storage_local[PACKED_ELEMENTS_PER_LDG];
          read_from_smem(x_storage_local, &smem_storage_packed[offset], threadIdx.x);
          offset += PIXELS_PER_THREAD_IN_SMEM * THREADS_PER_CTA * PACKED_ELEMENTS_PER_LDG;
          read_from_smem(dy_storage_local, &smem_storage_packed[offset], threadIdx.x);
          float x_math[ELEMENTS_PER_LDG], dy_math[ELEMENTS_PER_LDG];
          to_float(x_math, x_storage_local);
          to_float(dy_math, dy_storage_local);

          float dx[ELEMENTS_PER_LDG];
          bwd_dx(dx, dy_math, var, x_math, mean, dscale, dbias, inv_count);

          // Write back.
          stg_stream(&gmem_dst[idx * params.c], dx);
        }
      }
    }
    // We're about to start on the next c-blk.  Needed?
    __syncthreads();
  }
}

#endif  // MXNET_OPERATOR_NN_CUDNN_NHWC_BATCH_NORM_KERNEL_H_


================================================
FILE: apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda.cpp
================================================
#include <torch/torch.h>

#include <cstdint>
#include <vector>

void index_mul_2d_float_foward_cuda(at::Tensor& out, const at::Tensor& in1, const at::Tensor& in2,
                                    const at::Tensor& idx1);

void index_mul_2d_float_backward_cuda(at::Tensor& grad_in1, at::Tensor& grad_in2, const at::Tensor& grad_out,
                                      const at::Tensor& in1, const at::Tensor& in2, const at::Tensor& idx1);

void index_mul_2d_float_backward_backward_cuda(at::Tensor& grad_grad_out, at::Tensor& grad_in1, at::Tensor& grad_in2,
                                               const at::Tensor& grad_out, const at::Tensor& grad_grad_in1,
                                               const at::Tensor& grad_grad_in2, const at::Tensor& in1,
                                               const at::Tensor& in2, const at::Tensor& idx1);

void index_mul_2d_half_foward_cuda(at::Tensor& out, const at::Tensor& in1, const at::Tensor& in2,
                                   const at::Tensor& idx1);

void index_mul_2d_half_backward_cuda(at::Tensor& grad_in1, at::Tensor& grad_in2, const at::Tensor& grad_out,
                                     const at::Tensor& in1, const at::Tensor& in2, const at::Tensor& idx1);

void index_mul_2d_half_backward_backward_cuda(at::Tensor& grad_grad_out, at::Tensor& grad_in1, at::Tensor& grad_in2,
                                              const at::Tensor& grad_out, const at::Tensor& grad_grad_in1,
                                              const at::Tensor& grad_grad_in2, const at::Tensor& in1,
                                              const at::Tensor& in2, const at::Tensor& idx1);

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

void index_mul_2d_float_forward(at::Tensor& out, const at::Tensor& in1, const at::Tensor& in2, const at::Tensor& idx1) {
  return index_mul_2d_float_foward_cuda(out, in1, in2, idx1);
}

void index_mul_2d_float_backward(at::Tensor& grad_in1, at::Tensor& grad_in2, const at::Tensor& grad_out,
                                 const at::Tensor& in1, const at::Tensor& in2, const at::Tensor& idx1) {
  return index_mul_2d_float_backward_cuda(grad_in1, grad_in2, grad_out, in1, in2, idx1);
}

void index_mul_2d_float_backwrad_backward(at::Tensor& grad_grad_out, at::Tensor& grad_in1, at::Tensor& grad_in2,
                                          const at::Tensor& grad_out, const at::Tensor& grad_grad_in1,
                                          const at::Tensor& grad_grad_in2, const at::Tensor& in1, const at::Tensor& in2,
                                          const at::Tensor& idx1) {
  return index_mul_2d_float_backward_backward_cuda(grad_grad_out, grad_in1, grad_in2, grad_out, grad_grad_in1,
                                                   grad_grad_in2, in1, in2, idx1);
}

void index_mul_2d_half_forward(at::Tensor& out, const at::Tensor& in1, const at::Tensor& in2, const at::Tensor& idx1) {
  return index_mul_2d_half_foward_cuda(out, in1, in2, idx1);
}

void index_mul_2d_half_backward(at::Tensor& grad_in1, at::Tensor& grad_in2, const at::Tensor& grad_out,
                                const at::Tensor& in1, const at::Tensor& in2, const at::Tensor& idx1) {
  return index_mul_2d_half_backward_cuda(grad_in1, grad_in2, grad_out, in1, in2, idx1);
}

void index_mul_2d_half_backwrad_backward(at::Tensor& grad_grad_out, at::Tensor& grad_in1, at::Tensor& grad_in2,
                                         const at::Tensor& grad_out, const at::Tensor& grad_grad_in1,
                                         const at::Tensor& grad_grad_in2, const at::Tensor& in1, const at::Tensor& in2,
                                         const at::Tensor& idx1) {
  return index_mul_2d_half_backward_backward_cuda(grad_grad_out, grad_in1, grad_in2, grad_out, grad_grad_in1,
                                                  grad_grad_in2, in1, in2, idx1);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("float_forward", &index_mul_2d_float_forward, "index mul float calculation forward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("float_backward", &index_mul_2d_float_backward, "index mul float calculation backward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("float_backward_backward", &index_mul_2d_float_backwrad_backward,
        "index mul float calculation backward backward (CUDA)", py::call_guard<py::gil_scoped_release>());
  m.def("half_forward", &index_mul_2d_half_forward, "index mul half calculation forward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("half_backward", &index_mul_2d_half_backward, "index mul half calculation backward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("half_backward_backward", &index_mul_2d_half_backwrad_backward,
        "index mul half calculation backward backward (CUDA)", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>

#include <ATen/cuda/Atomic.cuh>

__global__ void index_mul_2d_float_dim64(float* out, const float* in1, const float* in2, const int64_t* idx1,
                                         const int64_t size) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  constexpr int fea_dim = 64;

  if (start_idx < size) {
    int64_t vec_idx1 = (idx1[start_idx] * fea_dim) / 4 + tidx;
    int64_t vec_idx2 = (start_idx * fea_dim) / 4 + tidx;

    float4 res, src1, src2;
    src1 = reinterpret_cast<const float4*>(in1)[vec_idx1];
    src2 = reinterpret_cast<const float4*>(in2)[vec_idx2];
    res.x = src1.x * src2.x;
    res.y = src1.y * src2.y;
    res.z = src1.z * src2.z;
    res.w = src1.w * src2.w;
    reinterpret_cast<float4*>(out)[vec_idx2] = res;
  }
}

__global__ void index_mul_2d_float(float* out, const float* in1, const float* in2, const int64_t* idx1,
                                   const int64_t size, const int64_t fea_dim) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  const int stride = blockDim.x;

  if (start_idx < size) {
    int64_t vec_idx1 = (idx1[start_idx] * fea_dim);
    int64_t vec_idx2 = (start_idx * fea_dim);

    for (int i = tidx; i < fea_dim; i += stride) {
      out[vec_idx2 + i] = in1[vec_idx1 + i] * in2[vec_idx2 + i];
    }
  }
}

__global__ void index_mul_2d_half(at::Half* out, const at::Half* in1, const at::Half* in2, const int64_t* idx1,
                                  const int64_t size, const int64_t fea_dim) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  const int stride = blockDim.x;

  if (start_idx < size) {
    int64_t vec_idx1 = (idx1[start_idx] * fea_dim);
    int64_t vec_idx2 = (start_idx * fea_dim);

    for (int i = tidx; i < fea_dim; i += stride) {
      out[vec_idx2 + i] = at::Half(static_cast<float>(in1[vec_idx1 + i]) * static_cast<float>(in2[vec_idx2 + i]));
    }
  }
}

__global__ void index_mul_2d_grad_float_dim64(float* grad_in1, float* grad_in2, const float* grad_out, const float* in1,
                                              const float* in2, const int64_t* idx1, const int64_t size) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  constexpr int fea_dim = 64;

  if (start_idx < size) {
    int64_t vec_idx1 = (idx1[start_idx] * fea_dim) / 4 + tidx;
    int64_t vec_idx2 = (start_idx * fea_dim) / 4 + tidx;

    float4 src_in1, src_in2, src_grad_out, dst_grad_in2;
    src_grad_out = reinterpret_cast<const float4*>(grad_out)[vec_idx2];
    src_in1 = reinterpret_cast<const float4*>(in1)[vec_idx1];
    src_in2 = reinterpret_cast<const float4*>(in2)[vec_idx2];
    int64_t grad_in1_base_idx = idx1[start_idx] * fea_dim + tidx * 4;
    gpuAtomicAdd(grad_in1 + grad_in1_base_idx + 0, src_grad_out.x * src_in2.x);
    gpuAtomicAdd(grad_in1 + grad_in1_base_idx + 1, src_grad_out.y * src_in2.y);
    gpuAtomicAdd(grad_in1 + grad_in1_base_idx + 2, src_grad_out.z * src_in2.z);
    gpuAtomicAdd(grad_in1 + grad_in1_base_idx + 3, src_grad_out.w * src_in2.w);
    dst_grad_in2.x = src_grad_out.x * src_in1.x;
    dst_grad_in2.y = src_grad_out.y * src_in1.y;
    dst_grad_in2.z = src_grad_out.z * src_in1.z;
    dst_grad_in2.w = src_grad_out.w * src_in1.w;
    reinterpret_cast<float4*>(grad_in2)[vec_idx2] = dst_grad_in2;
  }
}

__global__ void index_mul_2d_grad_float(float* grad_in1, float* grad_in2, const float* grad_out, const float* in1,
                                        const float* in2, const int64_t* idx1, const int64_t size,
                                        const int64_t fea_dim) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  const int stride = blockDim.x;

  if (start_idx < size) {
    int64_t vec_idx1 = idx1[start_idx] * fea_dim;
    int64_t vec_idx2 = start_idx * fea_dim;

    for (int i = tidx; i < fea_dim; i += stride) {
      float src_in1 = in1[vec_idx1 + i];
      float src_in2 = in2[vec_idx2 + i];
      float src_grad_out = grad_out[vec_idx2 + i];
      grad_in2[vec_idx2 + i] = src_grad_out * src_in1;
      gpuAtomicAdd(grad_in1 + vec_idx1 + i, src_grad_out * src_in2);
    }
  }
}

__global__ void index_mul_2d_grad_half(at::Half* grad_in1, at::Half* grad_in2, const at::Half* grad_out,
                                       const at::Half* in1, const at::Half* in2, const int64_t* idx1,
                                       const int64_t size, const int64_t fea_dim) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  const int stride = blockDim.x;

  if (start_idx < size) {
    int64_t vec_idx1 = idx1[start_idx] * fea_dim;
    int64_t vec_idx2 = start_idx * fea_dim;

    for (int i = tidx; i < fea_dim; i += stride) {
      float src_in1 = static_cast<float>(in1[vec_idx1 + i]);
      float src_in2 = static_cast<float>(in2[vec_idx2 + i]);
      float src_grad_out = static_cast<float>(grad_out[vec_idx2 + i]);
      grad_in2[vec_idx2 + i] = at::Half(src_grad_out * src_in1);
      gpuAtomicAdd(grad_in1 + vec_idx1 + i, at::Half(src_grad_out * src_in2));
    }
  }
}

__global__ void index_mul_2d_grad_grad_float_dim64(float* grad_grad_out, float* grad_in1, float* grad_in2,
                                                   const float* grad_out, const float* grad_grad_in1,
                                                   const float* grad_grad_in2, const float* in1, const float* in2,
                                                   const int64_t* idx1, const int64_t size) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  constexpr int fea_dim = 64;

  if (start_idx < size) {
    int64_t vec_idx1 = (idx1[start_idx] * fea_dim) / 4 + tidx;
    int64_t vec_idx2 = (start_idx * fea_dim) / 4 + tidx;

    float4 src_grad_grad_in1, src_in1, src_grad_grad_in2, src_in2, src_grad_out;
    float4 dst_grad_grad_out, dst_grad_in2;
    src_grad_grad_in1 = reinterpret_cast<const float4*>(grad_grad_in1)[vec_idx1];
    src_in1 = reinterpret_cast<const float4*>(in1)[vec_idx1];
    src_grad_grad_in2 = reinterpret_cast<const float4*>(grad_grad_in2)[vec_idx2];
    src_in2 = reinterpret_cast<const float4*>(in2)[vec_idx2];
    dst_grad_grad_out.x = src_grad_grad_in1.x * src_in2.x + src_grad_grad_in2.x * src_in1.x;
    dst_grad_grad_out.y = src_grad_grad_in1.y * src_in2.y + src_grad_grad_in2.y * src_in1.y;
    dst_grad_grad_out.z = src_grad_grad_in1.z * src_in2.z + src_grad_grad_in2.z * src_in1.z;
    dst_grad_grad_out.w = src_grad_grad_in1.w * src_in2.w + src_grad_grad_in2.w * src_in1.w;
    reinterpret_cast<float4*>(grad_grad_out)[vec_idx2] = dst_grad_grad_out;
    src_grad_out = reinterpret_cast<const float4*>(grad_out)[vec_idx2];
    int64_t grad_in1_base_idx = idx1[start_idx] * fea_dim + tidx * 4;
    gpuAtomicAdd(grad_in1 + grad_in1_base_idx + 0, src_grad_grad_in2.x * src_grad_out.x);
    gpuAtomicAdd(grad_in1 + grad_in1_base_idx + 1, src_grad_grad_in2.y * src_grad_out.y);
    gpuAtomicAdd(grad_in1 + grad_in1_base_idx + 2, src_grad_grad_in2.z * src_grad_out.z);
    gpuAtomicAdd(grad_in1 + grad_in1_base_idx + 3, src_grad_grad_in2.w * src_grad_out.w);
    dst_grad_in2.x = src_grad_grad_in1.x * src_grad_out.x;
    dst_grad_in2.y = src_grad_grad_in1.y * src_grad_out.y;
    dst_grad_in2.z = src_grad_grad_in1.z * src_grad_out.z;
    dst_grad_in2.w = src_grad_grad_in1.w * src_grad_out.w;
    reinterpret_cast<float4*>(grad_in2)[vec_idx2] = dst_grad_in2;
  }
}

__global__ void index_mul_2d_grad_grad_float(float* grad_grad_out, float* grad_in1, float* grad_in2,
                                             const float* grad_out, const float* grad_grad_in1,
                                             const float* grad_grad_in2, const float* in1, const float* in2,
                                             const int64_t* idx1, const int64_t size, const int64_t fea_dim) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  const int stride = blockDim.x;

  if (start_idx < size) {
    int64_t vec_idx1 = idx1[start_idx] * fea_dim;
    int64_t vec_idx2 = start_idx * fea_dim;

    for (int i = tidx; i < fea_dim; i += stride) {
      float src_grad_grad_in1 = grad_grad_in1[vec_idx1 + i];
      float src_grad_grad_in2 = grad_grad_in2[vec_idx2 + i];
      float src_in1 = in1[vec_idx1 + i];
      float src_in2 = in2[vec_idx2 + i];
      float src_grad_out = grad_out[vec_idx2 + i];
      grad_grad_out[vec_idx2 + i] = src_grad_grad_in1 * src_in2 + src_grad_grad_in2 * src_in1;
      grad_in2[vec_idx2 + i] = src_grad_grad_in1 * src_grad_out;
      gpuAtomicAdd(grad_in1 + vec_idx1 + i, src_grad_grad_in2 * src_grad_out);
    }
  }
}

__global__ void index_mul_2d_grad_grad_half(at::Half* grad_grad_out, at::Half* grad_in1, at::Half* grad_in2,
                                            const at::Half* grad_out, const at::Half* grad_grad_in1,
                                            const at::Half* grad_grad_in2, const at::Half* in1, const at::Half* in2,
                                            const int64_t* idx1, const int64_t size, const int64_t fea_dim) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int bidx = blockIdx.x;
  const int start_idx = bidx * blockDim.y + tidy;
  const int stride = blockDim.x;

  if (start_idx < size) {
    int64_t vec_idx1 = idx1[start_idx] * fea_dim;
    int64_t vec_idx2 = start_idx * fea_dim;

    for (int i = tidx; i < fea_dim; i += stride) {
      float src_grad_grad_in1 = static_cast<float>(grad_grad_in1[vec_idx1 + i]);
      float src_grad_grad_in2 = static_cast<float>(grad_grad_in2[vec_idx2 + i]);
      float src_in1 = static_cast<float>(in1[vec_idx1 + i]);
      float src_in2 = static_cast<float>(in2[vec_idx2 + i]);
      float src_grad_out = static_cast<float>(grad_out[vec_idx2 + i]);
      grad_grad_out[vec_idx2 + i] = at::Half(src_grad_grad_in1 * src_in2 + src_grad_grad_in2 * src_in1);
      grad_in2[vec_idx2 + i] = at::Half(src_grad_grad_in1 * src_grad_out);
      gpuAtomicAdd(grad_in1 + vec_idx1 + i, at::Half(src_grad_grad_in2 * src_grad_out));
    }
  }
}

void index_mul_2d_float_foward_cuda(at::Tensor& out, const at::Tensor& in1, const at::Tensor& in2,
                                    const at::Tensor& idx1) {
  const int64_t size = in2.size(0);
  const int64_t fea_dim = in2.size(1);
  if (size < 0) {
    return;
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (fea_dim == 64) {
    const int BLOCK_THREADS_DIMX = 16;
    const int BLOCK_THREADS_DIMY = 16;
    const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

    index_mul_2d_float_dim64<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
        out.data_ptr<float>(), in1.data_ptr<float>(), in2.data_ptr<float>(), idx1.data_ptr<int64_t>(), size);
  } else {
    const int BLOCK_THREADS_DIMX = 32;
    const int BLOCK_THREADS_DIMY = 8;
    const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

    index_mul_2d_float<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
        out.data_ptr<float>(), in1.data_ptr<float>(), in2.data_ptr<float>(), idx1.data_ptr<int64_t>(), size, fea_dim);
  }

  AT_CUDA_CHECK(cudaGetLastError());
}

void index_mul_2d_float_backward_cuda(at::Tensor& grad_in1, at::Tensor& grad_in2, const at::Tensor& grad_out,
                                      const at::Tensor& in1, const at::Tensor& in2, const at::Tensor& idx1) {
  const int64_t size = in2.size(0);
  const int64_t fea_dim = in2.size(1);
  if (size < 0) {
    return;
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (fea_dim == 64) {
    const int BLOCK_THREADS_DIMX = 16;
    const int BLOCK_THREADS_DIMY = 16;
    const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

    index_mul_2d_grad_float_dim64<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
        grad_in1.data_ptr<float>(), grad_in2.data_ptr<float>(), grad_out.data_ptr<float>(), in1.data_ptr<float>(),
        in2.data_ptr<float>(), idx1.data_ptr<int64_t>(), size);

    AT_CUDA_CHECK(cudaGetLastError());
  } else {
    const int BLOCK_THREADS_DIMX = 32;
    const int BLOCK_THREADS_DIMY = 8;
    const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

    index_mul_2d_grad_float<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
        grad_in1.data_ptr<float>(), grad_in2.data_ptr<float>(), grad_out.data_ptr<float>(), in1.data_ptr<float>(),
        in2.data_ptr<float>(), idx1.data_ptr<int64_t>(), size, fea_dim);
  }
}

void index_mul_2d_float_backward_backward_cuda(at::Tensor& grad_grad_out, at::Tensor& grad_in1, at::Tensor& grad_in2,
                                               const at::Tensor& grad_out, const at::Tensor& grad_grad_in1,
                                               const at::Tensor& grad_grad_in2, const at::Tensor& in1,
                                               const at::Tensor& in2, const at::Tensor& idx1) {
  const int64_t size = in2.size(0);
  const int64_t fea_dim = in2.size(1);
  if (size < 0) {
    return;
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (fea_dim == 64) {
    const int BLOCK_THREADS_DIMX = 16;
    const int BLOCK_THREADS_DIMY = 16;
    const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

    index_mul_2d_grad_grad_float_dim64<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
        grad_grad_out.data_ptr<float>(), grad_in1.data_ptr<float>(), grad_in2.data_ptr<float>(),
        grad_out.data_ptr<float>(), grad_grad_in1.data_ptr<float>(), grad_grad_in2.data_ptr<float>(),
        in1.data_ptr<float>(), in2.data_ptr<float>(), idx1.data_ptr<int64_t>(), size);
  } else {
    const int BLOCK_THREADS_DIMX = 32;
    const int BLOCK_THREADS_DIMY = 8;
    const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

    index_mul_2d_grad_grad_float<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
        grad_grad_out.data_ptr<float>(), grad_in1.data_ptr<float>(), grad_in2.data_ptr<float>(),
        grad_out.data_ptr<float>(), grad_grad_in1.data_ptr<float>(), grad_grad_in2.data_ptr<float>(),
        in1.data_ptr<float>(), in2.data_ptr<float>(), idx1.data_ptr<int64_t>(), size, fea_dim);
  }

  AT_CUDA_CHECK(cudaGetLastError());
}

void index_mul_2d_half_foward_cuda(at::Tensor& out, const at::Tensor& in1, const at::Tensor& in2,
                                   const at::Tensor& idx1) {
  const int64_t size = in2.size(0);
  const int64_t fea_dim = in2.size(1);
  if (size < 0) {
    return;
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int BLOCK_THREADS_DIMX = 32;
  const int BLOCK_THREADS_DIMY = 8;
  const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

  index_mul_2d_half<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
      out.data_ptr<at::Half>(), in1.data_ptr<at::Half>(), in2.data_ptr<at::Half>(), idx1.data_ptr<int64_t>(), size,
      fea_dim);

  AT_CUDA_CHECK(cudaGetLastError());
}

void index_mul_2d_half_backward_cuda(at::Tensor& grad_in1, at::Tensor& grad_in2, const at::Tensor& grad_out,
                                     const at::Tensor& in1, const at::Tensor& in2, const at::Tensor& idx1) {
  const int64_t size = in2.size(0);
  const int64_t fea_dim = in2.size(1);
  if (size < 0) {
    return;
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int BLOCK_THREADS_DIMX = 32;
  const int BLOCK_THREADS_DIMY = 8;
  const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

  index_mul_2d_grad_half<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
      grad_in1.data_ptr<at::Half>(), grad_in2.data_ptr<at::Half>(), grad_out.data_ptr<at::Half>(),
      in1.data_ptr<at::Half>(), in2.data_ptr<at::Half>(), idx1.data_ptr<int64_t>(), size, fea_dim);
}

void index_mul_2d_half_backward_backward_cuda(at::Tensor& grad_grad_out, at::Tensor& grad_in1, at::Tensor& grad_in2,
                                              const at::Tensor& grad_out, const at::Tensor& grad_grad_in1,
                                              const at::Tensor& grad_grad_in2, const at::Tensor& in1,
                                              const at::Tensor& in2, const at::Tensor& idx1) {
  const int64_t size = in2.size(0);
  const int64_t fea_dim = in2.size(1);
  if (size < 0) {
    return;
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int BLOCK_THREADS_DIMX = 32;
  const int BLOCK_THREADS_DIMY = 8;
  const int BLOCK_NUMS = (size + BLOCK_THREADS_DIMY - 1) / BLOCK_THREADS_DIMY;

  index_mul_2d_grad_grad_half<<<BLOCK_NUMS, {BLOCK_THREADS_DIMX, BLOCK_THREADS_DIMY, 1}, 0, stream>>>(
      grad_grad_out.data_ptr<at::Half>(), grad_in1.data_ptr<at::Half>(), grad_in2.data_ptr<at::Half>(),
      grad_out.data_ptr<at::Half>(), grad_grad_in1.data_ptr<at::Half>(), grad_grad_in2.data_ptr<at::Half>(),
      in1.data_ptr<at::Half>(), in2.data_ptr<at::Half>(), idx1.data_ptr<int64_t>(), size, fea_dim);

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: apex/contrib/csrc/layer_norm/ln.h
================================================
#pragma once

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <stdint.h>
#include <stdio.h>

#include <unordered_map>

namespace layer_norm {

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Params>
struct LaunchParams {
  size_t workspace_bytes;
  size_t barrier_size;

  cudaDeviceProp* props;

  cudaStream_t stream;

  Params params;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct FwdParams {
  FwdParams()
      : ctas_per_col(0),
        rows(0),
        cols(0),
        x(nullptr),
        z(nullptr),
        mu(nullptr),
        rs(nullptr),
        gamma(nullptr),
        beta(nullptr),
        workspace(nullptr),
        barrier(nullptr),
        epsilon(0.f) {}

  // For Multi-CTA, number of different CTA groups. Otherwise same as gridDim.x.
  int ctas_per_col;

  // Input is interpreted as matrix. We normalize across columns.
  int rows;
  int cols;

  // Common data pointers.
  void* x;
  void* z;
  void* mu;
  void* rs;
  void* gamma;
  void* beta;

  // Multi-CTA workspace in gmem.
  void* workspace;

  // Multi-CTA sync barriers in gmem.
  int* barrier;

  // Output of LN FWD.
  float epsilon;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct BwdParams : public FwdParams {
  BwdParams()
      : FwdParams(),
        dz(nullptr),
        dbeta_part(nullptr),
        dgamma_part(nullptr),
        dx(nullptr),
        dbeta(nullptr),
        dgamma(nullptr) {}
  // Input: gradient wrt. LN FWD output.
  void* dz;

  // Workspace for Wgrad pre-reduction.
  void* dbeta_part;
  void* dgamma_part;

  // Output: Dgrad.
  void* dx;
  // Output: Wgrad.
  void* dbeta;
  void* dgamma;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

using FwdFunction = std::function<void(LaunchParams<FwdParams>&, const bool)>;
using BwdFunction = std::function<void(LaunchParams<BwdParams>&, const bool)>;
using FunctionKey = uint64_t;
using FwdRegistry = std::unordered_map<FunctionKey, FwdFunction>;
using BwdRegistry = std::unordered_map<FunctionKey, BwdFunction>;

extern FwdRegistry FWD_FUNCS;
extern BwdRegistry BWD_FUNCS;

////////////////////////////////////////////////////////////////////////////////////////////////////

using fp32 = float;
using fp16 = half;
using bf16 = nv_bfloat16;

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct TypeId {};

template <>
struct TypeId<fp16> {
  constexpr static uint32_t Value = 0;
};

template <>
struct TypeId<bf16> {
  constexpr static uint32_t Value = 1;
};

template <>
struct TypeId<fp32> {
  constexpr static uint32_t Value = 2;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T, int S>
struct Type2Key {
  constexpr static uint32_t Value = TypeId<T>::Value << S;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct WeightType2Key : public Type2Key<T, 0> {};

template <typename T>
struct InputType2Key : public Type2Key<T, 2> {};

template <typename T>
struct OutputType2Key : public Type2Key<T, 4> {};

template <typename T>
struct ComputeType2Key : public Type2Key<T, 6> {};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename W, typename I, typename O, typename C>
struct Types2Key {
  constexpr static uint32_t Value =
      WeightType2Key<W>::Value | InputType2Key<I>::Value | OutputType2Key<O>::Value | ComputeType2Key<C>::Value;
  constexpr static inline uint64_t get(const uint64_t hidden_size) {
    constexpr uint64_t type_key = Value;
    return (type_key << 32) | hidden_size;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename W, typename I, typename O, typename C, uint64_t HIDDEN_SIZE>
struct FwdRegistrar {
  FwdRegistrar(FwdFunction f) {
    uint64_t key = Types2Key<W, I, O, C>::get(HIDDEN_SIZE);
    FWD_FUNCS.insert({key, f});
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename W, typename I, typename O, typename C, uint64_t HIDDEN_SIZE>
struct BwdRegistrar {
  BwdRegistrar(BwdFunction f) {
    uint64_t key = Types2Key<W, I, O, C>::get(HIDDEN_SIZE);
    BWD_FUNCS.insert({key, f});
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace layer_norm


================================================
FILE: apex/contrib/csrc/layer_norm/ln_api.cpp
================================================
#include <torch/extension.h>

#include "ATen/cuda/CUDAContext.h"
#include "ln.h"

/*

Supported Type combinations:

input    compute   weights   output
=======================================
fp32     fp32      fp32      fp32
fp16     fp32      fp16      fp16
bf16     fp32      bf16      bf16
fp32     fp32      fp16      fp16
fp32     fp32      bf16      bf16

Remarks:
Output type = Weight type
Compute always in FP32

*/

namespace layer_norm {

// Create registries and provide runtime versions of config hash functions.

FwdRegistry FWD_FUNCS;
BwdRegistry BWD_FUNCS;

////////////////////////////////////////////////////////////////////////////////////////////////////

uint32_t get_type_id(torch::Dtype dtype) {
  if (dtype == torch::kFloat16) {
    return TypeId<fp16>::Value;
  } else if (dtype == torch::kBFloat16) {
    return TypeId<bf16>::Value;
  } else if (dtype == torch::kFloat32) {
    return TypeId<fp32>::Value;
  } else {
    TORCH_CHECK(false, "Type not supported: ", dtype);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

uint64_t get_key(torch::Dtype wtype, torch::Dtype itype, torch::Dtype otype, torch::Dtype ctype, uint64_t hidden_size) {
  using namespace layer_norm;
  uint64_t type_key =
      get_type_id(wtype) | (get_type_id(itype) << 2) | (get_type_id(otype) << 4) | (get_type_id(ctype) << 6);
  uint64_t launcher_key = (type_key << 32) | hidden_size;
  return launcher_key;
}

}  // namespace layer_norm

////////////////////////////////////////////////////////////////////////////////////////////////////

layer_norm::FwdFunction& get_fwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype otype,
                                          torch::Dtype ctype, uint32_t hidden_size) {
  auto iter = layer_norm::FWD_FUNCS.find(layer_norm::get_key(wtype, itype, otype, ctype, hidden_size));
  if (iter != layer_norm::FWD_FUNCS.end()) {
    return iter->second;
  } else {
    TORCH_CHECK(false, "FWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, otype, ctype);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

layer_norm::BwdFunction& get_bwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype otype,
                                          torch::Dtype ctype, uint32_t hidden_size) {
  auto iter = layer_norm::BWD_FUNCS.find(layer_norm::get_key(wtype, itype, otype, ctype, hidden_size));
  if (iter != layer_norm::BWD_FUNCS.end()) {
    return iter->second;
  } else {
    TORCH_CHECK(false, "BWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, otype, ctype);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

std::vector<at::Tensor> ln_fwd(const at::Tensor& x,      // BxSxhidden_size
                               const at::Tensor& gamma,  // hidden_size
                               const at::Tensor& beta,   // hidden_size
                               const float epsilon) {
  auto itype = x.scalar_type();
  auto wtype = gamma.scalar_type();
  auto otype = wtype;
  auto ctype = torch::kFloat32;

  TORCH_CHECK(beta.scalar_type() == wtype);

  TORCH_CHECK(x.is_cuda())
  TORCH_CHECK(gamma.is_cuda())
  TORCH_CHECK(beta.is_cuda())

  TORCH_CHECK(x.is_contiguous());
  auto sizes = x.sizes();
  TORCH_CHECK(sizes.size() == 2);

  const int rows = sizes[0];
  const int cols = sizes[1];
  auto hidden_size = gamma.numel();

  TORCH_CHECK(gamma.sizes() == beta.sizes());
  TORCH_CHECK(hidden_size == cols);

  TORCH_CHECK(epsilon >= 0.f);

  auto opts = x.options();

  auto z = torch::empty(sizes, opts.dtype(otype));

  auto mu = torch::empty({rows}, opts.dtype(ctype));
  auto rsigma = torch::empty({rows}, opts.dtype(ctype));

  layer_norm::LaunchParams<layer_norm::FwdParams> launch_params;

  launch_params.props = at::cuda::getCurrentDeviceProperties();
  launch_params.stream = at::cuda::getCurrentCUDAStream().stream();

  // Request the kernel launcher.
  auto launcher = get_fwd_launcher(wtype, itype, otype, ctype, hidden_size);

  // Query the kernel-specific launch parameters.
  launcher(launch_params, true);

  at::Tensor workspace, barrier;

  // Set the kernel runtime parameters.
  layer_norm::FwdParams& params = launch_params.params;
  params.rows = rows;
  params.cols = cols;
  params.z = z.data_ptr();
  params.mu = mu.data_ptr();
  params.rs = rsigma.data_ptr();
  params.gamma = gamma.data_ptr();
  params.beta = beta.data_ptr();
  params.x = x.data_ptr();
  params.epsilon = epsilon;

  if (launch_params.barrier_size > 0) {
    auto options = x.options();
    barrier = torch::zeros(launch_params.barrier_size, options.dtype(torch::kInt32));
    workspace = torch::empty(launch_params.workspace_bytes, options.dtype(torch::kChar));
    params.workspace = workspace.data_ptr();
    params.barrier = barrier.data_ptr<int>();
  }

  // Launch the kernel.
  launcher(launch_params, false);

  return {z, mu, rsigma};
}

////////////////////////////////////////////////////////////////////////////////////////////////////
std::vector<at::Tensor> ln_bwd(const at::Tensor& dz,                    // BxSxhidden_size
                               const at::Tensor& x_or_z,                // BxSxhidden_size
                               c10::optional<const at::Tensor>& mu_,    // BxS, FP32!
                               const at::Tensor& rsigma,                // BxS, FP32!
                               const at::Tensor& gamma,                 // hidden_size
                               c10::optional<const at::Tensor>& beta_,  // hidden_size
                               bool memory_efficient) {
  auto itype = x_or_z.scalar_type();
  auto wtype = gamma.scalar_type();
  auto otype = wtype;
  auto ctype = torch::kFloat32;

  TORCH_CHECK(dz.dtype() == otype);
  TORCH_CHECK(rsigma.dtype() == ctype);
  if (mu_.has_value()) {
    TORCH_CHECK(mu_.value().dtype() == ctype);
  }

  TORCH_CHECK(x_or_z.is_cuda());
  TORCH_CHECK(dz.is_cuda());
  TORCH_CHECK(rsigma.is_cuda());
  TORCH_CHECK(gamma.is_cuda());
  if (beta_.has_value()) {
    TORCH_CHECK(beta_.value().is_cuda());
    TORCH_CHECK(beta_.value().dtype() == wtype);
  }

  TORCH_CHECK(x_or_z.is_contiguous());
  TORCH_CHECK(dz.is_contiguous());

  auto sizes = x_or_z.sizes();
  TORCH_CHECK(sizes.size() == 2);
  TORCH_CHECK(dz.sizes() == sizes);
  auto rows = sizes[0];
  auto cols = sizes[1];

  auto hidden_size = gamma.numel();

  TORCH_CHECK(gamma.numel() == cols);
  if (beta_.has_value()) {
    TORCH_CHECK(beta_.value().numel() == cols);
  }

  auto options = x_or_z.options();

  auto dx = torch::empty_like(x_or_z);
  auto dgamma = torch::empty_like(gamma);
  auto dbeta = torch::empty_like(gamma);

  layer_norm::LaunchParams<layer_norm::BwdParams> launch_params;
  launch_params.stream = at::cuda::getCurrentCUDAStream().stream();
  launch_params.props = at::cuda::getCurrentDeviceProperties();

  auto launcher = get_bwd_launcher(wtype, itype, otype, ctype, hidden_size);

  launcher(launch_params, true);

  auto dgamma_part = torch::empty({launch_params.params.ctas_per_col, hidden_size}, options.dtype(ctype));
  auto dbeta_part = torch::empty({launch_params.params.ctas_per_col, hidden_size}, options.dtype(ctype));
  at::Tensor workspace, barrier;

  layer_norm::BwdParams& params = launch_params.params;
  params.rows = rows;
  params.cols = cols;
  if (memory_efficient) {
    params.z = x_or_z.data_ptr();
    params.beta = beta_.value().data_ptr();
  } else {
    params.x = x_or_z.data_ptr();
    params.mu = mu_.value().data_ptr();
  }
  params.rs = rsigma.data_ptr();
  params.gamma = gamma.data_ptr();
  params.dz = dz.data_ptr();
  params.dx = dx.data_ptr();
  params.dbeta = dbeta.data_ptr();
  params.dgamma = dgamma.data_ptr();
  params.dbeta_part = dbeta_part.data_ptr();
  params.dgamma_part = dgamma_part.data_ptr();

  if (launch_params.barrier_size > 0) {
    // TODO Any way to avoid this?
    barrier = torch::zeros(launch_params.barrier_size, options.dtype(torch::kInt32));
    workspace = torch::empty(launch_params.workspace_bytes, options.dtype(torch::kChar));
    params.workspace = workspace.data_ptr();
    params.barrier = barrier.data_ptr<int>();
  }

  launcher(launch_params, false);

  return {dx, dgamma, dbeta, dgamma_part, dbeta_part};
}

////////////////////////////////////////////////////////////////////////////////////////////////////

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.doc() = "CUDA LayerNorm";
  m.def("ln_fwd", &ln_fwd, "Run LayerNorm forward kernel", py::call_guard<py::gil_scoped_release>());
  m.def("ln_bwd", &ln_bwd, "Run LayerNorm backward kernel", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/layer_norm/ln_bwd_kernels.cuh
================================================
#pragma once

#include "ln_utils.cuh"

namespace layer_norm {

template <typename Ktraits>
__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) void ln_bwd_kernel(layer_norm::BwdParams params) {
  enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA };
  enum { WARPS_M = Ktraits::WARPS_M };
  enum { WARPS_N = Ktraits::WARPS_N };
  enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW };
  enum { COLS = Ktraits::COLS };
  enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW };
  enum { LDGS = Ktraits::LDGS };
  enum { NUM_ELTS = Ktraits::ELTS_PER_LDG };
  enum { THREADS_PER_WARP = Ktraits::THREADS_PER_WARP };
  enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW };

  using compute_t = typename Ktraits::compute_t;
  using index_t = typename Ktraits::index_t;
  using Ivec = typename Ktraits::Ivec;
  using Ovec = typename Ktraits::Ovec;
  using Wvec = typename Ktraits::Wvec;
  using Cvec = typename Ktraits::Cvec;
  using Reducer = typename Ktraits::Reducer;
  using reduce_t = typename Reducer::Type;

  extern __shared__ char smem_[];

  const index_t tidx = threadIdx.x;
  const index_t bidn = blockIdx.x % CTAS_PER_ROW;
  const index_t bidm = blockIdx.x / CTAS_PER_ROW;
  const index_t lane = tidx % THREADS_PER_WARP;
  const index_t warp = tidx / THREADS_PER_WARP;
  const index_t warp_m = warp / Ktraits::WARPS_N;
  const index_t warp_n = warp % Ktraits::WARPS_N;
  const index_t tid_r = warp_n * THREADS_PER_WARP + lane;

  const index_t r = bidm * Ktraits::ROWS_PER_CTA + warp_m;
  const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane;

  static_assert(COLS == THREADS_PER_ROW * LDGS * NUM_ELTS * CTAS_PER_ROW);

  Cvec dzy_sum[LDGS];
  Cvec dz_sum[LDGS];

  memset(dzy_sum, 0, sizeof(dzy_sum));
  memset(dz_sum, 0, sizeof(dz_sum));

  compute_t* smem_wgrad = reinterpret_cast<compute_t*>(smem_);
  char* smem_dgrad = smem_ + Ktraits::SMEM_BYTES_WGRAD;

  Reducer reducer(params, bidm, bidn, warp_m, warp_n, lane, smem_dgrad);

  Sum<reduce_t> sum;

  constexpr float rn = 1.f / float(COLS);
  Wvec gamma[LDGS];
  Wvec beta[LDGS];
  index_t idx = c;
#pragma unroll
  for (int it = 0; it < LDGS; it++) {
    gamma[it].load_from(params.gamma, idx);
    if (params.z != nullptr) {
      beta[it].load_from(params.beta, idx);
    }
    idx += Ktraits::VEC_COLS_PER_LDG;
  }
// TODO if ROWS_PER_CTA does not divide rows, we might get divergence in the
// last blocks with syncthreads!
// grid stride over rows
#pragma unroll 1
  for (int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA) {
    const compute_t mu_r = params.z == nullptr ? static_cast<const compute_t*>(params.mu)[row] : 0.f;
    const compute_t rs_r = static_cast<const compute_t*>(params.rs)[row];
    Ivec x_or_z[LDGS];
    Ovec dz[LDGS];
    index_t idx = row * Ktraits::VEC_COLS + c;
#pragma unroll
    for (int it = 0; it < LDGS; it++) {
      dz[it].load_from(params.dz, idx);
      if (params.z != nullptr) {
        x_or_z[it].load_from(params.z, idx);
      } else {
        x_or_z[it].load_from(params.x, idx);
      }
      idx += Ktraits::VEC_COLS_PER_LDG;
    }

    compute_t dy[LDGS * NUM_ELTS];
    compute_t y[LDGS * NUM_ELTS];

    compute_t mdy_local = 0.f;
    compute_t mdyy_local = 0.f;
#pragma unroll
    for (int it = 0; it < LDGS; it++) {
#pragma unroll
      for (int jt = 0; jt < NUM_ELTS; jt++) {
        compute_t gamma_tmp = compute_t(gamma[it].data.elt[jt]);
        compute_t beta_tmp = compute_t(beta[it].data.elt[jt]);
        compute_t x_or_z_tmp = compute_t(x_or_z[it].data.elt[jt]);
        compute_t y_tmp = params.z != nullptr ? (x_or_z_tmp - beta_tmp) / gamma_tmp : rs_r * (x_or_z_tmp - mu_r);
        compute_t dy_tmp = compute_t(dz[it].data.elt[jt]) * gamma_tmp;
        compute_t dz_tmp = dz[it].data.elt[jt];

        mdy_local += dy_tmp;
        mdyy_local += dy_tmp * y_tmp;

        dy[it * NUM_ELTS + jt] = dy_tmp;
        y[it * NUM_ELTS + jt] = y_tmp;

        dzy_sum[it].data.elt[jt] += dz_tmp * y_tmp;
        dz_sum[it].data.elt[jt] += dz_tmp;
      }
    }

    reduce_t result = reducer.allreduce({mdy_local, mdyy_local}, sum);
    mdy_local = layer_norm::Get<0>::of<reduce_t, compute_t>(result) * rn;
    mdyy_local = layer_norm::Get<1>::of<reduce_t, compute_t>(result) * rn;

    Ivec dx[LDGS];
    idx = row * Ktraits::VEC_COLS + c;
#pragma unroll
    for (int it = 0; it < LDGS; it++) {
#pragma unroll
      for (int jt = 0; jt < NUM_ELTS; jt++) {
        compute_t dy_tmp = dy[it * NUM_ELTS + jt];
        compute_t y_tmp = y[it * NUM_ELTS + jt];
        compute_t dx_tmp = rs_r * (dy_tmp - (mdyy_local * y_tmp + mdy_local));
        dx[it].data.elt[jt] = dx_tmp;
      }
      dx[it].store_to(params.dx, idx);
      idx += Ktraits::VEC_COLS_PER_LDG;
    }

  }  // end: grid stride loop

  if (WARPS_M == 1) {
    idx = r * Ktraits::VEC_COLS + c;
#pragma unroll
    for (int it = 0; it < LDGS; it++) {
      dz_sum[it].store_to(params.dbeta_part, idx);
      dzy_sum[it].store_to(params.dgamma_part, idx);
      idx += Ktraits::VEC_COLS_PER_LDG;
    }
  } else {
    static_assert(WARPS_M == 1 || Ktraits::CTAS_PER_ROW == 1, "Multiple rows per CTA not supported for Multi-CTA.");
    // Finalize reduction of part dgamma and dbeta for this CTA
    // by reducing over the rows held across the WARPS_M warps

    // Assumption: blockSize divides hidden size.
    enum { NUM_RES = COLS / Ktraits::THREADS_PER_CTA };
    static_assert(NUM_RES * Ktraits::THREADS_PER_CTA == COLS, "");

    idx = warp_m * Ktraits::VEC_COLS + tid_r;
#pragma unroll
    for (int it = 0; it < LDGS; it++) {
      dz_sum[it].store_to(smem_wgrad, idx);
      idx += THREADS_PER_ROW;
    }
    __syncthreads();
    compute_t cta_dz_sum[NUM_RES];
    memset(cta_dz_sum, 0, sizeof(compute_t) * NUM_RES);
    for (int it = 0; it < ROWS_PER_CTA; it++) {
      for (int jt = 0; jt < NUM_RES; jt++) {
        cta_dz_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA];
      }
    }
    __syncthreads();

    idx = warp_m * Ktraits::VEC_COLS + tid_r;
#pragma unroll
    for (int it = 0; it < LDGS; it++) {
      dzy_sum[it].store_to(smem_wgrad, idx);
      idx += THREADS_PER_ROW;
    }
    __syncthreads();
    compute_t cta_dzy_sum[NUM_RES];
    memset(cta_dzy_sum, 0, sizeof(compute_t) * NUM_RES);
    for (int it = 0; it < ROWS_PER_CTA; it++) {
      for (int jt = 0; jt < NUM_RES; jt++) {
        cta_dzy_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA];
      }
    }

    compute_t* dgamma_part = static_cast<compute_t*>(params.dgamma_part) + bidm * COLS + tidx;
    for (int jt = 0; jt < NUM_RES; jt++) {
      *dgamma_part = cta_dzy_sum[jt];
      dgamma_part += Ktraits::THREADS_PER_CTA;
    }

    compute_t* dbeta_part = static_cast<compute_t*>(params.dbeta_part) + bidm * COLS + tidx;
    for (int jt = 0; jt < NUM_RES; jt++) {
      *dbeta_part = cta_dz_sum[jt];
      dbeta_part += Ktraits::THREADS_PER_CTA;
    }
  }
}

template <typename Kernel_traits>
__global__ __launch_bounds__(Kernel_traits::THREADS_PER_CTA) void ln_bwd_finalize_kernel(BwdParams params) {
  using compute_t = typename Kernel_traits::compute_t;
  using weight_t = typename Kernel_traits::weight_t;
  using index_t = typename Kernel_traits::index_t;
  using Reducer = typename Kernel_traits::Reducer;
  using reduce_t = typename Reducer::Type;

  Sum<reduce_t> sum;
  enum { NUM_ELT = Kernel_traits::ELTS_PER_LDG };
  enum { THREADS_PER_WARP = Kernel_traits::THREADS_PER_WARP };

  __shared__ char smem_[Kernel_traits::SMEM_BYTES_PER_CTA];

  constexpr uint32_t bidm = 0;

  const uint32_t bidn = blockIdx.x;
  const uint32_t tidx = threadIdx.x;
  const uint32_t warp = tidx / THREADS_PER_WARP;
  const uint32_t lane = tidx % THREADS_PER_WARP;

  Reducer reducer(params, bidm, bidn, 0, 0, lane, smem_);

  const uint32_t c = bidn * THREADS_PER_WARP + lane;
  const uint32_t c_out = bidn * THREADS_PER_WARP / 2 + lane;
  constexpr uint32_t COL_STRIDE = Kernel_traits::CTAS * THREADS_PER_WARP;
  for (uint32_t col = c, col_out = c_out; col < Kernel_traits::COLS; col += COL_STRIDE, col_out += COL_STRIDE / 2) {
    // Each thread sums over NUM_ELT columns.
    Vec<compute_t, NUM_ELT> dbeta_local, dgamma_local;
    memset(&dgamma_local, 0, sizeof(dgamma_local));
    memset(&dbeta_local, 0, sizeof(dbeta_local));
    for (uint32_t row = warp; row < params.ctas_per_col; row += Kernel_traits::ROWS_PER_CTA) {
      index_t idx = row * Kernel_traits::COLS + col;

      Vec<compute_t, NUM_ELT> dbeta_part, dgamma_part;
      dbeta_part.load_from(params.dbeta_part, idx);
      dgamma_part.load_from(params.dgamma_part, idx);
#pragma unroll
      for (int it = 0; it < NUM_ELT; it++) {
        dgamma_local.data.elt[it] += dgamma_part.data.elt[it];
        dbeta_local.data.elt[it] += dbeta_part.data.elt[it];
      }
    }

    void* smem_gamma = smem_;
    void* smem_beta = &smem_[Kernel_traits::SMEM_BYTES_TRANSPOSE];

    const int write_row = warp;
    const int write_col = lane ^ write_row;
    const int write_idx = write_row * THREADS_PER_WARP + write_col;

    dgamma_local.store_to(smem_gamma, write_idx);
    dbeta_local.store_to(smem_beta, write_idx);

    __syncthreads();

    // It would be probably safe to reuse the first row of smem_beta and smem_gamma
    void* smem_gamma_out = &smem_[2 * Kernel_traits::SMEM_BYTES_TRANSPOSE];
    void* smem_beta_out = &smem_[2 * Kernel_traits::SMEM_BYTES_TRANSPOSE + Kernel_traits::SMEM_BYTES_OUTPUT];

    // More than one iter iff ROWS_PER_CTA < 32.
    for (int w = warp; w < THREADS_PER_WARP; w += Kernel_traits::ROWS_PER_CTA) {
      const int read_row = lane;
      const int read_col = w ^ read_row;
      const int read_idx = read_row * THREADS_PER_WARP + read_col;

      memset(&dbeta_local, 0, sizeof(dbeta_local));
      memset(&dgamma_local, 0, sizeof(dgamma_local));

      // Load beta and gamma transposed
      if (read_row < Kernel_traits::ROWS_PER_CTA) {
        dbeta_local.load_from(smem_beta, read_idx);
        dgamma_local.load_from(smem_gamma, read_idx);
      }

// Call reducer on the loaded value(s) and convert.
#pragma unroll
      for (int it = 0; it < NUM_ELT; it++) {
        compute_t b_i = dbeta_local.data.elt[it];
        compute_t g_i = dgamma_local.data.elt[it];
        b_i = reducer.allreduce(b_i, sum);
        g_i = reducer.allreduce(g_i, sum);

        dgamma_local.data.elt[it] = g_i;
        dbeta_local.data.elt[it] = b_i;
      }

      // Leader stores the result at the current column.
      if (lane == 0) {
        dgamma_local.store_to(smem_gamma_out, w);
        dbeta_local.store_to(smem_beta_out, w);
      }
    }

    // All writes done.
    __syncthreads();

    // Pack and store: 2-wide stores with half the threads.
    if (warp == Kernel_traits::ROWS_PER_CTA - 1 && lane < THREADS_PER_WARP / 2) {
      using src_t = typename TypeToVec2<compute_t>::Type;
      using dst_t = typename TypeToVec2<weight_t>::Type;
      Vec<src_t, NUM_ELT> dbeta_vec2, dgamma_vec2;
      Vec<dst_t, NUM_ELT> dbeta_out2, dgamma_out2;

      dgamma_vec2.load_from(smem_gamma_out, lane);
      dbeta_vec2.load_from(smem_beta_out, lane);
#pragma unroll
      for (int it = 0; it < NUM_ELT; it++) {
        dgamma_out2.data.elt[it] = Converter<src_t, dst_t>::convert(dgamma_vec2.data.elt[it]);
        dbeta_out2.data.elt[it] = Converter<src_t, dst_t>::convert(dbeta_vec2.data.elt[it]);
      }
      dgamma_out2.store_to(params.dgamma, col_out);
      dbeta_out2.store_to(params.dbeta, col_out);
    }
  }
}
}  // namespace layer_norm


================================================
FILE: apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu
================================================
#include "ln.h"
#include "ln_bwd_kernels.cuh"
#include "ln_kernel_traits.h"
#include "ln_utils.cuh"

using namespace layer_norm;

template <typename weight_t, typename input_t, typename output_t, typename compute_t, typename index_t, int HIDDEN_SIZE,
          int CTAS_PER_ROW, int WARPS_M, int WARPS_N, int BYTES_PER_LDG_MAIN, int BYTES_PER_LDG_FINAL>
void launch_(LaunchParams<BwdParams>& launch_params, const bool configure_params) {
  using Kernel_traits = Kernel_traits<weight_t, input_t, output_t, compute_t, index_t, HIDDEN_SIZE, CTAS_PER_ROW,
                                      WARPS_M, WARPS_N, BYTES_PER_LDG_MAIN>;
  auto kernel = &ln_bwd_kernel<Kernel_traits>;

  if (configure_params) {
    int ctas_per_sm;
    cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES);
    launch_params.params.ctas_per_col =
        launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW;
    launch_params.barrier_size = 0;
    launch_params.workspace_bytes = 0;
    if (Kernel_traits::CTAS_PER_ROW > 1) {
      launch_params.barrier_size = 2 * launch_params.params.ctas_per_col;
      launch_params.workspace_bytes = launch_params.params.ctas_per_col * Kernel_traits::WARPS_M *
                                      Kernel_traits::CTAS_PER_ROW * sizeof(typename Kernel_traits::reduce_t) * 2;
    }
    return;
  }

  if (Kernel_traits::SMEM_BYTES >= 48 * 1024) {
    CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES));
  }
  auto stream = launch_params.stream;
  auto ctas_per_col = launch_params.params.ctas_per_col;

  if (Kernel_traits::CTAS_PER_ROW == 1) {
    kernel<<<ctas_per_col, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES, stream>>>(launch_params.params);
  } else {
    dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col);
    dim3 block(Kernel_traits::THREADS_PER_CTA);
    void* params_ = (void*)&launch_params.params;
    cudaLaunchCooperativeKernel((void*)kernel, grid, block, (void**)&params_, Kernel_traits::SMEM_BYTES, stream);
  }

  using Kernel_traits_f =
      layer_norm::Kernel_traits_finalize<HIDDEN_SIZE, weight_t, input_t, output_t, compute_t, index_t,
                                         32 * 32,  // THREADS_PER_CTA
                                         BYTES_PER_LDG_FINAL>;

  auto kernel_f = &layer_norm::ln_bwd_finalize_kernel<Kernel_traits_f>;
  kernel_f<<<Kernel_traits_f::CTAS, Kernel_traits_f::THREADS_PER_CTA, 0, stream>>>(launch_params.params);
}

// Create backward launch function and register. Macro signature:
//  HIDDEN_SIZE, WTYPE, ITYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL

REGISTER_BWD_LAUNCHER(768, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
REGISTER_BWD_LAUNCHER(768, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
REGISTER_BWD_LAUNCHER(768, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
REGISTER_BWD_LAUNCHER(768, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
REGISTER_BWD_LAUNCHER(768, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);

REGISTER_BWD_LAUNCHER(1024, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
REGISTER_BWD_LAUNCHER(1024, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
REGISTER_BWD_LAUNCHER(1024, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
REGISTER_BWD_LAUNCHER(1024, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
REGISTER_BWD_LAUNCHER(1024, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);

REGISTER_BWD_LAUNCHER(1536, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(1536, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(1536, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(1536, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(1536, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(2048, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(2048, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(2048, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(2048, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(2048, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(2304, fp32, fp32, fp32, fp32, 1, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(2304, fp16, fp16, fp16, fp32, 1, 1, 4, 4, 4);
REGISTER_BWD_LAUNCHER(2304, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(2304, bf16, bf16, bf16, fp32, 1, 1, 4, 4, 4);
REGISTER_BWD_LAUNCHER(2304, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4);

REGISTER_BWD_LAUNCHER(3072, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(3072, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(3072, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(3072, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(3072, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(3840, fp32, fp32, fp32, fp32, 1, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(3840, fp16, fp16, fp16, fp32, 1, 1, 4, 4, 4);
REGISTER_BWD_LAUNCHER(3840, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(3840, bf16, bf16, bf16, fp32, 1, 1, 4, 4, 4);
REGISTER_BWD_LAUNCHER(3840, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4);

REGISTER_BWD_LAUNCHER(4096, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(4096, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(4096, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(4096, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(4096, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(5120, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(5120, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(5120, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(5120, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(5120, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(6144, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(6144, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(6144, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(6144, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(6144, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);

REGISTER_BWD_LAUNCHER(8192, fp32, fp32, fp32, fp32, 2, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(8192, fp16, fp16, fp16, fp32, 2, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(8192, fp16, fp32, fp16, fp32, 2, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(8192, bf16, bf16, bf16, fp32, 2, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(8192, bf16, fp32, bf16, fp32, 2, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(10240, fp32, fp32, fp32, fp32, 2, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(10240, fp16, fp16, fp16, fp32, 2, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(10240, fp16, fp32, fp16, fp32, 2, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(10240, bf16, bf16, bf16, fp32, 2, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(10240, bf16, fp32, bf16, fp32, 2, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(12288, fp32, fp32, fp32, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(12288, fp16, fp16, fp16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(12288, fp16, fp32, fp16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(12288, bf16, bf16, bf16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(12288, bf16, fp32, bf16, fp32, 4, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(12800, fp32, fp32, fp32, fp32, 5, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(12800, fp16, fp16, fp16, fp32, 5, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(12800, fp16, fp32, fp16, fp32, 5, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(12800, bf16, bf16, bf16, fp32, 5, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(12800, bf16, fp32, bf16, fp32, 5, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(14336, fp32, fp32, fp32, fp32, 4, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(14336, fp16, fp16, fp16, fp32, 4, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(14336, fp16, fp32, fp16, fp32, 4, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(14336, bf16, bf16, bf16, fp32, 4, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(14336, bf16, fp32, bf16, fp32, 4, 1, 4, 8, 4);

REGISTER_BWD_LAUNCHER(15360, fp32, fp32, fp32, fp32, 4, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(15360, fp16, fp16, fp16, fp32, 4, 1, 4, 4, 4);
REGISTER_BWD_LAUNCHER(15360, fp16, fp32, fp16, fp32, 4, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(15360, bf16, bf16, bf16, fp32, 4, 1, 4, 4, 4);
REGISTER_BWD_LAUNCHER(15360, bf16, fp32, bf16, fp32, 4, 1, 4, 8, 4);

REGISTER_BWD_LAUNCHER(16384, fp32, fp32, fp32, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(16384, fp16, fp16, fp16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(16384, fp16, fp32, fp16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(16384, bf16, bf16, bf16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(16384, bf16, fp32, bf16, fp32, 4, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(18432, fp32, fp32, fp32, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(18432, fp16, fp16, fp16, fp32, 4, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(18432, fp16, fp32, fp16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(18432, bf16, bf16, bf16, fp32, 4, 1, 4, 8, 4);
REGISTER_BWD_LAUNCHER(18432, bf16, fp32, bf16, fp32, 4, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(20480, fp32, fp32, fp32, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(20480, fp16, fp16, fp16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(20480, fp16, fp32, fp16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(20480, bf16, bf16, bf16, fp32, 4, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(20480, bf16, fp32, bf16, fp32, 4, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(24576, fp32, fp32, fp32, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(24576, fp16, fp16, fp16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(24576, fp16, fp32, fp16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(24576, bf16, bf16, bf16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(24576, bf16, fp32, bf16, fp32, 4, 1, 8, 16, 4);

REGISTER_BWD_LAUNCHER(25600, fp32, fp32, fp32, fp32, 5, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(25600, fp16, fp16, fp16, fp32, 5, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(25600, fp16, fp32, fp16, fp32, 5, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(25600, bf16, bf16, bf16, fp32, 5, 1, 4, 16, 4);
REGISTER_BWD_LAUNCHER(25600, bf16, fp32, bf16, fp32, 5, 1, 4, 16, 4);

REGISTER_BWD_LAUNCHER(30720, fp32, fp32, fp32, fp32, 4, 1, 8, 8, 4);
REGISTER_BWD_LAUNCHER(30720, fp16, fp16, fp16, fp32, 4, 1, 8, 4, 4);
REGISTER_BWD_LAUNCHER(30720, fp16, fp32, fp16, fp32, 4, 1, 8, 8, 4);
REGISTER_BWD_LAUNCHER(30720, bf16, bf16, bf16, fp32, 4, 1, 8, 4, 4);
REGISTER_BWD_LAUNCHER(30720, bf16, fp32, bf16, fp32, 4, 1, 8, 8, 4);

REGISTER_BWD_LAUNCHER(32768, fp32, fp32, fp32, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(32768, fp16, fp16, fp16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(32768, fp16, fp32, fp16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(32768, bf16, bf16, bf16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(32768, bf16, fp32, bf16, fp32, 4, 1, 8, 16, 4);

REGISTER_BWD_LAUNCHER(40960, fp32, fp32, fp32, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(40960, fp16, fp16, fp16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(40960, fp16, fp32, fp16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(40960, bf16, bf16, bf16, fp32, 4, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(40960, bf16, fp32, bf16, fp32, 4, 1, 8, 16, 4);

REGISTER_BWD_LAUNCHER(49152, fp32, fp32, fp32, fp32, 8, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(49152, fp16, fp16, fp16, fp32, 8, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(49152, fp16, fp32, fp16, fp32, 8, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(49152, bf16, bf16, bf16, fp32, 8, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(49152, bf16, fp32, bf16, fp32, 8, 1, 8, 16, 4);

REGISTER_BWD_LAUNCHER(65536, fp32, fp32, fp32, fp32, 8, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 8, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 8, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 8, 1, 8, 16, 4);
REGISTER_BWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 8, 1, 8, 16, 4);


================================================
FILE: apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu
================================================
#include "ln.h"
#include "ln_fwd_kernels.cuh"
#include "ln_kernel_traits.h"
#include "ln_utils.cuh"

using namespace layer_norm;

template <typename weight_t, typename input_t, typename output_t, typename compute_t, typename index_t, int HIDDEN_SIZE,
          int CTAS_PER_ROW, int WARPS_M, int WARPS_N, int BYTES_PER_LDG>
void launch_(LaunchParams<FwdParams>& launch_params, const bool configure_params) {
  using Kernel_traits = Kernel_traits<weight_t, input_t, output_t, compute_t, index_t, HIDDEN_SIZE, CTAS_PER_ROW,
                                      WARPS_M, WARPS_N, BYTES_PER_LDG>;
  auto kernel = &ln_fwd_kernel<Kernel_traits>;

  if (configure_params) {
    int ctas_per_sm;
    cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES_FWD);
    launch_params.params.ctas_per_col =
        launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW;
    launch_params.barrier_size = 0;
    launch_params.workspace_bytes = 0;
    if (Kernel_traits::CTAS_PER_ROW > 1) {
      launch_params.barrier_size = 2 * launch_params.params.ctas_per_col;
      launch_params.workspace_bytes = launch_params.params.ctas_per_col * Kernel_traits::WARPS_M *
                                      Kernel_traits::CTAS_PER_ROW * sizeof(typename Kernel_traits::Stats::stats_t) * 2;
    }
    return;
  }

  if (Kernel_traits::SMEM_BYTES_FWD >= 48 * 1024) {
    CHECK_CUDA(
        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES_FWD));
  }
  auto stream = launch_params.stream;
  auto ctas_per_col = launch_params.params.ctas_per_col;

  if (Kernel_traits::CTAS_PER_ROW == 1) {
    kernel<<<ctas_per_col, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES_FWD, stream>>>(
        launch_params.params);
  } else {
    dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col);
    dim3 block(Kernel_traits::THREADS_PER_CTA);
    void* params_ = (void*)&launch_params.params;
    cudaLaunchCooperativeKernel((void*)kernel, grid, block, (void**)&params_, Kernel_traits::SMEM_BYTES_FWD, stream);
  }
}

// Create forward launch function and register. Macro signature:
//  HIDDEN_SIZE, WTYPE, ITYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG

REGISTER_FWD_LAUNCHER(768, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(768, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(768, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(768, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(768, bf16, fp32, bf16, fp32, 1, 4, 1, 16);

REGISTER_FWD_LAUNCHER(1024, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(1024, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(1024, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(1024, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(1024, bf16, fp32, bf16, fp32, 1, 4, 1, 16);

REGISTER_FWD_LAUNCHER(1536, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(1536, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(1536, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(1536, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(1536, bf16, fp32, bf16, fp32, 1, 4, 1, 16);

REGISTER_FWD_LAUNCHER(2048, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(2048, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(2048, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(2048, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(2048, bf16, fp32, bf16, fp32, 1, 4, 1, 16);

REGISTER_FWD_LAUNCHER(2304, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(2304, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(2304, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(2304, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
REGISTER_FWD_LAUNCHER(2304, bf16, fp32, bf16, fp32, 1, 4, 1, 16);

REGISTER_FWD_LAUNCHER(3072, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(3072, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(3072, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(3072, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(3072, bf16, fp32, bf16, fp32, 1, 1, 4, 16);

REGISTER_FWD_LAUNCHER(3840, fp32, fp32, fp32, fp32, 1, 1, 4, 4);
REGISTER_FWD_LAUNCHER(3840, fp16, fp16, fp16, fp32, 1, 1, 4, 4);
REGISTER_FWD_LAUNCHER(3840, fp16, fp32, fp16, fp32, 1, 1, 4, 4);
REGISTER_FWD_LAUNCHER(3840, bf16, bf16, bf16, fp32, 1, 1, 4, 4);
REGISTER_FWD_LAUNCHER(3840, bf16, fp32, bf16, fp32, 1, 1, 4, 4);

REGISTER_FWD_LAUNCHER(4096, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(4096, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(4096, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(4096, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(4096, bf16, fp32, bf16, fp32, 1, 1, 4, 16);

REGISTER_FWD_LAUNCHER(5120, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(5120, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(5120, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(5120, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(5120, bf16, fp32, bf16, fp32, 1, 1, 4, 16);

REGISTER_FWD_LAUNCHER(6144, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(6144, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(6144, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(6144, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(6144, bf16, fp32, bf16, fp32, 1, 1, 4, 16);

REGISTER_FWD_LAUNCHER(8192, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(8192, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(8192, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(8192, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(8192, bf16, fp32, bf16, fp32, 1, 1, 4, 16);

REGISTER_FWD_LAUNCHER(10240, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(10240, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(10240, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(10240, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
REGISTER_FWD_LAUNCHER(10240, bf16, fp32, bf16, fp32, 1, 1, 4, 16);

REGISTER_FWD_LAUNCHER(12288, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(12288, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(12288, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(12288, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(12288, bf16, fp32, bf16, fp32, 2, 1, 4, 16);

REGISTER_FWD_LAUNCHER(12800, fp32, fp32, fp32, fp32, 2, 1, 4, 4);
REGISTER_FWD_LAUNCHER(12800, fp16, fp16, fp16, fp32, 2, 1, 4, 4);
REGISTER_FWD_LAUNCHER(12800, fp16, fp32, fp16, fp32, 2, 1, 4, 4);
REGISTER_FWD_LAUNCHER(12800, bf16, bf16, bf16, fp32, 2, 1, 4, 4);
REGISTER_FWD_LAUNCHER(12800, bf16, fp32, bf16, fp32, 2, 1, 4, 4);

REGISTER_FWD_LAUNCHER(14336, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(14336, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(14336, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(14336, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(14336, bf16, fp32, bf16, fp32, 2, 1, 4, 8);

REGISTER_FWD_LAUNCHER(15360, fp32, fp32, fp32, fp32, 2, 1, 4, 8);
REGISTER_FWD_LAUNCHER(15360, fp16, fp16, fp16, fp32, 2, 1, 4, 8);
REGISTER_FWD_LAUNCHER(15360, fp16, fp32, fp16, fp32, 2, 1, 4, 8);
REGISTER_FWD_LAUNCHER(15360, bf16, bf16, bf16, fp32, 2, 1, 4, 8);
REGISTER_FWD_LAUNCHER(15360, bf16, fp32, bf16, fp32, 2, 1, 4, 8);

REGISTER_FWD_LAUNCHER(16384, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(16384, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(16384, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(16384, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(16384, bf16, fp32, bf16, fp32, 2, 1, 4, 16);

REGISTER_FWD_LAUNCHER(18432, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(18432, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(18432, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(18432, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(18432, bf16, fp32, bf16, fp32, 4, 1, 4, 16);

REGISTER_FWD_LAUNCHER(20480, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(20480, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(20480, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(20480, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(20480, bf16, fp32, bf16, fp32, 2, 1, 4, 16);

REGISTER_FWD_LAUNCHER(24576, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(24576, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(24576, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(24576, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
REGISTER_FWD_LAUNCHER(24576, bf16, fp32, bf16, fp32, 2, 1, 4, 16);

REGISTER_FWD_LAUNCHER(25600, fp32, fp32, fp32, fp32, 4, 1, 4, 4);
REGISTER_FWD_LAUNCHER(25600, fp16, fp16, fp16, fp32, 2, 1, 4, 8);
REGISTER_FWD_LAUNCHER(25600, fp16, fp32, fp16, fp32, 4, 1, 4, 4);
REGISTER_FWD_LAUNCHER(25600, bf16, bf16, bf16, fp32, 2, 1, 4, 8);
REGISTER_FWD_LAUNCHER(25600, bf16, fp32, bf16, fp32, 4, 1, 4, 4);

REGISTER_FWD_LAUNCHER(30720, fp32, fp32, fp32, fp32, 4, 1, 4, 4);
REGISTER_FWD_LAUNCHER(30720, fp16, fp16, fp16, fp32, 4, 1, 4, 4);
REGISTER_FWD_LAUNCHER(30720, fp16, fp32, fp16, fp32, 4, 1, 4, 4);
REGISTER_FWD_LAUNCHER(30720, bf16, bf16, bf16, fp32, 4, 1, 4, 4);
REGISTER_FWD_LAUNCHER(30720, bf16, fp32, bf16, fp32, 4, 1, 4, 4);

REGISTER_FWD_LAUNCHER(32768, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(32768, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(32768, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(32768, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(32768, bf16, fp32, bf16, fp32, 4, 1, 4, 16);

REGISTER_FWD_LAUNCHER(40960, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(40960, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(40960, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(40960, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(40960, bf16, fp32, bf16, fp32, 4, 1, 4, 16);

REGISTER_FWD_LAUNCHER(49152, fp32, fp32, fp32, fp32, 8, 1, 4, 16);
REGISTER_FWD_LAUNCHER(49152, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(49152, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(49152, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
REGISTER_FWD_LAUNCHER(49152, bf16, fp32, bf16, fp32, 4, 1, 4, 16);

REGISTER_FWD_LAUNCHER(65536, fp32, fp32, fp32, fp32, 8, 1, 4, 16);
REGISTER_FWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 8, 1, 4, 16);
REGISTER_FWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 8, 1, 4, 16);
REGISTER_FWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 8, 1, 4, 16);
REGISTER_FWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 8, 1, 4, 16);


================================================
FILE: apex/contrib/csrc/layer_norm/ln_fwd_kernels.cuh
================================================
#pragma once

#include "ln.h"
#include "ln_utils.cuh"

namespace layer_norm {

template <typename Ktraits>
__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) void ln_fwd_kernel(FwdParams params) {
  enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA };
  enum { WARPS_N = Ktraits::WARPS_N };
  enum { WARPS_M = Ktraits::WARPS_M };
  enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW };
  enum { VEC_COLS_PER_LDG = Ktraits::VEC_COLS_PER_LDG };
  enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW };
  enum { LDGS = Ktraits::LDGS };
  enum { NUM_ELTS = Ktraits::NUM_ELTS };
  enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW };
  enum { THREADS_PER_WARP = Ktraits::THREADS_PER_WARP };

  using output_t = typename Ktraits::output_t;
  using index_t = typename Ktraits::index_t;
  using compute_t = typename Ktraits::compute_t;
  using Ivec = typename Ktraits::Ivec;
  using Ovec = typename Ktraits::Ovec;
  using Wvec = typename Ktraits::Wvec;
  using Cvec = typename Ktraits::Cvec;

  using Stats = typename Ktraits::Stats;
  using stats_t = typename Stats::stats_t;

  extern __shared__ char smem_[];

  const index_t tidx = threadIdx.x;
  const index_t bidn = blockIdx.x % CTAS_PER_ROW;
  const index_t bidm = blockIdx.x / CTAS_PER_ROW;
  const index_t lane = tidx % THREADS_PER_WARP;
  const index_t warp = tidx / THREADS_PER_WARP;
  const index_t warp_m = warp / WARPS_N;
  const index_t warp_n = warp % WARPS_N;

  const index_t r = bidm * ROWS_PER_CTA + warp_m;
  const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane;

  Stats stats(params, bidm, bidn, warp_m, warp_n, lane, smem_);

  compute_t* mu_ptr = static_cast<compute_t*>(params.mu);
  compute_t* rs_ptr = static_cast<compute_t*>(params.rs);

  Wvec gamma[LDGS];
  Wvec beta[LDGS];
  index_t idx = c;
#pragma unroll
  for (int it = 0; it < LDGS; it++) {
    gamma[it].load_from(params.gamma, idx);
    beta[it].load_from(params.beta, idx);
    idx += VEC_COLS_PER_LDG;
  }

  constexpr compute_t rn = 1.f / compute_t(Ktraits::COLS);

  for (int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA) {
    Ivec x[LDGS];
    index_t idx = row * Ktraits::VEC_COLS + c;
    compute_t xf[LDGS * NUM_ELTS];
#pragma unroll
    for (int it = 0; it < LDGS; it++) {
      x[it].load_from(params.x, idx);
#pragma unroll
      for (int jt = 0; jt < NUM_ELTS; jt++) {
        compute_t x_ij = compute_t(x[it].data.elt[jt]);
        xf[it * NUM_ELTS + jt] = x_ij;
      }
      idx += VEC_COLS_PER_LDG;
    }

    stats_t s = stats.compute(xf, rn);

    compute_t mu = layer_norm::Get<0>::of<stats_t, compute_t>(s);
    compute_t m2 = layer_norm::Get<1>::of<stats_t, compute_t>(s);

    if (bidn == 0 && warp_n == 0 && lane == 0) {
      mu_ptr[row] = mu;
    }

    compute_t rs = rsqrtf(rn * m2 + params.epsilon);

    if (bidn == 0 && warp_n == 0 && lane == 0) {
      rs_ptr[row] = rs;
    }

    Ovec z[LDGS];
    idx = row * Ktraits::VEC_COLS + c;
#pragma unroll
    for (int it = 0; it < LDGS; it++) {
#pragma unroll
      for (int jt = 0; jt < NUM_ELTS; jt++) {
        output_t y_ij = output_t(rs * (xf[it * NUM_ELTS + jt] - mu));
        output_t g_ij = gamma[it].data.elt[jt];
        output_t b_ij = beta[it].data.elt[jt];
        z[it].data.elt[jt] = (g_ij * y_ij + b_ij);
      }
      z[it].store_to(params.z, idx);
      idx += VEC_COLS_PER_LDG;
    }
  }
}

}  // namespace layer_norm


================================================
FILE: apex/contrib/csrc/layer_norm/ln_kernel_traits.h
================================================
#pragma once

////////////////////////////////////////////////////////////////////////////////////////////////////

namespace layer_norm {
template <uint32_t HIDDEN_SIZE_, typename weight_t_, typename input_t_, typename output_t_, typename compute_t_,
          typename index_t_, uint32_t THREADS_PER_CTA_>
struct Kernel_traits_base {
  using weight_t = weight_t_;
  using input_t = input_t_;
  using output_t = output_t_;
  using compute_t = compute_t_;
  using index_t = index_t_;

  enum { HIDDEN_SIZE = HIDDEN_SIZE_ };
  enum { THREADS_PER_CTA = THREADS_PER_CTA_ };
  enum { THREADS_PER_WARP = 32 };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <uint32_t HIDDEN_SIZE_, typename weight_t_, typename input_t_, typename output_t_, typename compute_t_,
          typename index_t_, uint32_t THREADS_PER_CTA_, uint32_t BYTES_PER_LDG_,
          typename Base =
              Kernel_traits_base<HIDDEN_SIZE_, weight_t_, input_t_, output_t_, compute_t_, index_t_, THREADS_PER_CTA_> >
struct Kernel_traits_finalize : public Base {
  enum { ROWS_PER_CTA = Base::THREADS_PER_CTA / Base::THREADS_PER_WARP };
  static_assert((int)ROWS_PER_CTA <= (int)Base::THREADS_PER_WARP);
  // Bytes per global load from the input.
  enum { BYTES_PER_LDG = BYTES_PER_LDG_ };
  // Number of elements fetched by a global load.
  enum { ELTS_PER_LDG = BYTES_PER_LDG / sizeof(compute_t_) };
  // Bytes per global store of the weights.
  enum { BYTES_PER_STG = ELTS_PER_LDG * sizeof(weight_t_) };
  static_assert(sizeof(BYTES_PER_LDG) == 4, "Conflict-free smem transpose only implemented for 4B compute type!");
  static_assert(Base::THREADS_PER_CTA == ROWS_PER_CTA * Base::THREADS_PER_WARP, "We assume one warp per row!");
  // The total number of BYTES_PER_LDG-wide words in a hidden vector.
  enum { COLS = HIDDEN_SIZE_ * sizeof(compute_t_) / BYTES_PER_LDG };
  static_assert(COLS * BYTES_PER_LDG == HIDDEN_SIZE_ * sizeof(compute_t_));

  // Shared memory size to transpose the CTA result.
  enum { SMEM_BYTES_TRANSPOSE = Base::THREADS_PER_CTA * BYTES_PER_LDG };
  // Shared memory size to coalsece the CTA result.
  enum { SMEM_BYTES_OUTPUT = Base::THREADS_PER_WARP * BYTES_PER_LDG };
  // Shared memory requirement per CTA.
  enum { SMEM_BYTES_PER_CTA = 2 * SMEM_BYTES_TRANSPOSE + 2 * SMEM_BYTES_OUTPUT };

  // The type of the reducer.
  using Reducer = layer_norm::Reducer<compute_t_, 1, 1, 1>;

  // Condition for the whole CTA to participate in syncthreads.
  static_assert(COLS % Base::THREADS_PER_WARP == 0);
  enum { CTAS = COLS / Base::THREADS_PER_WARP };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename weight_t_, typename input_t_, typename output_t_, typename compute_t_, typename index_t_,
          uint32_t HIDDEN_SIZE_, uint32_t CTAS_PER_ROW_, uint32_t WARPS_M_, uint32_t WARPS_N_,
          uint32_t BYTES_PER_LDG_ = 16,
          typename Base = Kernel_traits_base<HIDDEN_SIZE_, weight_t_, input_t_, output_t_, compute_t_, index_t_,
                                             WARPS_M_ * WARPS_N_ * THREADS_PER_WARP> >
struct Kernel_traits : public Base {
  using input_t = typename Base::input_t;
  using weight_t = typename Base::weight_t;
  using compute_t = typename Base::compute_t;
  using output_t = typename Base::output_t;
  using index_t = typename Base::index_t;

  enum { CTAS_PER_ROW = CTAS_PER_ROW_ };
  enum { WARPS_M = WARPS_M_ };
  enum { WARPS_N = WARPS_N_ };
  enum { COLS = HIDDEN_SIZE_ };
  enum { HIDDEN_SIZE = HIDDEN_SIZE_ };
  enum { BYTES_PER_LDG = BYTES_PER_LDG_ };
  enum { NUM_ELTS = BYTES_PER_LDG / sizeof(input_t) };

  enum { THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP };
  enum { THREADS_PER_CTA = WARPS_M * THREADS_PER_ROW };
  enum { ROWS_PER_CTA = WARPS_M };

  enum { BYTES_PER_ROW = COLS * sizeof(input_t) };
  enum { BYTES_PER_ROW_PER_CTA = THREADS_PER_ROW * BYTES_PER_LDG };
  // Multi-row per CTA not supported for multi-CTA => no smem for WGRAD needed
  enum { SMEM_BYTES_WGRAD = CTAS_PER_ROW > 1 ? 0 : ROWS_PER_CTA * COLS * sizeof(compute_t) };
  static_assert(WARPS_M == 1 || CTAS_PER_ROW == 1);

  using reduce_t = typename layer_norm::TypeToVec2<compute_t>::Type;
  using Reducer = layer_norm::Reducer<reduce_t, CTAS_PER_ROW, WARPS_M, WARPS_N>;

  enum { SMEM_BYTES_DGRAD = Reducer::SMEM_BYTES };
  enum { SMEM_BYTES = SMEM_BYTES_DGRAD + SMEM_BYTES_WGRAD };

  using Ivec = layer_norm::Vec<input_t, NUM_ELTS>;
  using Ovec = layer_norm::Vec<output_t, NUM_ELTS>;
  using Wvec = layer_norm::Vec<weight_t, NUM_ELTS>;
  using Cvec = layer_norm::Vec<compute_t, NUM_ELTS>;
  enum { ELTS_PER_LDG = BYTES_PER_LDG / sizeof(input_t) };

  // Assume that each thread can handle the same number of elements in the output and weights as in the input.
  static_assert(sizeof(input_t) >= sizeof(output_t));
  static_assert(sizeof(input_t) >= sizeof(weight_t));
  // The number of columns fetched per load from input: one per thread.
  enum { VEC_COLS_PER_LDG = CTAS_PER_ROW * THREADS_PER_ROW };
  // The total number of vectorized loads/stores per hidden vector.
  enum { VEC_COLS = COLS / ELTS_PER_LDG };
  // The number of loads per thread for the input.
  enum { LDGS = VEC_COLS / VEC_COLS_PER_LDG };
  static_assert(LDGS * VEC_COLS_PER_LDG == VEC_COLS);
  // static_assert(LDGS * BYTES_PER_ROW_PER_CTA * CTAS_PER_ROW == BYTES_PER_ROW, "");

  using Stats = layer_norm::Stats<compute_t, CTAS_PER_ROW, WARPS_M, WARPS_N>;
  enum { SMEM_BYTES_FWD = Stats::SMEM_BYTES };
};

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace layer_norm


================================================
FILE: apex/contrib/csrc/layer_norm/ln_utils.cuh
================================================
#pragma once

#include <cuda_bf16.h>
#include <cuda_fp16.h>

#include <cassert>

#include "ln.h"

////////////////////////////////////////////////////////////////////////////////////////////////////

constexpr uint32_t THREADS_PER_WARP = 32;

////////////////////////////////////////////////////////////////////////////////////////////////////

inline void check_cuda_(cudaError_t status, const char* file, int line) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(status), file, line);
    exit(status);
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

#define CHECK_CUDA(ans)                     \
  {                                         \
    check_cuda_((ans), __FILE__, __LINE__); \
  }

////////////////////////////////////////////////////////////////////////////////////////////////////

#define DIVUP(x, y) (((x) + ((y) - 1)) / (y))

////////////////////////////////////////////////////////////////////////////////////////////////////

#define REGISTER_FWD_LAUNCHER(HIDDEN_SIZE, WTYPE, ITYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG) \
  void ln_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##OTYPE##_##CTYPE(LaunchParams<FwdParams>& launch_params,           \
                                                                    const bool configure_params) {                    \
    launch_<WTYPE, ITYPE, OTYPE, CTYPE, uint32_t, HIDDEN_SIZE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG>(        \
        launch_params, configure_params);                                                                             \
  }                                                                                                                   \
  static FwdRegistrar<WTYPE, ITYPE, OTYPE, CTYPE, HIDDEN_SIZE>                                                        \
  reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##OTYPE##_##CTYPE(                                                          \
      ln_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##OTYPE##_##CTYPE)

////////////////////////////////////////////////////////////////////////////////////////////////////

#define REGISTER_BWD_LAUNCHER(HIDDEN_SIZE, WTYPE, ITYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, \
                              BYTES_PER_LDG_FINALIZE)                                                                 \
  void ln_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##OTYPE##_##CTYPE(LaunchParams<BwdParams>& launch_params,           \
                                                                    const bool configure_params) {                    \
    launch_<WTYPE, ITYPE, OTYPE, CTYPE, uint32_t, HIDDEN_SIZE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG,         \
            BYTES_PER_LDG_FINALIZE>(launch_params, configure_params);                                                 \
  }                                                                                                                   \
  static BwdRegistrar<WTYPE, ITYPE, OTYPE, CTYPE, HIDDEN_SIZE>                                                        \
  reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##OTYPE##_##CTYPE(                                                          \
      ln_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##OTYPE##_##CTYPE)

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ float2 operator+(const float2& a, const float2& b) { return {a.x + b.x, a.y + b.y}; }

////////////////////////////////////////////////////////////////////////////////////////////////////

inline __device__ void operator+=(float2& a, const float2& b) {
  a.x += b.x;
  a.y += b.y;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct Sum {
  inline __device__ Sum() {}
  inline __device__ T operator()(const T& a, const T& b) { return a + b; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
inline __device__ T warp_shuffle_xor(const T& x, uint32_t idx) {
  return __shfl_xor_sync(uint32_t(-1), x, idx);
}

template <>
inline __device__ float2 warp_shuffle_xor<float2>(const float2& x, uint32_t idx) {
  return {warp_shuffle_xor(x.x, idx), warp_shuffle_xor(x.y, idx)};
}

template <typename T>
inline __device__ T warp_shuffle_down(const T& x, uint32_t idx) {
  return __shfl_down_sync(uint32_t(-1), x, idx);
}

template <>
inline __device__ float2 warp_shuffle_down<float2>(const float2& x, uint32_t idx) {
  return {warp_shuffle_down(x.x, idx), warp_shuffle_down(x.y, idx)};
}

////////////////////////////////////////////////////////////////////////////////////////////////////

namespace layer_norm {

////////////////////////////////////////////////////////////////////////////////////////////////////

struct uint16 {
  uint4 u;
  uint4 v;
  uint4 s;
  uint4 t;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct uint8 {
  uint4 u;
  uint4 v;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int BYTES>
struct BytesToType {};

template <>
struct BytesToType<64> {
  using Type = uint16;
  static_assert(sizeof(Type) == 64);
};

template <>
struct BytesToType<32> {
  using Type = uint8;
  static_assert(sizeof(Type) == 32);
};

template <>
struct BytesToType<16> {
  using Type = uint4;
  static_assert(sizeof(Type) == 16);
};

template <>
struct BytesToType<8> {
  using Type = uint64_t;
  static_assert(sizeof(Type) == 8);
};

template <>
struct BytesToType<4> {
  using Type = uint32_t;
  static_assert(sizeof(Type) == 4);
};

template <>
struct BytesToType<2> {
  using Type = uint16_t;
  static_assert(sizeof(Type) == 2);
};

template <>
struct BytesToType<1> {
  using Type = uint8_t;
  static_assert(sizeof(Type) == 1);
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct TypeToVec2 {};

template <>
struct TypeToVec2<float> {
  using Type = float2;
};

template <>
struct TypeToVec2<half> {
  using Type = half2;
};

template <>
struct TypeToVec2<nv_bfloat16> {
  using Type = nv_bfloat162;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <int INDEX>
struct Get {
  template <typename T, typename R>
  static inline __device__ R of(const T& vec);
};

template <>
template <typename T, typename R>
inline __device__ R Get<0>::of(const T& vec) {
  return vec.x;
}

template <>
template <typename T, typename R>
inline __device__ R Get<1>::of(const T& vec) {
  return vec.y;
}

template <>
template <typename T, typename R>
inline __device__ R Get<2>::of(const T& vec) {
  return vec.z;
}

template <>
template <typename T, typename R>
inline __device__ R Get<3>::of(const T& vec) {
  return vec.w;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Src, typename Dst>
struct Converter {
  static inline __device__ Dst convert(const Src& from) { return Dst(from); }
};

template <>
struct Converter<float2, half2> {
  static inline __device__ half2 convert(const float2& x) { return __float22half2_rn(x); }
};

template <>
struct Converter<float2, nv_bfloat162> {
  static inline __device__ nv_bfloat162 convert(const float2& x) {
#if __CUDA_ARCH__ >= 800
    return __float22bfloat162_rn(x);
#else
    union {
      nv_bfloat162 raw;
      nv_bfloat16 x;
      nv_bfloat16 y;
    } tmp;
    tmp.x = __float2bfloat16_rn(x.x);
    tmp.y = __float2bfloat16_rn(x.y);
    return tmp.raw;
#endif
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
struct Zeros {
  static inline __device__ T get() { return T(0.f); }
};

template <>
struct Zeros<float2> {
  static inline __device__ float2 get() { return make_float2(0.f, 0.f); }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Elt_type, uint32_t NUM_ELT>
struct Vec {
  enum { BYTES = NUM_ELT * sizeof(Elt_type) };

  using Vec_type = typename BytesToType<BYTES>::Type;

  using Alias_type = union {
    Vec_type vec;
    Elt_type elt[NUM_ELT];
  };

  Alias_type data;

  template <typename S>
  inline __device__ void to(Vec<S, NUM_ELT>& other) {
#pragma unroll
    for (int it = 0; it < NUM_ELT; it++) {
      other.data.elt[it] = S(this->data.elt[it]);
    }
  }

  template <typename Op>
  inline __device__ void assign(const Op& op) {
#pragma unroll
    for (int it = 0; it < NUM_ELT; it++) {
      this->data.elt[it] = op(it);
    }
  }

  inline __device__ void load_from(const void* base_ptr, const size_t idx) {
    this->data.vec = static_cast<const Vec_type*>(base_ptr)[idx];
  }

  inline __device__ void store_to(void* base_ptr, const size_t idx) {
    static_cast<Vec_type*>(base_ptr)[idx] = this->data.vec;
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <uint32_t CTAS_PER_ROW>
struct InterCTASync {
  template <typename Params>
  inline __device__ InterCTASync(Params& params, uint32_t bidm, uint32_t bidn)
      : phase_counter_(0),
        b0_(params.barrier + bidm)  // The barrier for this group of CTAs.
        ,
        b1_(params.barrier + bidm + params.ctas_per_col)  // The barrier for this group of CTAs.
  {
    // BARRIERS ARE ASSUMED TO BE INITIALIZED TO 0!
  }

  inline __device__ void spin_wait_(int* barrier, int step, int expected) {
    asm volatile("red.release.gpu.global.add.s32 [%0], %1;" ::"l"(barrier), "r"(step));
    for (int found = -1; found != expected;) {
      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];" : "=r"(found) : "l"(barrier));
    }
  }

  inline __device__ void sync() {
    // ALL THREADS MUST ENTER!

    // We switch barrier every iteration.
    int* barrier = phase_counter_ & 0x1 ? b1_ : b0_;
    // We decrement every other iteration.
    bool dec = phase_counter_ & 0x2;
    int step = dec ? -1 : 1;
    int expected = dec ? 0 : CTAS_PER_ROW;
    // There are only 4 phases: up/down for b0/b1.
    phase_counter_ = (phase_counter_ + 1) & 0x3;

    if (threadIdx.x == 0) {
      spin_wait_(barrier, step, expected);
    }
    // CTA waits for thread 0
    __syncthreads();
  }

  int phase_counter_;
  int* b0_;
  int* b1_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T, uint32_t CTAS_PER_ROW, uint32_t WARPS_M, uint32_t WARPS_N>
struct Reducer : public Reducer<T, 1, WARPS_M, WARPS_N> {
  using InterCTASync = InterCTASync<CTAS_PER_ROW>;
  using Base = Reducer<T, 1, WARPS_M, WARPS_N>;
  using Type = typename Base::Type;

  enum { SMEM_BYTES = Base::SMEM_BYTES };

  enum { WS_BARRIER_BYTES = 2 * sizeof(int) };
  enum { WS_DATA_BYTES = WARPS_M * CTAS_PER_ROW * sizeof(T) };

  // size of the barriers + temporary result per CTA (multiply with CTAS_PER_ROW to get total)
  enum { WORKSPACE_BYTES_PER_GROUP = Base::WORKSPACE_BYTES_PER_GROUP + WS_BARRIER_BYTES + WS_DATA_BYTES };

  template <typename Params>
  inline __device__ Reducer(Params& params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n,
                            uint32_t lane, void* smem)
      : Base(params, bidm, bidn, warp_m, warp_n, lane, smem),
        inter_cta_(params, bidm, bidn),
        bidn_(bidn)  // CTA id within the group.
        ,
        w0_(static_cast<T*>(params.workspace) + (bidm * WARPS_M + warp_m) * CTAS_PER_ROW),
        w1_(w0_ + params.ctas_per_col * WARPS_M * CTAS_PER_ROW) {}

  template <typename Op>
  inline __device__ T allreduce(T data, Op& op) {
    data = Base::reduce(data, op);
    // We switch workspace every iteration.
    T* workspace = inter_cta_.phase_counter_ & 0x1 ? w1_ : w0_;

    // Warp leaders 0 hold the CTA-local results.
    if (this->warp_n_ == 0 && this->lane_ == 0) {
      workspace[bidn_] = data;
    }
    inter_cta_.sync();
    static_assert(CTAS_PER_ROW <= 32);
    T total = Zeros<T>::get();
    if (this->lane_ < CTAS_PER_ROW) {
      total = workspace[this->lane_];
    }
    total = Reducer<T, 1, 1, 1>::allreduce_(total, op);

    return total;
  }

  InterCTASync inter_cta_;

  T* w0_;
  T* w1_;
  int bidn_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T, uint32_t WARPS_M>
struct Reducer<T, 1, WARPS_M, 1> {
  using Type = T;
  enum { SMEM_BYTES = 0 };
  enum { WORKSPACE_BYTES_PER_GROUP = 0 };

  enum { THREADS_PER_WARP = 32 };

  template <typename Params>
  inline __device__ Reducer(Params& params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n,
                            uint32_t lane, void* smem)
      : warp_n_(warp_n), lane_(lane) {}

  template <typename Op>
  static inline __device__ T allreduce_(T data, Op& op) {
#pragma unroll
    for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
      data = op(data, warp_shuffle_xor(data, it));
    }
    return data;
  }

  template <typename Op>
  inline __device__ T allreduce(T data, Op& op) {
    return allreduce_(data, op);
  }

  template <typename Op>
  inline __device__ T reduce(T data, Op& op) {
// only lane 0 holds the result!
#pragma unroll
    for (int it = THREADS_PER_WARP / 2; it > 0; it /= 2) {
      data = op(data, warp_shuffle_down(data, it));
    }
    return data;
  }
  int warp_n_;
  int lane_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T, uint32_t WARPS_M, uint32_t WARPS_N>
struct Reducer<T, 1, WARPS_M, WARPS_N> : public Reducer<T, 1, WARPS_M, 1> {
  using Base = Reducer<T, 1, WARPS_M, 1>;

  using Type = T;

  enum { SMEM_BYTES = Base::SMEM_BYTES + WARPS_M * WARPS_N * sizeof(T) * 2 };
  enum { WORKSPACE_BYTES_PER_GROUP = 0 };

  enum { THREADS_PER_WARP = 32 };

  template <typename Params>
  inline __device__ Reducer(Params& params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n,
                            uint32_t lane, void* smem)
      : Base(params, bidm, bidn, warp_m, warp_n, lane, smem), use0_(true) {
    smem0_ = &static_cast<T*>(smem)[warp_m * WARPS_N];
    smem1_ = smem0_ + WARPS_M * WARPS_N;
  }

  template <typename Op>
  inline __device__ T allreduce(T data, Op& op) {
    T* smem = use0_ ? smem0_ : smem1_;
    use0_ = !use0_;
    data = Base::reduce(data, op);
    if (this->lane_ == 0) {
      smem[this->warp_n_] = data;
    }
    __syncthreads();
    T out = Zeros<T>::get();
#pragma unroll
    for (int it = 0; it < WARPS_N; it++) {
      out = op(out, smem[it]);
    }
    return out;
  }

  template <typename Op>
  inline __device__ T reduce(T data, Op& op) {
    T* smem = use0_ ? smem0_ : smem1_;
    use0_ = !use0_;
    // only intra-CTA group leader holds the result!
    data = Base::reduce(data, op);
    if (this->lane_ == 0) {
      smem[this->warp_n_] = data;
    }
    __syncthreads();
    T out = Zeros<T>::get();
    if (this->warp_n_ == 0 && this->lane_ == 0) {
#pragma unroll
      for (int it = 0; it < WARPS_N; it++) {
        out = op(out, smem[it]);
      }
    }
    return out;
  }

  T* smem0_;
  T* smem1_;
  bool use0_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T>
inline __device__ void warp_chan_upd_dynamic(T& m_a, T& m2_a, T& n_a, int num_active) {
  // Assume at least leftmost is valid and init: step = next_pow2(num_active) / 2 (might get NaN otherwise)
  int highest_bit_set = (8 * sizeof(num_active)) - __clz(num_active - 1);

#pragma unroll
  for (int step = (1 << (highest_bit_set - 1)); step > 0; step /= 2) {
    // Exchange
    T n_b = warp_shuffle_down(n_a, step);
    T m_b = warp_shuffle_down(m_a, step);
    T m2_b = warp_shuffle_down(m2_a, step);

    // Update
    const T n_ab = n_a + n_b;    // We can handle one of them being 0, not both.
    const T rn_ab = 1.f / n_ab;  // Might have different n per thread, otherwise this would simplify :(
    const T delta = m_a - m_b;
    const float m2_ab = m2_a + m2_b + delta * delta * n_a * n_b * rn_ab;
    const float m_ab = (n_a * m_a + n_b * m_b) * rn_ab;

    n_a = n_ab;
    m_a = m_ab;
    m2_a = m2_ab;
  }
  // Intra-warp broadcast (only lane 0 has valid stats).
  m_a = __shfl_sync(uint32_t(-1), m_a, 0);
  m2_a = __shfl_sync(uint32_t(-1), m2_a, 0);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T, uint32_t CTAS_PER_ROW, uint32_t WARPS_M, uint32_t WARPS_N>
struct Stats {
  // This could be done generically with the Reducer. But then we would have to exchange 3 instead of 2 fields.

  using InterCTASync = InterCTASync<CTAS_PER_ROW>;
  using BlockStats = Stats<T, 1, WARPS_M, WARPS_N>;
  using stats_t = typename BlockStats::stats_t;

  enum { SMEM_BYTES = BlockStats::SMEM_BYTES };

  template <typename Params>
  inline __device__ Stats(Params& params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane,
                          void* smem)
      : inter_cta_(params, bidm, bidn),
        block_stats_(params, bidm, bidn, warp_m, warp_n, lane, smem),
        bidn_(bidn)  // CTA id within the group.
        ,
        w0_(static_cast<stats_t*>(params.workspace) + (bidm * WARPS_M + warp_m) * CTAS_PER_ROW),
        w1_(w0_ + params.ctas_per_col * WARPS_M * CTAS_PER_ROW),
        warp_n_(warp_n),
        lane_(lane) {}

  template <uint32_t N>
  inline __device__ stats_t compute(const T (&elts)[N], const T rn) {
    constexpr T ELTS_PER_ROW_PER_CTA = N * WARPS_N * THREADS_PER_WARP;
    // TODO rn is not really needed here..
    constexpr T block_rn = 1.f / T(ELTS_PER_ROW_PER_CTA);
    stats_t block_stats = block_stats_.compute(elts, block_rn);

    stats_t* workspace = inter_cta_.phase_counter_ & 0x1 ? w1_ : w0_;

    if (warp_n_ == 0 && lane_ == 0) {
      workspace[bidn_] = block_stats;
    }

    // Wait for all CTAS_PER_ROW CTAS in the group to have written their result.
    inter_cta_.sync();

    T n = Zeros<T>::get();
    T m = Zeros<T>::get();
    T m2 = Zeros<T>::get();

    // Assume CTA group size in N less than 32, such that we can finalize with a single warp.
    static_assert(CTAS_PER_ROW <= 32);

    // Every warp does the final reduction locally.
    if (lane_ < CTAS_PER_ROW) {
      stats_t result = workspace[lane_];
      n = ELTS_PER_ROW_PER_CTA;
      m = layer_norm::Get<0>::of<stats_t, T>(result);
      m2 = layer_norm::Get<1>::of<stats_t, T>(result);
    }

    warp_chan_upd_dynamic(m, m2, n, CTAS_PER_ROW);

    return {m, m2};
  }

  InterCTASync inter_cta_;
  BlockStats block_stats_;

  stats_t* w0_;
  stats_t* w1_;
  int bidn_;
  int warp_n_;
  int lane_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T, uint32_t WARPS_M, uint32_t WARPS_N>
struct Stats<T, 1, WARPS_M, WARPS_N> {
  using WarpStats = Stats<T, 1, WARPS_M, 1>;
  using stats_t = typename WarpStats::stats_t;

  enum { SMEM_BYTES = WARPS_M * WARPS_N * sizeof(stats_t) * 2 };

  template <typename Params>
  inline __device__ Stats(Params& params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane,
                          void* smem)
      : warp_stats_(params, bidm, bidn, warp_m, warp_n, lane, smem), use0_(true) {
    smem0_ = static_cast<stats_t*>(smem) + warp_m * WARPS_N;
    smem1_ = smem0_ + WARPS_M * WARPS_N;
  }

  template <uint32_t N>
  inline __device__ stats_t compute(const T (&elts)[N], const T rn) {
    stats_t* smem = use0_ ? smem0_ : smem1_;
    use0_ = !use0_;
    // Compute warp local for all WARPS_N
    constexpr T warp_rn = 1.f / T(N * THREADS_PER_WARP);
    stats_t warp_stats = warp_stats_.compute(elts, warp_rn);

    // Each warp warp leader stores its stats
    const auto warp_n = warp_stats_.reducer_.warp_n_;
    const auto lane = warp_stats_.reducer_.lane_;
    if (lane == 0) {
      smem[warp_n] = warp_stats;
    }
    __syncthreads();

    T n = Zeros<T>::get();
    T m = Zeros<T>::get();
    T m2 = Zeros<T>::get();

    // Assume that there are less than 32 warps, such that we can finalize with a single warp
    static_assert(WARPS_N <= 32);
    if (lane < WARPS_N) {
      stats_t result = smem[lane];
      n = N * THREADS_PER_WARP;
      m = layer_norm::Get<0>::of<stats_t, T>(result);
      m2 = layer_norm::Get<1>::of<stats_t, T>(result);
    }

    warp_chan_upd_dynamic(m, m2, n, WARPS_N);

    return {m, m2};
  }
  WarpStats warp_stats_;
  stats_t* smem0_;
  stats_t* smem1_;
  bool use0_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename T, uint32_t WARPS_M>
struct Stats<T, 1, WARPS_M, 1> {
  using stats_t = typename TypeToVec2<T>::Type;
  // The simple Warp reducer.
  using Reducer = Reducer<T, 1, WARPS_M, 1>;

  enum { SMEM_BYTES = 0 };

  template <typename Params>
  inline __device__ Stats(Params& params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane,
                          void* smem)
      : reducer_(params, bidm, bidn, warp_m, warp_n, lane, smem) {}

  template <uint32_t N>
  inline __device__ stats_t compute(const T (&elts)[N], const T rn) {
    auto sum = Sum<T>();

    T m = Zeros<T>::get();
#pragma unroll
    for (int it = 0; it < N; it++) {
      m += elts[it];
    }
    m = reducer_.allreduce(m, sum) * rn;

    T m2 = Zeros<T>::get();
#pragma unroll
    for (int it = 0; it < N; it++) {
      T diff = (elts[it] - m);
      m2 += diff * diff;
    }
    m2 = reducer_.allreduce(m2, sum);

    return {m, m2};
  }

  Reducer reducer_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace layer_norm


================================================
FILE: apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>

#include <iostream>
#include <vector>

#include "dropout.cuh"
#include "softmax.cuh"

// symbol to be automatically resolved by PyTorch libs

namespace multihead_attn {
namespace fused_softmax {
namespace additive_mask_softmax_dropout {

std::vector<torch::Tensor> fwd_cuda(bool is_training, int heads, torch::Tensor const& input, const half* pad_mask,
                                    float dropout_prob) {
  const int attn_batches = input.size(0);
  const int sequences = attn_batches / heads;
  const int q_seq_len = input.size(1);
  const int k_seq_len = q_seq_len;
  // const int dropout_elems = attn_batches * q_seq_len * k_seq_len;

  // There is no reason to use more than one stream as every kernel is
  // sequentially dependent
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // 3 Intermediate Results + Output (Note: dropout intermediates are generated
  // by ATen library code)
  auto act_options = input.options().requires_grad(false);
  auto mask_options = act_options.dtype(torch::kUInt8);

  torch::Tensor softmax_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);

  // Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
  void* input_ptr = static_cast<void*>(input.data_ptr());
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  // Padded Softmax
  [[maybe_unused]] bool softmax_success = false;
  if (pad_mask == nullptr) {
    softmax_success = dispatch_softmax<half, half, float>(reinterpret_cast<half*>(softmax_results_ptr),
                                                          reinterpret_cast<const half*>(input_ptr), k_seq_len,
                                                          k_seq_len, attn_batches * q_seq_len);
  } else {
    softmax_success = dispatch_additive_masked_softmax<half, half, float>(
        reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(input_ptr), pad_mask, k_seq_len,
        k_seq_len, attn_batches * q_seq_len, attn_batches * q_seq_len / sequences);
  }

  if (is_training) {
    // use at:: function so that C++ version generates the same random mask as
    // python version
    auto dropout_tuple = at::_fused_dropout(softmax_results, 1.0f - dropout_prob);
    dropout_results = std::get<0>(dropout_tuple);
    dropout_mask = std::get<1>(dropout_tuple);
  }

  // Matmul2

  return {dropout_results, dropout_mask, softmax_results};
}

torch::Tensor bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& softmax_results,
                       torch::Tensor const& dropout_mask, float dropout_prob) {
  const int attn_batches = output_grads.size(0);
  const int q_seq_len = output_grads.size(1);
  const int k_seq_len = q_seq_len;
  // const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  // TODO: Streams can be used in Backprop but I haven't added more than one
  // in my first attempt to create the code
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // Output Tensor Allocations
  //  torch::Tensor input_grads         = torch::empty_like(output_grads);

  // Apply Dropout Mask and Scale by Dropout Probability
  // Softmax Grad
  dispatch_masked_scale_softmax_backward_stream<half, half, float, false>(
      static_cast<half*>(output_grads.data_ptr()), static_cast<half*>(output_grads.data_ptr()),
      reinterpret_cast<half const*>(softmax_results.data_ptr()), static_cast<uint8_t const*>(dropout_mask.data_ptr()),
      1.0 / (1.0 - dropout_prob), k_seq_len, k_seq_len, attn_batches * q_seq_len, stream);
  // backward pass is completely in-place
  return output_grads;
}
}  // namespace additive_mask_softmax_dropout
}  // namespace fused_softmax
}  // namespace multihead_attn


================================================
FILE: apex/contrib/csrc/multihead_attn/dropout.cuh
================================================
#pragma once
#include <ATen/ATen.h>

#ifdef OLD_GENERATOR_PATH
#include <ATen/CUDAGeneratorImpl.h>
#else
#include <ATen/cuda/CUDAGeneratorImpl.h>
#endif

#include <ATen/cuda/CUDAContext.h>
#include <curand_kernel.h>

namespace {
constexpr int UNROLL = 4;
}  // namespace

template <typename scalar_t, typename accscalar_t, typename IndexType>
__global__ void apex_fused_dropout_kernel(scalar_t const* inputs, scalar_t* outputs, uint8_t* mask,
                                          IndexType totalElements, accscalar_t p, std::pair<uint64_t, uint64_t> seeds) {
  accscalar_t pinv = accscalar_t(1) / p;
  IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;

  curandStatePhilox4_32_10_t state;
  curand_init(seeds.first, idx, seeds.second, &state);

  IndexType rounded_size =
      ((totalElements - 1) / (blockDim.x * gridDim.x * UNROLL) + 1) * blockDim.x * gridDim.x * UNROLL;
  for (IndexType linearIndex = idx; linearIndex < rounded_size; linearIndex += gridDim.x * blockDim.x * UNROLL) {
    float4 rand = curand_uniform4(&state);
    scalar_t src[UNROLL];
    rand.x = rand.x <= p;
    rand.y = rand.y <= p;
    rand.z = rand.z <= p;
    rand.w = rand.w <= p;

    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        src[ii] = inputs[li];
      }
    }
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        outputs[li] = src[ii] * (&rand.x)[ii] * pinv;
        mask[li] = (uint8_t)(&rand.x)[ii];
      }
    }
    __syncthreads();
  }
}

template <typename scalar_t, typename accscalar_t, typename IndexType>
__global__ void apex_dropout_add_kernel(scalar_t const* inputs, scalar_t const* add_inputs, scalar_t* outputs,
                                        uint8_t* mask, IndexType totalElements, accscalar_t p,
                                        std::pair<uint64_t, uint64_t> seeds) {
  accscalar_t pinv = accscalar_t(1) / p;
  IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;

  curandStatePhilox4_32_10_t state;
  curand_init(seeds.first, idx, seeds.second, &state);

  IndexType rounded_size =
      ((totalElements - 1) / (blockDim.x * gridDim.x * UNROLL) + 1) * blockDim.x * gridDim.x * UNROLL;
  for (IndexType linearIndex = idx; linearIndex < rounded_size; linearIndex += gridDim.x * blockDim.x * UNROLL) {
    float4 rand = curand_uniform4(&state);
    scalar_t src[UNROLL];
    scalar_t add_src[UNROLL];
    rand.x = rand.x <= p;
    rand.y = rand.y <= p;
    rand.z = rand.z <= p;
    rand.w = rand.w <= p;
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        src[ii] = inputs[li];
        add_src[ii] = add_inputs[li];
      }
    }
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        accscalar_t int1 = src[ii] * (&rand.x)[ii] * pinv;
        outputs[li] = static_cast<scalar_t>(static_cast<accscalar_t>(add_src[ii]) + int1);
        mask[li] = (uint8_t)(&rand.x)[ii];
      }
    }
    __syncthreads();
  }
}

template <typename scalar_t, typename accscalar_t, typename IndexType>
__global__ void apex_add_kernel(scalar_t const* inputs, scalar_t const* add_inputs, scalar_t* outputs,
                                IndexType totalElements) {
  IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
  IndexType rounded_size =
      ((totalElements - 1) / (blockDim.x * gridDim.x * UNROLL) + 1) * blockDim.x * gridDim.x * UNROLL;
  for (IndexType linearIndex = idx; linearIndex < rounded_size; linearIndex += gridDim.x * blockDim.x * UNROLL) {
    scalar_t src[UNROLL];
    scalar_t add_src[UNROLL];
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        src[ii] = inputs[li];
        add_src[ii] = add_inputs[li];
      }
    }
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        outputs[li] = src[ii] + add_src[ii];
      }
    }
    __syncthreads();
  }
}

template <typename scalar_t, typename accscalar_t, typename IndexType>
__global__ void apex_masked_scale_kernel(scalar_t const* inputs, scalar_t* outputs, uint8_t const* mask,
                                         IndexType totalElements, accscalar_t scale) {
  IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
  IndexType rounded_size =
      ((totalElements - 1) / (blockDim.x * gridDim.x * UNROLL) + 1) * blockDim.x * gridDim.x * UNROLL;
  for (IndexType linearIndex = idx; linearIndex < rounded_size; linearIndex += gridDim.x * blockDim.x * UNROLL) {
    scalar_t src[UNROLL];
    scalar_t msk[UNROLL];
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        src[ii] = static_cast<scalar_t>(inputs[li]);
        msk[ii] = static_cast<scalar_t>(mask[li]);
      }
    }
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        outputs[li] = static_cast<accscalar_t>(src[ii]) * scale * static_cast<accscalar_t>(msk[ii]);
      }
    }
  }
}

template <typename scalar_t, typename accscalar_t, typename IndexType>
void apex_fused_dropout_cuda(scalar_t const* inputs, scalar_t* outputs, uint8_t* mask, IndexType totalElements,
                             accscalar_t p) {
  auto gen = at::cuda::detail::getDefaultCUDAGenerator();

  int block_size = 256;
  dim3 dim_block(block_size);
  dim3 grid((totalElements + block_size - 1) / block_size);
  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);

  // number of times random will be generated per thread, to offset philox
  // counter in the random state
  int64_t counter_offset = ((totalElements - 1) / (block_size * grid.x * UNROLL) + 1) * UNROLL;
  std::pair<uint64_t, uint64_t> rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen.mutex());
    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(counter_offset);
  }

  apex_fused_dropout_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
      inputs, outputs, mask, totalElements, p, rng_engine_inputs);
  C10_CUDA_CHECK(cudaGetLastError());
}

template <typename scalar_t, typename accscalar_t, typename IndexType>
void apex_dropout_add_cuda(scalar_t const* inputs, scalar_t const* add_inputs, scalar_t* outputs, uint8_t* mask,
                           IndexType totalElements, accscalar_t p) {
  auto gen = at::cuda::detail::getDefaultCUDAGenerator();

  int block_size = 256;
  dim3 dim_block(block_size);
  dim3 grid((totalElements + block_size - 1) / block_size);
  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);

  // number of times random will be generated per thread, to offset philox
  // counter in the random state
  int64_t counter_offset = ((totalElements - 1) / (block_size * grid.x * UNROLL) + 1) * UNROLL;
  std::pair<uint64_t, uint64_t> rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen.mutex());
    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(counter_offset);
  }

  apex_dropout_add_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
      inputs, add_inputs, outputs, mask, totalElements, p, rng_engine_inputs);
  C10_CUDA_CHECK(cudaGetLastError());
}

template <typename scalar_t, typename accscalar_t, typename IndexType>
void apex_add_cuda(scalar_t const* inputs, scalar_t const* add_inputs, scalar_t* outputs, IndexType totalElements) {
  int block_size = 256;
  dim3 dim_block(block_size);
  dim3 grid((totalElements + block_size - 1) / block_size);
  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);

  apex_add_kernel<scalar_t, accscalar_t, IndexType>
      <<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, add_inputs, outputs, totalElements);
  C10_CUDA_CHECK(cudaGetLastError());
}

template <typename scalar_t, typename accscalar_t, typename IndexType>
void apex_masked_scale_cuda(scalar_t const* inputs, scalar_t* outputs, uint8_t const* mask, IndexType totalElements,
                            accscalar_t scale) {
  int block_size = 256;
  dim3 dim_block(block_size);
  dim3 grid((totalElements + block_size - 1) / block_size);
  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);

  apex_masked_scale_kernel<scalar_t, accscalar_t, IndexType>
      <<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, outputs, mask, totalElements, scale);
  C10_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>

#include <iostream>
#include <vector>

#include "dropout.cuh"
#include "softmax.cuh"
#include "strided_batched_gemm.cuh"

namespace multihead_attn {
namespace encdec {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs_q,
                                    torch::Tensor const& inputs_kv, torch::Tensor const& input_weights_q,
                                    torch::Tensor const& input_weights_kv, torch::Tensor const& output_weights,
                                    const uint8_t* pad_mask, float dropout_prob) {
  const int embed_dim = inputs_q.size(2);
  const int sequences = inputs_q.size(1);
  const int q_seq_len = inputs_q.size(0);
  const int k_seq_len = inputs_kv.size(0);
  const int batches_q = sequences * q_seq_len;
  const int batches_kv = sequences * k_seq_len;
  const int head_dim = embed_dim / heads;
  const int output_lin_q_dim = embed_dim;
  const int output_lin_kv_dim = 2 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim_q = attn_batches * head_dim;
  const int lead_dim_kv = attn_batches * 2 * head_dim;
  const int batch_stride_q = head_dim;
  const int batch_stride_kv = 2 * head_dim;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // There is no reason to use more than one stream as every kernel is
  // sequentially dependent
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // 3 Intermediate Results + Output (Note: dropout intermediates are generated
  // by ATen library code)
  auto act_options = inputs_q.options().requires_grad(false);
  auto mask_options = act_options.dtype(torch::kUInt8);

  torch::Tensor input_lin_q_results = torch::empty({q_seq_len, sequences, output_lin_q_dim}, act_options);
  torch::Tensor input_lin_kv_results = torch::empty({k_seq_len, sequences, output_lin_kv_dim}, act_options);
  torch::Tensor softmax_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);
  torch::Tensor matmul2_results = torch::empty({q_seq_len, attn_batches, head_dim}, act_options);
  torch::Tensor outputs = torch::empty_like(inputs_q, act_options);

  // Input Linear Results Pointers to Q, K, and V of interviewed activations
  void* q_lin_results_ptr = static_cast<void*>(input_lin_q_results.data_ptr());
  void* k_lin_results_ptr = static_cast<void*>(input_lin_kv_results.data_ptr());
  void* v_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_kv_results.data_ptr()) + head_dim);

  // Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  char a_layout_t{'t'};
  char a_layout_n{'n'};
  char b_layout_n{'n'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
  // Input Linear Q Fwd
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, output_lin_q_dim, batches_q, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights_q.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(inputs_q.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      q_lin_results_ptr, CUDA_R_16F, output_lin_q_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear KV Fwd
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, output_lin_kv_dim, batches_kv, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights_kv.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(inputs_kv.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      k_lin_results_ptr, CUDA_R_16F, output_lin_kv_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul1 of Dot-Product Attention Plus scaling by 1/Sqrt(head size)
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, scale,
                        static_cast<const half*>(k_lin_results_ptr), lead_dim_kv, batch_stride_kv,
                        static_cast<const half*>(q_lin_results_ptr), lead_dim_q, batch_stride_q, beta,
                        static_cast<half*>(softmax_results_ptr), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Padded Softmax
  bool softmax_success = false;
  if (pad_mask == nullptr) {
    softmax_success = dispatch_softmax<half, half, float>(reinterpret_cast<half*>(softmax_results_ptr),
                                                          reinterpret_cast<const half*>(softmax_results_ptr), k_seq_len,
                                                          k_seq_len, attn_batches * q_seq_len);
  } else {
    if (use_time_mask) {
      softmax_success = dispatch_time_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, q_seq_len);
    } else {
      softmax_success = dispatch_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, attn_batches * q_seq_len / sequences);
    }
  }
  assert(softmax_success);

  if (is_training) {
    apex_fused_dropout_cuda<at::Half, float, uint32_t>(
        static_cast<at::Half const*>(softmax_results.data_ptr()), static_cast<at::Half*>(dropout_results.data_ptr()),
        static_cast<uint8_t*>(dropout_mask.data_ptr()), dropout_elems, (1.0f - dropout_prob));
  }

  // Matmul2
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim_kv, batch_stride_kv,
                        (is_training) ? static_cast<const half*>(dropout_results.data_ptr())
                                      : static_cast<const half*>(softmax_results.data_ptr()),
                        k_seq_len, k_seq_len * q_seq_len, beta, static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim * attn_batches, head_dim, attn_batches);

  // Output Linear
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, embed_dim, batches_q, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      static_cast<void*>(outputs.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO1_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_lin_q_results, input_lin_kv_results, softmax_results, dropout_results,
          dropout_mask,        matmul2_results,      outputs};
}

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_q_results, torch::Tensor const& input_lin_kv_results,
                                    torch::Tensor const& inputs_q, torch::Tensor const& inputs_kv,
                                    torch::Tensor const& input_weights_q, torch::Tensor const& input_weights_kv,
                                    torch::Tensor const& output_weights, torch::Tensor const& dropout_mask,
                                    float dropout_prob) {
  const int embed_dim = inputs_q.size(2);
  const int sequences = inputs_q.size(1);
  const int q_seq_len = inputs_q.size(0);
  const int k_seq_len = inputs_kv.size(0);
  const int batches_q = sequences * q_seq_len;
  const int batches_kv = sequences * k_seq_len;
  const int head_dim = embed_dim / heads;
  const int output_lin_q_dim = embed_dim;
  const int output_lin_kv_dim = 2 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim_q = attn_batches * head_dim;
  const int lead_dim_kv = attn_batches * 2 * head_dim;
  const int batch_stride_q = head_dim;
  const int batch_stride_kv = 2 * head_dim;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // TODO: Streams can be used in Backprop but I haven't added more than one
  // in my first attempt to create the code
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // Output Tensor Allocations
  torch::Tensor input_q_grads = torch::empty_like(inputs_q);
  torch::Tensor input_kv_grads = torch::empty_like(inputs_kv);
  torch::Tensor input_weight_q_grads = torch::empty_like(input_weights_q);
  torch::Tensor input_weight_kv_grads = torch::empty_like(input_weights_kv);
  torch::Tensor output_weight_grads = torch::empty_like(output_weights);
  // Intermediate Tensor Allocations
  at::Tensor output_lin_grads = torch::empty_like(matmul2_results);
  at::Tensor matmul2_grads = torch::empty_like(dropout_results);
  at::Tensor input_lin_q_output_grads = torch::empty_like(input_lin_q_results);
  at::Tensor input_lin_kv_output_grads = torch::empty_like(input_lin_kv_results);

  auto q_lin_results_ptr = static_cast<half*>(input_lin_q_results.data_ptr());
  auto k_lin_results_ptr = static_cast<half*>(input_lin_kv_results.data_ptr());
  auto v_lin_results_ptr = static_cast<half*>(input_lin_kv_results.data_ptr()) + head_dim;

  auto q_lin_grads_ptr = static_cast<half*>(input_lin_q_output_grads.data_ptr());
  auto k_lin_grads_ptr = static_cast<half*>(input_lin_kv_output_grads.data_ptr());
  auto v_lin_grads_ptr = static_cast<half*>(input_lin_kv_output_grads.data_ptr()) + head_dim;

  char a_layout_n{'n'};
  char a_layout_t{'t'};
  char b_layout_n{'n'};
  char b_layout_t{'t'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

  // Output Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches_q, embed_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(output_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_lin_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Output Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, embed_dim, batches_q,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(output_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_weight_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul2 Dgrad1
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim_kv, batch_stride_kv,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim, beta,
                        static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Matmul2 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, alpha,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim,
                        static_cast<const half*>(dropout_results.data_ptr()), k_seq_len, k_seq_len * q_seq_len, beta,
                        v_lin_grads_ptr, lead_dim_kv, batch_stride_kv, attn_batches);

  // Apply Dropout Mask and Scale by Dropout Probability
  apex_masked_scale_cuda<at::Half, float, uint32_t>(
      static_cast<at::Half const*>(matmul2_grads.data_ptr()), static_cast<at::Half*>(matmul2_grads.data_ptr()),
      static_cast<uint8_t const*>(dropout_mask.data_ptr()), dropout_elems, (1.0 / (1.0 - dropout_prob)));

  // Softmax Grad
  bool softmax_success = false;
  softmax_success = dispatch_softmax_backward<half, half, float>(
      static_cast<half*>(matmul2_grads.data_ptr()), static_cast<half*>(matmul2_grads.data_ptr()),
      reinterpret_cast<half const*>(softmax_results.data_ptr()), k_seq_len, k_seq_len, attn_batches * q_seq_len);
  assert(softmax_success);

  // Matmul1 Dgrad1
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, scale, k_lin_results_ptr, lead_dim_kv,
                        batch_stride_kv, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, q_lin_grads_ptr, lead_dim_q, batch_stride_q, attn_batches);

  // Matmul1 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, scale, q_lin_results_ptr, lead_dim_q,
                        batch_stride_q, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, k_lin_grads_ptr, lead_dim_kv, batch_stride_kv, attn_batches);

  // Input Linear Q Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches_q, output_lin_q_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights_q.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(q_lin_grads_ptr), CUDA_R_16F, output_lin_q_dim, static_cast<const void*>(&beta),
      static_cast<void*>(input_q_grads.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO10_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear Q Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, output_lin_q_dim, batches_q,
                                    static_cast<const void*>(&alpha), static_cast<const void*>(inputs_q.data_ptr()),
                                    CUDA_R_16F, embed_dim, static_cast<const void*>(q_lin_grads_ptr), CUDA_R_16F,
                                    output_lin_q_dim, static_cast<const void*>(&beta),
                                    static_cast<void*>(input_weight_q_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear KV Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches_kv, output_lin_kv_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights_kv.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(k_lin_grads_ptr), CUDA_R_16F, output_lin_kv_dim, static_cast<const void*>(&beta),
      static_cast<void*>(input_kv_grads.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO10_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear KV Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, output_lin_kv_dim, batches_kv,
                                    static_cast<const void*>(&alpha), static_cast<const void*>(inputs_kv.data_ptr()),
                                    CUDA_R_16F, embed_dim, static_cast<const void*>(k_lin_grads_ptr), CUDA_R_16F,
                                    output_lin_kv_dim, static_cast<const void*>(&beta),
                                    static_cast<void*>(input_weight_kv_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_q_grads, input_kv_grads, input_weight_q_grads, input_weight_kv_grads, output_weight_grads};
}

}  // end namespace cublas_gemmex
}  // end namespace encdec
}  // end namespace multihead_attn


================================================
FILE: apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>

#include <iostream>
#include <vector>

#include "dropout.cuh"
#include "layer_norm.cuh"
#include "softmax.cuh"
#include "strided_batched_gemm.cuh"

namespace multihead_attn {
namespace encdec_norm_add {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs_q,
                                    torch::Tensor const& inputs_kv, torch::Tensor const& lyr_nrm_gamma_weights,
                                    torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights_q,
                                    torch::Tensor const& input_weights_kv, torch::Tensor const& output_weights,
                                    const uint8_t* pad_mask, float dropout_prob) {
  const int embed_dim = inputs_q.size(2);
  const int sequences = inputs_q.size(1);
  const int q_seq_len = inputs_q.size(0);
  const int k_seq_len = inputs_kv.size(0);
  const int batches_q = sequences * q_seq_len;
  const int batches_kv = sequences * k_seq_len;
  const int total_tokens_q = batches_q * embed_dim;
  const int head_dim = embed_dim / heads;
  const int output_lin_q_dim = embed_dim;
  const int output_lin_kv_dim = 2 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim_q = attn_batches * head_dim;
  const int lead_dim_kv = attn_batches * 2 * head_dim;
  const int batch_stride_q = head_dim;
  const int batch_stride_kv = 2 * head_dim;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // There is no reason to use more than one stream as every kernel is
  // sequentially dependent
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // 3 Intermediate Results + Output (Note: dropout intermediates are generated
  // by ATen library code)
  auto act_options = inputs_q.options().requires_grad(false);
  auto lyr_nrm_options = act_options.dtype(torch::kFloat32);
  auto mask_options = act_options.dtype(torch::kUInt8);

  torch::Tensor lyr_nrm_mean = torch::empty({batches_q}, lyr_nrm_options);
  torch::Tensor lyr_nrm_invvar = torch::empty({batches_q}, lyr_nrm_options);
  torch::Tensor lyr_nrm_results = torch::empty_like(inputs_q, act_options);

  torch::Tensor input_lin_q_results = torch::empty({q_seq_len, sequences, output_lin_q_dim}, act_options);
  torch::Tensor input_lin_kv_results = torch::empty({k_seq_len, sequences, output_lin_kv_dim}, act_options);
  torch::Tensor softmax_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);
  torch::Tensor matmul2_results = torch::empty({q_seq_len, attn_batches, head_dim}, act_options);
  torch::Tensor output_lin_results = torch::empty_like(inputs_q, act_options);
  torch::Tensor dropout_add_mask = torch::empty_like(inputs_q, mask_options);
  torch::Tensor outputs = torch::empty_like(inputs_q, act_options);

  // Input Linear Results Pointers to Q, K, and V of interviewed activations
  void* q_lin_results_ptr = static_cast<void*>(input_lin_q_results.data_ptr());
  void* k_lin_results_ptr = static_cast<void*>(input_lin_kv_results.data_ptr());
  void* v_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_kv_results.data_ptr()) + head_dim);

  // Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  char a_layout_t{'t'};
  char a_layout_n{'n'};
  char b_layout_n{'n'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
  // Layer Norm
  HostApplyLayerNorm<at::Half, float>(
      static_cast<at::Half*>(lyr_nrm_results.data_ptr()), static_cast<float*>(lyr_nrm_mean.data_ptr()),
      static_cast<float*>(lyr_nrm_invvar.data_ptr()), static_cast<const at::Half*>(inputs_q.data_ptr()),
      static_cast<int>(batches_q),  // n1
      static_cast<int>(embed_dim),  // n2
      1.0e-5, static_cast<const at::Half*>(lyr_nrm_gamma_weights.data_ptr()),
      static_cast<const at::Half*>(lyr_nrm_beta_weights.data_ptr()));

  // Input Linear Q Fwd
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, output_lin_q_dim, batches_q, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights_q.data_ptr()), CUDA_R_16F, embed_dim,
      // static_cast<const void*>(inputs_q.data_ptr()),
      static_cast<const void*>(lyr_nrm_results.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      q_lin_results_ptr, CUDA_R_16F, output_lin_q_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear KV Fwd
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, output_lin_kv_dim, batches_kv, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights_kv.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(inputs_kv.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      k_lin_results_ptr, CUDA_R_16F, output_lin_kv_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul1 of Dot-Product Attention Plus scaling by 1/Sqrt(head size)
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, scale,
                        static_cast<const half*>(k_lin_results_ptr), lead_dim_kv, batch_stride_kv,
                        static_cast<const half*>(q_lin_results_ptr), lead_dim_q, batch_stride_q, beta,
                        static_cast<half*>(softmax_results_ptr), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Padded Softmax
  bool softmax_success = false;
  if (pad_mask == nullptr) {
    softmax_success = dispatch_softmax<half, half, float>(reinterpret_cast<half*>(softmax_results_ptr),
                                                          reinterpret_cast<const half*>(softmax_results_ptr), k_seq_len,
                                                          k_seq_len, attn_batches * q_seq_len);
  } else {
    if (use_time_mask) {
      softmax_success = dispatch_time_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, q_seq_len);
    } else {
      softmax_success = dispatch_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, attn_batches * q_seq_len / sequences);
    }
  }
  assert(softmax_success);

  if (is_training) {
    apex_fused_dropout_cuda<at::Half, float, uint32_t>(
        static_cast<at::Half const*>(softmax_results.data_ptr()), static_cast<at::Half*>(dropout_results.data_ptr()),
        static_cast<uint8_t*>(dropout_mask.data_ptr()), dropout_elems, (1.0f - dropout_prob));
  }

  // Matmul2
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim_kv, batch_stride_kv,
                        (is_training) ? static_cast<const half*>(dropout_results.data_ptr())
                                      : static_cast<const half*>(softmax_results.data_ptr()),
                        // static_cast<const half*>(dropout_results.data_ptr()),
                        k_seq_len, k_seq_len * q_seq_len, beta, static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim * attn_batches, head_dim, attn_batches);

  // Output Linear
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, embed_dim, batches_q, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      static_cast<void*>(output_lin_results.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO1_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // End-of-block Dropout-Add
  if (is_training) {
    apex_dropout_add_cuda<at::Half, float, uint32_t>(
        static_cast<at::Half const*>(output_lin_results.data_ptr()), static_cast<at::Half const*>(inputs_q.data_ptr()),
        static_cast<at::Half*>(outputs.data_ptr()), static_cast<uint8_t*>(dropout_add_mask.data_ptr()), total_tokens_q,
        (1.0f - dropout_prob));
  } else {
    apex_add_cuda<at::Half, float, uint32_t>(static_cast<at::Half const*>(output_lin_results.data_ptr()),
                                             static_cast<at::Half const*>(inputs_q.data_ptr()),
                                             static_cast<at::Half*>(outputs.data_ptr()), total_tokens_q);
  }

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {lyr_nrm_results,      lyr_nrm_mean,     lyr_nrm_invvar,  input_lin_q_results,
          input_lin_kv_results, softmax_results,  dropout_results, dropout_mask,
          matmul2_results,      dropout_add_mask, outputs};
}

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_q_results, torch::Tensor const& input_lin_kv_results,
                                    torch::Tensor const& lyr_nrm_results, torch::Tensor const& lyr_nrm_mean,
                                    torch::Tensor const& lyr_nrm_invvar, torch::Tensor const& inputs_q,
                                    torch::Tensor const& inputs_kv, torch::Tensor const& lyr_nrm_gamma_weights,
                                    torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights_q,
                                    torch::Tensor const& input_weights_kv, torch::Tensor const& output_weights,
                                    torch::Tensor const& dropout_mask, torch::Tensor const& dropout_add_mask,
                                    float dropout_prob) {
  const int embed_dim = inputs_q.size(2);
  const int sequences = inputs_q.size(1);
  const int q_seq_len = inputs_q.size(0);
  const int k_seq_len = inputs_kv.size(0);
  const int batches_q = sequences * q_seq_len;
  const int batches_kv = sequences * k_seq_len;
  const int total_tokens_q = batches_q * embed_dim;
  const int head_dim = embed_dim / heads;
  const int output_lin_q_dim = embed_dim;
  const int output_lin_kv_dim = 2 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim_q = attn_batches * head_dim;
  const int lead_dim_kv = attn_batches * 2 * head_dim;
  const int batch_stride_q = head_dim;
  const int batch_stride_kv = 2 * head_dim;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // TODO: Streams can be used in Backprop but I haven't added more than one
  // in my first attempt to create the code
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // Output Tensor Allocations
  torch::Tensor input_q_grads = torch::empty_like(inputs_q);
  torch::Tensor input_kv_grads = torch::empty_like(inputs_kv);
  torch::Tensor lyr_nrm_gamma_grads = torch::empty_like(lyr_nrm_gamma_weights);
  torch::Tensor lyr_nrm_beta_grads = torch::empty_like(lyr_nrm_beta_weights);
  torch::Tensor input_weight_q_grads = torch::empty_like(input_weights_q);
  torch::Tensor input_weight_kv_grads = torch::empty_like(input_weights_kv);
  torch::Tensor output_weight_grads = torch::empty_like(output_weights);
  // Intermediate Tensor Allocations
  at::Tensor dropout_add_grads = torch::empty_like(output_grads);
  at::Tensor output_lin_grads = torch::empty_like(matmul2_results);
  at::Tensor matmul2_grads = torch::empty_like(dropout_results);
  at::Tensor input_lin_q_output_grads = torch::empty_like(input_lin_q_results);
  at::Tensor input_lin_kv_output_grads = torch::empty_like(input_lin_kv_results);
  at::Tensor input_lin_q_grads = torch::empty_like(inputs_q);

  auto q_lin_results_ptr = static_cast<half*>(input_lin_q_results.data_ptr());
  auto k_lin_results_ptr = static_cast<half*>(input_lin_kv_results.data_ptr());
  auto v_lin_results_ptr = static_cast<half*>(input_lin_kv_results.data_ptr()) + head_dim;

  auto q_lin_grads_ptr = static_cast<half*>(input_lin_q_output_grads.data_ptr());
  auto k_lin_grads_ptr = static_cast<half*>(input_lin_kv_output_grads.data_ptr());
  auto v_lin_grads_ptr = static_cast<half*>(input_lin_kv_output_grads.data_ptr()) + head_dim;

  char a_layout_n{'n'};
  char a_layout_t{'t'};
  char b_layout_n{'n'};
  char b_layout_t{'t'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

  // Dropout Add Backward
  apex_masked_scale_cuda<at::Half, float, uint32_t>(
      static_cast<at::Half const*>(output_grads.data_ptr()), static_cast<at::Half*>(dropout_add_grads.data_ptr()),
      static_cast<uint8_t const*>(dropout_add_mask.data_ptr()), total_tokens_q, (1.0 / (1.0 - dropout_prob)));

  // Output Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches_q, embed_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(dropout_add_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_lin_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Output Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, embed_dim, batches_q,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(dropout_add_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_weight_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul2 Dgrad1
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim_kv, batch_stride_kv,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim, beta,
                        static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Matmul2 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, alpha,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim,
                        static_cast<const half*>(dropout_results.data_ptr()), k_seq_len, k_seq_len * q_seq_len, beta,
                        v_lin_grads_ptr, lead_dim_kv, batch_stride_kv, attn_batches);

  // Apply Dropout Mask and Scale by Dropout Probability
  apex_masked_scale_cuda<at::Half, float, uint32_t>(
      static_cast<at::Half const*>(matmul2_grads.data_ptr()), static_cast<at::Half*>(matmul2_grads.data_ptr()),
      static_cast<uint8_t const*>(dropout_mask.data_ptr()), dropout_elems, (1.0 / (1.0 - dropout_prob)));

  // Softmax Grad
  bool softmax_success = false;
  softmax_success = dispatch_softmax_backward<half, half, float>(
      static_cast<half*>(matmul2_grads.data_ptr()), static_cast<half*>(matmul2_grads.data_ptr()),
      reinterpret_cast<half const*>(softmax_results.data_ptr()), k_seq_len, k_seq_len, attn_batches * q_seq_len);
  assert(softmax_success);

  // Matmul1 Dgrad1
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, scale, k_lin_results_ptr, lead_dim_kv,
                        batch_stride_kv, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, q_lin_grads_ptr, lead_dim_q, batch_stride_q, attn_batches);

  // Matmul1 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, scale, q_lin_results_ptr, lead_dim_q,
                        batch_stride_q, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, k_lin_grads_ptr, lead_dim_kv, batch_stride_kv, attn_batches);

  // Input Linear Q Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches_q, output_lin_q_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights_q.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(q_lin_grads_ptr), CUDA_R_16F, output_lin_q_dim, static_cast<const void*>(&beta),
      // static_cast<void*>(input_q_grads.data_ptr()),
      static_cast<void*>(input_lin_q_grads.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO10_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear Q Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, output_lin_q_dim, batches_q,
                                    static_cast<const void*>(&alpha), static_cast<const void*>(inputs_q.data_ptr()),
                                    CUDA_R_16F, embed_dim, static_cast<const void*>(q_lin_grads_ptr), CUDA_R_16F,
                                    output_lin_q_dim, static_cast<const void*>(&beta),
                                    static_cast<void*>(input_weight_q_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear KV Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches_kv, output_lin_kv_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights_kv.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(k_lin_grads_ptr), CUDA_R_16F, output_lin_kv_dim, static_cast<const void*>(&beta),
      static_cast<void*>(input_kv_grads.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO10_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear KV Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, output_lin_kv_dim, batches_kv,
                                    static_cast<const void*>(&alpha), static_cast<const void*>(inputs_kv.data_ptr()),
                                    CUDA_R_16F, embed_dim, static_cast<const void*>(k_lin_grads_ptr), CUDA_R_16F,
                                    output_lin_kv_dim, static_cast<const void*>(&beta),
                                    static_cast<void*>(input_weight_kv_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Fused Layer Norm Bwd with Residual Add
  HostLayerNormGradient<half, float>(
      static_cast<const half*>(input_lin_q_grads.data_ptr()), static_cast<half const*>(output_grads.data_ptr()),
      static_cast<const float*>(lyr_nrm_mean.data_ptr()), static_cast<const float*>(lyr_nrm_invvar.data_ptr()),
      inputs_q,
      static_cast<int>(batches_q),  // n1
      static_cast<int>(embed_dim),  // n2
      static_cast<const half*>(lyr_nrm_gamma_weights.data_ptr()),
      static_cast<const half*>(lyr_nrm_beta_weights.data_ptr()), 1.0e-5, static_cast<half*>(input_q_grads.data_ptr()),
      static_cast<half*>(lyr_nrm_gamma_grads.data_ptr()), static_cast<half*>(lyr_nrm_beta_grads.data_ptr()));

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_q_grads,        input_kv_grads,        lyr_nrm_gamma_grads, lyr_nrm_beta_grads,
          input_weight_q_grads, input_weight_kv_grads, output_weight_grads};
}

}  // end namespace cublas_gemmex
}  // end namespace encdec_norm_add
}  // end namespace multihead_attn


================================================
FILE: apex/contrib/csrc/multihead_attn/layer_norm.cuh
================================================
#pragma once
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <ATen/cuda/DeviceUtils.cuh>

namespace {
template <typename U>
__device__ void cuWelfordOnlineSum(const U curr, U& mu, U& sigma2, U& count) {
  count = count + U(1);
  U delta = curr - mu;
  U lmean = mu + delta / count;
  mu = lmean;
  U delta2 = curr - lmean;
  sigma2 = sigma2 + delta * delta2;
}

template <typename U>
__device__ void cuChanOnlineSum(const U muB, const U sigma2B, const U countB, U& mu, U& sigma2, U& count) {
  U delta = muB - mu;
  U nA = count;
  U nB = countB;
  count = count + countB;
  U nX = count;
  if (nX > U(0)) {
    nA = nA / nX;
    nB = nB / nX;
    mu = nA * mu + nB * muB;
    sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX;
  } else {
    mu = U(0);
    sigma2 = U(0);
  }
}

template <typename T, typename U>
__device__ void cuWelfordMuSigma2(const T* __restrict__ vals, const int n1, const int n2, const int i1, U& mu,
                                  U& sigma2, U* buf) {
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  U count = U(0);
  mu = U(0);
  sigma2 = U(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const T* lvals = vals + i1 * n2;
    int l = 4 * thrx;
    for (; l + 3 < n2; l += 4 * numx) {
      for (int k = 0; k < 4; ++k) {
        U curr = static_cast<U>(lvals[l + k]);
        cuWelfordOnlineSum<U>(curr, mu, sigma2, count);
      }
    }
    for (; l < n2; ++l) {
      U curr = static_cast<U>(lvals[l]);
      cuWelfordOnlineSum<U>(curr, mu, sigma2, count);
    }
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x + (1 << l)) & 31;
      U muB = WARP_SHFL(mu, srcLaneB);
      U countB = WARP_SHFL(count, srcLaneB);
      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      U* ubuf = (U*)buf;
      U* ibuf = (U*)(ubuf + blockDim.y);
      for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2 * wrt_y] = mu;
          ubuf[2 * wrt_y + 1] = sigma2;
          ibuf[wrt_y] = count;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          U muB = ubuf[2 * threadIdx.y];
          U sigma2B = ubuf[2 * threadIdx.y + 1];
          U countB = ibuf[threadIdx.y];
          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
        }
        __syncthreads();
      }
      // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        ubuf[0] = mu;
        ubuf[1] = sigma2;
      }
      __syncthreads();
      mu = ubuf[0];
      sigma2 = ubuf[1] / U(n2);
      // don't care about final value of count, we know count == n2
    } else {
      mu = WARP_SHFL(mu, 0);
      sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
    }
  }
}

template <>
__device__ void cuWelfordMuSigma2(const at::Half* __restrict__ vals, const int n1, const int n2, const int i1,
                                  float& mu, float& sigma2, float* buf) {
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  float count = 0.0f;
  mu = float(0);
  sigma2 = float(0);

  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const at::Half* lvals = vals + i1 * n2;
    int l = 8 * thrx;
    if ((((size_t)lvals) & 3) != 0) {
      // 16 bit alignment
      // first thread consumes first point
      if (thrx == 0) {
        float curr = static_cast<float>(lvals[0]);
        cuWelfordOnlineSum(curr, mu, sigma2, count);
      }
      ++l;
    }
    // at this point, lvals[l] are 32 bit aligned for all threads.
    for (; l + 7 < n2; l += 8 * numx) {
      for (int k = 0; k < 8; k += 2) {
        float2 curr = __half22float2(*((__half2*)(lvals + l + k)));
        cuWelfordOnlineSum(curr.x, mu, sigma2, count);
        cuWelfordOnlineSum(curr.y, mu, sigma2, count);
      }
    }
    for (; l < n2; ++l) {
      float curr = static_cast<float>(lvals[l]);
      cuWelfordOnlineSum(curr, mu, sigma2, count);
    }
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x + (1 << l)) & 31;
      float muB = WARP_SHFL(mu, srcLaneB);
      float countB = WARP_SHFL(count, srcLaneB);
      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      float* ubuf = (float*)buf;
      float* ibuf = (float*)(ubuf + blockDim.y);
      for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2 * wrt_y] = mu;
          ubuf[2 * wrt_y + 1] = sigma2;
          ibuf[wrt_y] = count;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          float muB = ubuf[2 * threadIdx.y];
          float sigma2B = ubuf[2 * threadIdx.y + 1];
          float countB = ibuf[threadIdx.y];
          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
        }
        __syncthreads();
      }
      // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        ubuf[0] = mu;
        ubuf[1] = sigma2;
      }
      __syncthreads();
      mu = ubuf[0];
      sigma2 = ubuf[1] / float(n2);
      // don't care about final value of count, we know count == n2
    } else {
      mu = WARP_SHFL(mu, 0);
      sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
    }
  }
}

template <typename U>
__device__ U rsqrt(U v) {
  return U(1) / sqrt(v);
}
template <>
__device__ float rsqrt(float v) {
  return rsqrtf(v);
}
template <>
__device__ double rsqrt(double v) {
  return rsqrt(v);
}

// This is the un-specialized struct.  Note that we prevent instantiation of
// this struct by putting an undefined symbol in the function body so it won't
// compile.
//  template <typename T>
//  struct SharedMemory
//  {
//      // Ensure that we won't compile any un-specialized types
//      __device__ T *getPointer()
//      {
//          extern __device__ void error(void);
//          error();
//          return NULL;
//      }
//  };
// https://github.com/NVIDIA/apex/issues/246
template <typename T>
struct SharedMemory;
template <>
struct SharedMemory<float> {
  __device__ float* getPointer() {
    extern __shared__ float s_float[];
    return s_float;
  }
};

template <>
struct SharedMemory<double> {
  __device__ double* getPointer() {
    extern __shared__ double s_double[];
    return s_double;
  }
};

template <typename T, typename U>
__global__ void cuApplyLayerNorm(T* __restrict__ output_vals, U* __restrict__ mean, U* __restrict__ invvar,
                                 const T* __restrict__ vals, const int n1, const int n2, const U epsilon,
                                 const T* __restrict__ gamma, const T* __restrict__ beta) {
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensors are contiguous
  //
  for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu, sigma2;
    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
    const T* lvals = vals + i1 * n2;
    T* ovals = output_vals + i1 * n2;
    U c_invvar = rsqrt(sigma2 + epsilon);
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != NULL && beta != NULL) {
      for (int i = thrx; i < n2; i += numx) {
        U curr = static_cast<U>(lvals[i]);
        ovals[i] = gamma[i] * static_cast<T>(c_invvar * (curr - mu)) + beta[i];
      }
    } else {
      for (int i = thrx; i < n2; i += numx) {
        U curr = static_cast<U>(lvals[i]);
        ovals[i] = static_cast<T>(c_invvar * (curr - mu));
      }
    }
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      mean[i1] = mu;
      invvar[i1] = c_invvar;
    }
  }
}

template <typename T, typename U>
__device__ void cuLoadWriteStridedInputs(const int i1_block, const int thr_load_row_off, const int thr_load_col_off,
                                         const int i2_off, const int row_stride, U* warp_buf1, U* warp_buf2,
                                         const T* input, const T* dout, const int i1_end, const int n2,
                                         const U* __restrict__ mean, const U* __restrict__ invvar) {
  int i1 = i1_block + thr_load_row_off;
  if (i1 < i1_end) {
    U curr_mean = mean[i1];
    U curr_invvar = invvar[i1];
    for (int k = 0; k < blockDim.y; ++k) {
      int i2 = i2_off + k;
      int load_idx = i1 * n2 + i2;
      int write_idx = thr_load_row_off * row_stride + thr_load_col_off + k;
      if (i2 < n2) {
        U curr_input = static_cast<U>(input[load_idx]);
        U curr_dout = static_cast<U>(dout[load_idx]);
        warp_buf1[write_idx] = curr_dout;
        warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_invvar;
      } else {
        warp_buf1[write_idx] = U(0);
        warp_buf2[write_idx] = U(0);
      }
    }
  } else {
    for (int k = 0; k < blockDim.y; ++k) {
      int write_idx = thr_load_row_off * row_stride + thr_load_col_off + k;
      warp_buf1[write_idx] = U(0);
      warp_buf2[write_idx] = U(0);
    }
  }
}

template <typename T, typename U>
__device__ void cuLoadAddStridedInputs(const int i1_block, const int thr_load_row_off, const int thr_load_col_off,
                                       const int i2_off, const int row_stride, U* warp_buf1, U* warp_buf2,
                                       const T* input, const T* dout, const int i1_end, const int n2,
                                       const U* __restrict__ mean, const U* __restrict__ invvar) {
  int i1 = i1_block + thr_load_row_off;
  if (i1 < i1_end) {
    U curr_mean = mean[i1];
    U curr_invvar = invvar[i1];
    for (int k = 0; k < blockDim.y; ++k) {
      int i2 = i2_off + k;
      int load_idx = i1 * n2 + i2;
      int write_idx = thr_load_row_off * row_stride + thr_load_col_off + k;
      if (i2 < n2) {
        U curr_input = static_cast<U>(input[load_idx]);
        U curr_dout = static_cast<U>(dout[load_idx]);
        warp_buf1[write_idx] += curr_dout;
        warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_invvar;
      }
    }
  }
}

template <typename T, typename U>
__global__ void cuComputePartGradGammaBeta(const T* __restrict__ dout, const T* __restrict__ input, const int n1,
                                           const int n2, const U* __restrict__ mean, const U* __restrict__ invvar,
                                           U epsilon, U* part_grad_gamma, U* part_grad_beta) {
  const int numsegs_n1 = (n1 + blockDim.y * blockDim.y - 1) / (blockDim.y * blockDim.y);
  const int segs_per_block = (numsegs_n1 + gridDim.y - 1) / gridDim.y;
  const int i1_beg = blockIdx.y * segs_per_block * blockDim.y * blockDim.y;
  const int i1_beg_plus_one = (blockIdx.y + 1) * segs_per_block * blockDim.y * blockDim.y;
  const int i1_end = i1_beg_plus_one < n1 ? i1_beg_plus_one : n1;
  const int row_stride = blockDim.x + 1;
  const int thr_load_col_off = (threadIdx.x * blockDim.y) & (blockDim.x - 1);
  const int thr_load_row_off = (threadIdx.x * blockDim.y) / blockDim.x + threadIdx.y * blockDim.y;
  const int i2_off = blockIdx.x * blockDim.x + thr_load_col_off;
  SharedMemory<U> shared;
  U* buf = shared.getPointer();  // buf has at least blockDim.x * blockDim.y *
                                 // blockDim.y + (blockDim.y -
                                 // 1)*(blockDim.x/blockDim.y) elements
  U* warp_buf1 = (U*)buf;
  U* warp_buf2 = warp_buf1 + blockDim.y * blockDim.y * row_stride;
  // compute partial sums from strided inputs
  // do this to increase number of loads in flight
  cuLoadWriteStridedInputs(i1_beg, thr_load_row_off, thr_load_col_off, i2_off, row_stride, warp_buf1, warp_buf2, input,
                           dout, i1_end, n2, mean, invvar);
  for (int i1_block = i1_beg + blockDim.y * blockDim.y; i1_block < i1_end; i1_block += blockDim.y * blockDim.y) {
    cuLoadAddStridedInputs(i1_block, thr_load_row_off, thr_load_col_off, i2_off, row_stride, warp_buf1, warp_buf2,
                           input, dout, i1_end, n2, mean, invvar);
  }
  __syncthreads();
  // inter-warp reductions
  // sum within each warp
  U acc1 = U(0);
  U acc2 = U(0);
  for (int k = 0; k < blockDim.y; ++k) {
    int row1 = threadIdx.y + k * blockDim.y;
    int idx1 = row1 * row_stride + threadIdx.x;
    acc1 += warp_buf1[idx1];
    acc2 += warp_buf2[idx1];
  }
  warp_buf1[threadIdx.y * row_stride + threadIdx.x] = acc1;
  warp_buf2[threadIdx.y * row_stride + threadIdx.x] = acc2;
  __syncthreads();
  // sum all warps
  for (int offset = blockDim.y / 2; offset > 1; offset /= 2) {
    if (threadIdx.y < offset) {
      int row1 = threadIdx.y;
      int row2 = threadIdx.y + offset;
      int idx1 = row1 * row_stride + threadIdx.x;
      int idx2 = row2 * row_stride + threadIdx.x;
      warp_buf1[idx1] += warp_buf1[idx2];
      warp_buf2[idx1] += warp_buf2[idx2];
    }
    __syncthreads();
  }
  int i2 = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadIdx.y == 0 && i2 < n2) {
    int row1 = threadIdx.y;
    int row2 = threadIdx.y + 1;
    int idx1 = row1 * row_stride + threadIdx.x;
    int idx2 = row2 * row_stride + threadIdx.x;
    part_grad_beta[blockIdx.y * n2 + i2] = warp_buf1[idx1] + warp_buf1[idx2];
    part_grad_gamma[blockIdx.y * n2 + i2] = warp_buf2[idx1] + warp_buf2[idx2];
  }
}

template <typename T, typename U>
__global__ void cuComputeGradGammaBeta(const U* part_grad_gamma, const U* part_grad_beta, const int part_size,
                                       const int n1, const int n2, T* grad_gamma, T* grad_beta) {
  // sum partial gradients for gamma and beta
  SharedMemory<U> shared;
  U* buf = shared.getPointer();
  int i2 = blockIdx.x * blockDim.x + threadIdx.x;
  if (i2 < n2) {
    // each warp does sequential reductions until reduced part_size is num_warps
    int num_warp_reductions = part_size / blockDim.y;
    U sum_gamma = U(0);
    U sum_beta = U(0);
    const U* part_grad_gamma_ptr = part_grad_gamma + threadIdx.y * num_warp_reductions * n2 + i2;
    const U* part_grad_beta_ptr = part_grad_beta + threadIdx.y * num_warp_reductions * n2 + i2;
    for (int warp_offset = 0; warp_offset < num_warp_reductions; ++warp_offset) {
      sum_gamma += part_grad_gamma_ptr[warp_offset * n2];
      sum_beta += part_grad_beta_ptr[warp_offset * n2];
    }
    // inter-warp reductions
    const int nbsize3 = blockDim.x * blockDim.y / 2;
    for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) {
      // top half write to shared memory
      if (threadIdx.y >= offset && threadIdx.y < 2 * offset) {
        const int write_idx = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
        buf[write_idx] = sum_gamma;
        buf[write_idx + nbsize3] = sum_beta;
      }
      __syncthreads();
      // bottom half sums
      if (threadIdx.y < offset) {
        const int read_idx = threadIdx.y * blockDim.x + threadIdx.x;
        sum_gamma += buf[read_idx];
        sum_beta += buf[read_idx + nbsize3];
      }
      __syncthreads();
    }
    // write out fully summed gradients
    if (threadIdx.y == 0) {
      grad_gamma[i2] = sum_gamma;
      grad_beta[i2] = sum_beta;
    }
  }
}

template <typename T, typename U>
__global__ void cuComputeGradInput(const T* __restrict__ dout, const T* __restrict__ dout_resid,
                                   const T* __restrict__ input, const int n1, const int n2, const U* __restrict__ mean,
                                   const U* __restrict__ invvar, U epsilon, const T* gamma, T* grad_input) {
  for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
    U sum_loss1 = U(0);
    U sum_loss2 = U(0);
    const U c_mean = mean[i1];
    const U c_invvar = invvar[i1];
    const T* k_input = input + i1 * n2;
    const T* k_dout = dout + i1 * n2;
    const T* k_dout_resid = dout_resid + i1 * n2;
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != NULL) {
      int l = 4 * thrx;
      for (; l + 3 < n2; l += 4 * numx) {
        for (int k = 0; k < 4; ++k) {
          const U c_h = static_cast<U>(k_input[l + k]);
          const U c_loss = static_cast<U>(k_dout[l + k]);
          sum_loss1 += c_loss * static_cast<U>(gamma[l + k]);
          sum_loss2 += c_loss * static_cast<U>(gamma[l + k]) * (c_h - c_mean) * c_invvar;
        }
      }
      for (; l < n2; ++l) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        sum_loss1 += c_loss * static_cast<U>(gamma[l]);
        sum_loss2 += c_loss * static_cast<U>(gamma[l]) * (c_h - c_mean) * c_invvar;
      }
    } else {
      int l = 4 * thrx;
      for (; l + 3 < n2; l += 4 * numx) {
        for (int k = 0; k < 4; ++k) {
          const U c_h = static_cast<U>(k_input[l + k]);
          const U c_loss = static_cast<U>(k_dout[l + k]);
          sum_loss1 += c_loss;
          sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
        }
      }
      for (; l < n2; ++l) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        sum_loss1 += c_loss;
        sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
      }
    }
    // intra-warp reductions
    for (int mask = blockDim.x / 2; mask > 0; mask /= 2) {
      sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask);
      sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask);
    }
    // inter-warp reductions
    if (blockDim.y > 1) {
      SharedMemory<U> shared;
      U* buf = shared.getPointer();
      for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.y >= offset && threadIdx.y < 2 * offset) {
          const int wrt_i = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
          buf[2 * wrt_i] = sum_loss1;
          buf[2 * wrt_i + 1] = sum_loss2;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.y < offset) {
          const int read_i = threadIdx.y * blockDim.x + threadIdx.x;
          sum_loss1 += buf[2 * read_i];
          sum_loss2 += buf[2 * read_i + 1];
        }
        __syncthreads();
      }
      if (threadIdx.y == 0) {
        buf[2 * threadIdx.x] = sum_loss1;
        buf[2 * threadIdx.x + 1] = sum_loss2;
      }
      __syncthreads();
      if (threadIdx.y != 0) {
        sum_loss1 = buf[2 * threadIdx.x];
        sum_loss2 = buf[2 * threadIdx.x + 1];
      }
    }
    // all threads now have the two sums over l
    U fH = (U)n2;
    U term1 = (U(1) / fH) * c_invvar;
    T* k_grad_input = grad_input + i1 * n2;
    if (gamma != NULL) {
      for (int l = thrx; l < n2; l += numx) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        const T c_resid = static_cast<T>(k_dout_resid[l]);
        U f_grad_input = fH * c_loss * static_cast<U>(gamma[l]);
        f_grad_input -= sum_loss1;
        f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
        f_grad_input *= term1;
        k_grad_input[l] = static_cast<T>(f_grad_input) + c_resid;
      }
    } else {
      for (int l = thrx; l < n2; l += numx) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        const T c_resid = static_cast<T>(k_dout_resid[l]);
        U f_grad_input = fH * c_loss;
        f_grad_input -= sum_loss1;
        f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
        f_grad_input *= term1;
        k_grad_input[l] = static_cast<T>(f_grad_input) + c_resid;
      }
    }
  }
}

template <typename T, typename U>
void HostApplyLayerNorm(T* output, U* mean, U* invvar, const T* input, int n1, int n2, double epsilon, const T* gamma,
                        const T* beta) {
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const dim3 threads(32, 4, 1);
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
  int nshared = threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0;
  cuApplyLayerNorm<<<blocks, threads, nshared, stream>>>(output, mean, invvar, input, n1, n2, U(epsilon), gamma, beta);
}

template <typename T, typename U>
void HostLayerNormGradient(const T* dout, const T* dout_resid, const U* mean, const U* invvar, const at::Tensor& input,
                           int n1, int n2, const T* gamma, const T* beta, double epsilon, T* grad_input, T* grad_gamma,
                           T* grad_beta) {
  auto stream = at::cuda::getCurrentCUDAStream().stream();

  if (gamma != NULL && beta != NULL) {
    // compute grad_gamma(j) and grad_beta(j)
    const int part_size = 16;
    const dim3 threads2(32, 4, 1);
    const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1);
    const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
    const int nshared2_b = threads2.x * threads2.y * sizeof(U);
    const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b;
    at::Tensor part_grad_gamma = at::empty(
        {part_size, n2}, input.options().dtype(input.scalar_type() == at::ScalarType::Half ? at::ScalarType::Float
                                                                                           : input.scalar_type()));
    at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
    cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
        dout, static_cast<T*>(input.data_ptr()), n1, n2, mean, invvar, U(epsilon),
        static_cast<U*>(part_grad_gamma.data_ptr()), static_cast<U*>(part_grad_beta.data_ptr()));

    const dim3 threads3(32, 8, 1);
    const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1);
    const int nshared3 = threads3.x * threads3.y * sizeof(U);
    cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(static_cast<U*>(part_grad_gamma.data_ptr()),
                                                                    static_cast<U*>(part_grad_beta.data_ptr()),
                                                                    part_size, n1, n2, grad_gamma, grad_beta);
  }

  // compute grad_input
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
  const dim3 threads1(32, 4, 1);
  int nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0;
  cuComputeGradInput<<<blocks1, threads1, nshared, stream>>>(dout, dout_resid, static_cast<T*>(input.data_ptr()), n1,
                                                             n2, mean, invvar, U(epsilon), gamma, grad_input);
}
}  // namespace


================================================
FILE: apex/contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>

#include <iostream>
#include <vector>

#include "dropout.cuh"
#include "softmax.cuh"

namespace multihead_attn {
namespace fused_softmax {
namespace mask_softmax_dropout {

std::vector<torch::Tensor> fwd_cuda(bool is_training, int heads, torch::Tensor const& input, const uint8_t* pad_mask,
                                    float dropout_prob) {
  const int attn_batches = input.size(0);
  const int sequences = attn_batches / heads;
  const int q_seq_len = input.size(1);
  const int k_seq_len = q_seq_len;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;

  // There is no reason to use more than one stream as every kernel is
  // sequentially dependent
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // 3 Intermediate Results + Output (Note: dropout intermediates are generated
  // by ATen library code)
  auto act_options = input.options().requires_grad(false);
  auto mask_options = act_options.dtype(torch::kUInt8);

  torch::Tensor softmax_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);

  // Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
  void* input_ptr = static_cast<void*>(input.data_ptr());
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  // Padded Softmax
  bool softmax_success = false;
  if (pad_mask == nullptr) {
    softmax_success = dispatch_softmax<half, half, float>(reinterpret_cast<half*>(softmax_results_ptr),
                                                          reinterpret_cast<const half*>(input_ptr), k_seq_len,
                                                          k_seq_len, attn_batches * q_seq_len);
  } else {
    softmax_success = dispatch_masked_softmax<half, half, float>(
        reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(input_ptr), pad_mask, k_seq_len,
        k_seq_len, attn_batches * q_seq_len, attn_batches * q_seq_len / sequences);
  }

  if (is_training) {
    // use at:: function so that C++ version generates the same random mask as
    // python version
    auto dropout_tuple = at::_fused_dropout(softmax_results, 1.0f - dropout_prob);
    dropout_results = std::get<0>(dropout_tuple);
    dropout_mask = std::get<1>(dropout_tuple);
  }

  // Matmul2

  return {dropout_results, dropout_mask, softmax_results};
}

torch::Tensor bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& softmax_results,
                       torch::Tensor const& dropout_mask, const uint8_t* padding_mask, float dropout_prob) {
  const int attn_batches = output_grads.size(0);
  const int q_seq_len = output_grads.size(1);
  const int k_seq_len = q_seq_len;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  // TODO: Streams can be used in Backprop but I haven't added more than one
  // in my first attempt to create the code
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // Output Tensor Allocations
  //  torch::Tensor input_grads         = torch::empty_like(output_grads);

  // Apply Dropout Mask and Scale by Dropout Probability
  // Softmax Grad
  if (padding_mask == nullptr) {
    dispatch_masked_scale_softmax_backward_stream<half, half, float, false>(
        static_cast<half*>(output_grads.data_ptr()), static_cast<half*>(output_grads.data_ptr()),
        reinterpret_cast<half const*>(softmax_results.data_ptr()), static_cast<uint8_t const*>(dropout_mask.data_ptr()),
        1.0 / (1.0 - dropout_prob), k_seq_len, k_seq_len, attn_batches * q_seq_len, stream);
  } else {
    dispatch_masked_scale_softmax_backward_masked_out_stream<half, half, float, false>(
        static_cast<half*>(output_grads.data_ptr()), static_cast<half*>(output_grads.data_ptr()),
        reinterpret_cast<half const*>(softmax_results.data_ptr()), static_cast<uint8_t const*>(dropout_mask.data_ptr()),
        static_cast<uint8_t const*>(padding_mask), 1.0 / (1.0 - dropout_prob), k_seq_len, k_seq_len,
        attn_batches * q_seq_len, heads, stream);
  }
  // backward pass is completely in-place
  return output_grads;
}
}  // namespace mask_softmax_dropout
}  // namespace fused_softmax
}  // namespace multihead_attn


================================================
FILE: apex/contrib/csrc/multihead_attn/multihead_attn_frontend.cpp
================================================
#include <cuda_fp16.h>
#include <torch/extension.h>

#include <vector>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

namespace multihead_attn {
namespace fused_softmax {
namespace additive_mask_softmax_dropout {

std::vector<torch::Tensor> fwd_cuda(bool is_training, int heads, torch::Tensor const& input, const half* pad_mask,
                                    float dropout_prob);

torch::Tensor bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& softmax_results,
                       torch::Tensor const& dropout_mask, float dropout_prob);

std::vector<torch::Tensor> fwd(bool use_mask, bool is_training, int heads, torch::Tensor const& input,
                               torch::Tensor const& pad_mask, float dropout_prob) {
  TORCH_CHECK(input.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  if (use_mask) {
    TORCH_CHECK(pad_mask.dim() == 2, "expected 2D tensor");
    TORCH_CHECK(pad_mask.scalar_type() == at::ScalarType::Half, "Only BYTE is supported");
  }

  return fwd_cuda(is_training, heads, input, use_mask ? static_cast<const half*>(pad_mask.data_ptr()) : nullptr,
                  dropout_prob);
}

torch::Tensor bwd(bool use_mask, int heads, torch::Tensor const& output_grads, torch::Tensor const& softmax_results,
                  torch::Tensor const& dropout_mask, float dropout_prob) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_mask.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(output_grads.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(softmax_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  //  TORCH_CHECK(dropout_mask.scalar_type()      == at::ScalarType::Byte,
  //  "Only BYTE is supported");

  return bwd_cuda(heads, output_grads, softmax_results, dropout_mask, dropout_prob);
}

}  // namespace additive_mask_softmax_dropout
namespace mask_softmax_dropout {

std::vector<torch::Tensor> fwd_cuda(bool is_training, int heads, torch::Tensor const& input, const uint8_t* pad_mask,
                                    float dropout_prob);

torch::Tensor bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& softmax_results,
                       torch::Tensor const& dropout_mask, const uint8_t* padding_mask, float dropout_prob);

std::vector<torch::Tensor> fwd(bool use_mask, bool is_training, int heads, torch::Tensor const& input,
                               torch::Tensor const& pad_mask, float dropout_prob) {
  TORCH_CHECK(input.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input.scalar_type() == at::ScalarType::Half, "Only HALF is supported");

  if (use_mask) {
    TORCH_CHECK(pad_mask.dim() == 2, "expected 2D tensor");
    TORCH_CHECK(pad_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");
  }

  return fwd_cuda(is_training, heads, input, use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr,
                  dropout_prob);
}

torch::Tensor bwd(bool use_mask, int heads, torch::Tensor const& output_grads, torch::Tensor const& softmax_results,
                  torch::Tensor const& dropout_mask, torch::Tensor const& padding_mask, float dropout_prob) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_mask.dim() == 3, "expected 3D tensor");

  TORCH_CHECK(output_grads.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(softmax_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  //  TORCH_CHECK(dropout_mask.scalar_type()      == at::ScalarType::Byte,
  //  "Only BYTE is supported");

  return bwd_cuda(heads, output_grads, softmax_results, dropout_mask,
                  use_mask ? static_cast<const uint8_t*>(padding_mask.data_ptr()) : nullptr, dropout_prob);
}

}  // end namespace mask_softmax_dropout
}  // end namespace fused_softmax

namespace encdec {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs_q,
                                    torch::Tensor const& inputs_kv, torch::Tensor const& input_weights_q,
                                    torch::Tensor const& input_weights_kv, torch::Tensor const& output_weights,
                                    const uint8_t* pad_mask, float dropout_prob);
std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_q_results, torch::Tensor const& input_lin_kv_results,
                                    torch::Tensor const& inputs_q, torch::Tensor const& inputs_kv,
                                    torch::Tensor const& input_weights_q, torch::Tensor const& input_weights_kv,
                                    torch::Tensor const& output_weights, torch::Tensor const& dropout_mask,
                                    float dropout_prob);

std::vector<torch::Tensor> fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
                               torch::Tensor const& inputs_q, torch::Tensor const& inputs_kv,
                               torch::Tensor const& input_weights_q, torch::Tensor const& input_weights_kv,
                               torch::Tensor const& output_weights, torch::Tensor const& pad_mask, float dropout_prob) {
  TORCH_CHECK(inputs_q.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(inputs_kv.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_weights_q.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(input_weights_kv.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");

  TORCH_CHECK(inputs_q.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(inputs_kv.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights_q.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights_kv.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");

  if (use_mask) {
    TORCH_CHECK(pad_mask.dim() == 2, "expected 2D tensor");
    TORCH_CHECK(pad_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");
  }

  return fwd_cuda(use_time_mask, is_training, heads, inputs_q, inputs_kv, input_weights_q, input_weights_kv,
                  output_weights, use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr, dropout_prob);
}

std::vector<torch::Tensor> bwd(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                               torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                               torch::Tensor const& input_lin_q_results, torch::Tensor const& input_lin_kv_results,
                               torch::Tensor const& inputs_q, torch::Tensor const& inputs_kv,
                               torch::Tensor const& input_weights_q, torch::Tensor const& input_weights_kv,
                               torch::Tensor const& output_weights, torch::Tensor const& dropout_mask,
                               float dropout_prob) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(matmul2_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_lin_q_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_lin_kv_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(inputs_q.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(inputs_kv.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_weights_q.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(input_weights_kv.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(dropout_mask.dim() == 3, "expected 3D tensor");

  TORCH_CHECK(output_grads.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(matmul2_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(softmax_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_lin_q_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_lin_kv_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(inputs_q.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(inputs_kv.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights_q.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights_kv.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");

  return bwd_cuda(heads, output_grads, matmul2_results, dropout_results, softmax_results, input_lin_q_results,
                  input_lin_kv_results, inputs_q, inputs_kv, input_weights_q, input_weights_kv, output_weights,
                  dropout_mask, dropout_prob);
}

}  // end namespace cublas_gemmex
}  // end namespace encdec

namespace encdec_norm_add {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs_q,
                                    torch::Tensor const& inputs_kv, torch::Tensor const& lyr_nrm_gamma_weights,
                                    torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights_q,
                                    torch::Tensor const& input_weights_kv, torch::Tensor const& output_weights,
                                    const uint8_t* pad_mask, float dropout_prob);

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_q_results, torch::Tensor const& input_lin_kv_results,
                                    torch::Tensor const& lyr_nrm_results, torch::Tensor const& lyr_nrm_mean,
                                    torch::Tensor const& lyr_nrm_invvar, torch::Tensor const& inputs_q,
                                    torch::Tensor const& inputs_kv, torch::Tensor const& lyr_nrm_gamma_weights,
                                    torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights_q,
                                    torch::Tensor const& input_weights_kv, torch::Tensor const& output_weights,
                                    torch::Tensor const& dropout_mask, torch::Tensor const& dropout_add_mask,
                                    float dropout_prob);

std::vector<torch::Tensor> fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
                               torch::Tensor const& inputs_q, torch::Tensor const& inputs_kv,
                               torch::Tensor const& lyr_nrm_gamma_weights, torch::Tensor const& lyr_nrm_beta_weights,
                               torch::Tensor const& input_weights_q, torch::Tensor const& input_weights_kv,
                               torch::Tensor const& output_weights, torch::Tensor const& pad_mask, float dropout_prob) {
  TORCH_CHECK(inputs_q.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(inputs_kv.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(input_weights_q.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(input_weights_kv.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");

  TORCH_CHECK(inputs_q.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(inputs_kv.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_gamma_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_beta_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights_q.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights_kv.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");

  if (use_mask) {
    TORCH_CHECK(pad_mask.dim() == 2, "expected 2D tensor");
    TORCH_CHECK(pad_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");
  }

  return fwd_cuda(use_time_mask, is_training, heads, inputs_q, inputs_kv, lyr_nrm_gamma_weights, lyr_nrm_beta_weights,
                  input_weights_q, input_weights_kv, output_weights,
                  use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr, dropout_prob);
}

std::vector<torch::Tensor> bwd(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                               torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                               torch::Tensor const& input_lin_q_results, torch::Tensor const& input_lin_kv_results,
                               torch::Tensor const& lyr_nrm_results, torch::Tensor const& lyr_nrm_mean,
                               torch::Tensor const& lyr_nrm_invvar, torch::Tensor const& inputs_q,
                               torch::Tensor const& inputs_kv, torch::Tensor const& lyr_nrm_gamma_weights,
                               torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights_q,
                               torch::Tensor const& input_weights_kv, torch::Tensor const& output_weights,
                               torch::Tensor const& dropout_mask, torch::Tensor const& dropout_add_mask,
                               float dropout_prob) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(matmul2_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_lin_q_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_lin_kv_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(lyr_nrm_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(lyr_nrm_mean.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(lyr_nrm_invvar.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(inputs_q.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(inputs_kv.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(input_weights_q.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(input_weights_kv.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(dropout_mask.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_add_mask.dim() == 3, "expected 3D tensor");

  TORCH_CHECK(output_grads.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(matmul2_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(softmax_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_lin_q_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_lin_kv_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_mean.scalar_type() == at::ScalarType::Float, "Only FLOAT is supported");
  TORCH_CHECK(lyr_nrm_invvar.scalar_type() == at::ScalarType::Float, "Only FLOAT is supported");
  TORCH_CHECK(inputs_q.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(inputs_kv.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_gamma_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_beta_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights_q.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights_kv.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");
  TORCH_CHECK(dropout_add_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");

  return bwd_cuda(heads, output_grads, matmul2_results, dropout_results, softmax_results, input_lin_q_results,
                  input_lin_kv_results, lyr_nrm_results, lyr_nrm_mean, lyr_nrm_invvar, inputs_q, inputs_kv,
                  lyr_nrm_gamma_weights, lyr_nrm_beta_weights, input_weights_q, input_weights_kv, output_weights,
                  dropout_mask, dropout_add_mask, dropout_prob);
}

}  // end namespace cublas_gemmex
}  // end namespace encdec_norm_add

namespace self {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    const uint8_t* pad_mask, float dropout_prob);

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_results, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    torch::Tensor const& dropout_mask, float dropout_prob);

std::vector<torch::Tensor> fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
                               torch::Tensor const& inputs, torch::Tensor const& input_weights,
                               torch::Tensor const& output_weights, torch::Tensor const& pad_mask, float dropout_prob) {
  TORCH_CHECK(inputs.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");

  TORCH_CHECK(inputs.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");

  if (use_mask) {
    TORCH_CHECK(pad_mask.dim() == 2, "expected 2D tensor");
    TORCH_CHECK(pad_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");
  }

  return fwd_cuda(use_time_mask, is_training, heads, inputs, input_weights, output_weights,
                  use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr, dropout_prob);
}

std::vector<torch::Tensor> bwd(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                               torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                               torch::Tensor const& input_lin_results, torch::Tensor const& inputs,
                               torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                               torch::Tensor const& dropout_mask, float dropout_prob) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(matmul2_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_lin_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(inputs.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(dropout_mask.dim() == 3, "expected 3D tensor");

  TORCH_CHECK(output_grads.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(matmul2_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(softmax_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_lin_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(inputs.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");

  return bwd_cuda(heads, output_grads, matmul2_results, dropout_results, softmax_results, input_lin_results, inputs,
                  input_weights, output_weights, dropout_mask, dropout_prob);
}

}  // end namespace cublas_gemmex
}  // end namespace self
namespace self_bias {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    torch::Tensor const& input_biases, torch::Tensor const& output_biases,
                                    const uint8_t* pad_mask, float dropout_prob);

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_results, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    // torch::Tensor const& input_biases,
                                    // torch::Tensor const& output_biases,
                                    torch::Tensor const& dropout_mask, float dropout_prob);

std::vector<torch::Tensor> fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
                               torch::Tensor const& inputs, torch::Tensor const& input_weights,
                               torch::Tensor const& output_weights, torch::Tensor const& input_biases,
                               torch::Tensor const& output_biases, torch::Tensor const& pad_mask, float dropout_prob) {
  TORCH_CHECK(inputs.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");

  TORCH_CHECK(inputs.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");

  if (use_mask) {
    TORCH_CHECK(pad_mask.dim() == 2, "expected 2D tensor");
    TORCH_CHECK(pad_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");
  }

  return fwd_cuda(use_time_mask, is_training, heads, inputs, input_weights, output_weights, input_biases, output_biases,
                  use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr, dropout_prob);
}

std::vector<torch::Tensor> bwd(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                               torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                               torch::Tensor const& input_lin_results, torch::Tensor const& inputs,
                               torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                               torch::Tensor const& dropout_mask, float dropout_prob) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(matmul2_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_lin_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(inputs.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(dropout_mask.dim() == 3, "expected 3D tensor");

  TORCH_CHECK(output_grads.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(matmul2_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(softmax_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_lin_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(inputs.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");

  return bwd_cuda(heads, output_grads, matmul2_results, dropout_results, softmax_results, input_lin_results, inputs,
                  input_weights, output_weights, dropout_mask, dropout_prob);
}

}  // end namespace cublas_gemmex
}  // namespace self_bias
namespace self_bias_additive_mask {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    torch::Tensor const& input_biases, torch::Tensor const& output_biases,
                                    const half* pad_mask, float dropout_prob);

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results,
                                    // torch::Tensor const& softmax_results,
                                    torch::Tensor const& bmm1_results, torch::Tensor const& pad_mask,
                                    torch::Tensor const& input_lin_results, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    // torch::Tensor const& input_biases,
                                    // torch::Tensor const& output_biases,
                                    torch::Tensor const& dropout_mask, float dropout_prob);

std::vector<torch::Tensor> fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
                               torch::Tensor const& inputs, torch::Tensor const& input_weights,
                               torch::Tensor const& output_weights, torch::Tensor const& input_biases,
                               torch::Tensor const& output_biases, torch::Tensor const& pad_mask, float dropout_prob) {
  TORCH_CHECK(inputs.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");

  TORCH_CHECK(inputs.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(use_mask, "no mask is not supported");

  if (use_mask) {
    TORCH_CHECK(pad_mask.dim() == 2, "expected 2D tensor");
    TORCH_CHECK(pad_mask.scalar_type() == at::ScalarType::Half, "Only Half is supported");
  }

  return fwd_cuda(use_time_mask, is_training, heads, inputs, input_weights, output_weights, input_biases, output_biases,
                  use_mask ? static_cast<const half*>(pad_mask.data_ptr()) : nullptr, dropout_prob);
}

std::vector<torch::Tensor> bwd(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                               torch::Tensor const& dropout_results, torch::Tensor const& bmm1_results,
                               torch::Tensor const& pad_mask, torch::Tensor const& input_lin_results,
                               torch::Tensor const& inputs, torch::Tensor const& input_weights,
                               torch::Tensor const& output_weights, torch::Tensor const& dropout_mask,
                               float dropout_prob) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(matmul2_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_lin_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(inputs.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(dropout_mask.dim() == 3, "expected 3D tensor");

  TORCH_CHECK(output_grads.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(matmul2_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_lin_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(inputs.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");

  return bwd_cuda(heads, output_grads, matmul2_results, dropout_results, bmm1_results, pad_mask, input_lin_results,
                  inputs, input_weights, output_weights, dropout_mask, dropout_prob);
}

}  // end namespace cublas_gemmex
}  // namespace self_bias_additive_mask

namespace self_norm_add {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs,
                                    torch::Tensor const& lyr_nrm_gamma_weights,
                                    torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights,
                                    torch::Tensor const& output_weights, const uint8_t* pad_mask, float dropout_prob);

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_results, torch::Tensor const& lyr_nrm_results,
                                    torch::Tensor const& lyr_nrm_mean, torch::Tensor const& lyr_nrm_invvar,
                                    torch::Tensor const& inputs, torch::Tensor const& lyr_nrm_gamma_weights,
                                    torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights,
                                    torch::Tensor const& output_weights, torch::Tensor const& dropout_mask,
                                    torch::Tensor const& dropout_add_mask, float dropout_prob);

std::vector<torch::Tensor> fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
                               torch::Tensor const& inputs, torch::Tensor const& lyr_nrm_gamma_weights,
                               torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights,
                               torch::Tensor const& output_weights, torch::Tensor const& pad_mask, float dropout_prob) {
  TORCH_CHECK(inputs.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(input_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");

  TORCH_CHECK(inputs.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_gamma_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_beta_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");

  if (use_mask) {
    TORCH_CHECK(pad_mask.dim() == 2, "expected 2D tensor");
    TORCH_CHECK(pad_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");
  }

  return fwd_cuda(use_time_mask, is_training, heads, inputs, lyr_nrm_gamma_weights, lyr_nrm_beta_weights, input_weights,
                  output_weights, use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr, dropout_prob);
}

std::vector<torch::Tensor> bwd(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                               torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                               torch::Tensor const& input_lin_results, torch::Tensor const& lyr_nrm_results,
                               torch::Tensor const& lyr_nrm_mean, torch::Tensor const& lyr_nrm_invvar,
                               torch::Tensor const& inputs, torch::Tensor const& lyr_nrm_gamma_weights,
                               torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights,
                               torch::Tensor const& output_weights, torch::Tensor const& dropout_mask,
                               torch::Tensor const& dropout_add_mask, float dropout_prob) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(matmul2_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(input_lin_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(lyr_nrm_results.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(lyr_nrm_mean.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(lyr_nrm_invvar.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(inputs.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(input_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(output_weights.dim() == 2, "expected 2D tensor");
  TORCH_CHECK(dropout_mask.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(dropout_add_mask.dim() == 3, "expected 3D tensor");

  TORCH_CHECK(output_grads.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(matmul2_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(softmax_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_lin_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_results.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_mean.scalar_type() == at::ScalarType::Float, "Only FLOAT is supported");
  TORCH_CHECK(lyr_nrm_invvar.scalar_type() == at::ScalarType::Float, "Only FLOAT is supported");
  TORCH_CHECK(inputs.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_gamma_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(lyr_nrm_beta_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(input_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(output_weights.scalar_type() == at::ScalarType::Half, "Only HALF is supported");
  TORCH_CHECK(dropout_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");
  TORCH_CHECK(dropout_add_mask.scalar_type() == at::ScalarType::Byte, "Only BYTE is supported");

  return bwd_cuda(heads, output_grads, matmul2_results, dropout_results, softmax_results, input_lin_results,
                  lyr_nrm_results, lyr_nrm_mean, lyr_nrm_invvar, inputs, lyr_nrm_gamma_weights, lyr_nrm_beta_weights,
                  input_weights, output_weights, dropout_mask, dropout_add_mask, dropout_prob);
}

}  // end namespace cublas_gemmex
}  // end namespace self_norm_add
}  // end namespace multihead_attn

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("additive_mask_softmax_dropout_forward", &multihead_attn::fused_softmax::additive_mask_softmax_dropout::fwd,
        "Self Multihead Attention masked softmax dropout -- Forward.", py::call_guard<py::gil_scoped_release>());
  m.def("additive_mask_softmax_dropout_backward", &multihead_attn::fused_softmax::additive_mask_softmax_dropout::bwd,
        "Self Multihead Attention masked softmax dropout -- Backward.", py::call_guard<py::gil_scoped_release>());
  m.def("mask_softmax_dropout_forward", &multihead_attn::fused_softmax::mask_softmax_dropout::fwd,
        "Self Multihead Attention masked softmax dropout -- Forward.", py::call_guard<py::gil_scoped_release>());
  m.def("mask_softmax_dropout_backward", &multihead_attn::fused_softmax::mask_softmax_dropout::bwd,
        "Self Multihead Attention masked softmax dropout -- Backward.", py::call_guard<py::gil_scoped_release>());
  m.def("encdec_multihead_attn_forward", &multihead_attn::encdec::cublas_gemmex::fwd,
        "Encdec Multihead Attention Forward.", py::call_guard<py::gil_scoped_release>());
  m.def("encdec_multihead_attn_backward", &multihead_attn::encdec::cublas_gemmex::bwd,
        "Encdec Multihead Attention Backward.", py::call_guard<py::gil_scoped_release>());
  m.def("encdec_multihead_attn_norm_add_forward", &multihead_attn::encdec_norm_add::cublas_gemmex::fwd,
        "Encdec Multihead Attention Plus Layer Norm and Residual Add Forward.",
        py::call_guard<py::gil_scoped_release>());
  m.def("encdec_multihead_attn_norm_add_backward", &multihead_attn::encdec_norm_add::cublas_gemmex::bwd,
        "Encdec Multihead Attention Plus Layer Norm and Residual Add Backward.",
        py::call_guard<py::gil_scoped_release>());
  m.def("self_attn_forward", &multihead_attn::self::cublas_gemmex::fwd, "Self Multihead Attention Forward.",
        py::call_guard<py::gil_scoped_release>());
  m.def("self_attn_backward", &multihead_attn::self::cublas_gemmex::bwd, "Self Multihead Attention Backward.",
        py::call_guard<py::gil_scoped_release>());
  m.def("self_attn_bias_forward", &multihead_attn::self_bias::cublas_gemmex::fwd,
        "Self Multihead Attention with Bias -- Forward.", py::call_guard<py::gil_scoped_release>());
  m.def("self_attn_bias_backward", &multihead_attn::self_bias::cublas_gemmex::bwd,
        "Self Multihead Attention with Bias -- Backward.", py::call_guard<py::gil_scoped_release>());
  m.def("self_attn_bias_additive_mask_forward", &multihead_attn::self_bias_additive_mask::cublas_gemmex::fwd,
        "Self Multihead Attention with Bias -- Forward.", py::call_guard<py::gil_scoped_release>());
  m.def("self_attn_bias_additive_mask_backward", &multihead_attn::self_bias_additive_mask::cublas_gemmex::bwd,
        "Self Multihead Attention with Bias -- Backward.", py::call_guard<py::gil_scoped_release>());
  m.def("self_attn_norm_add_forward", &multihead_attn::self_norm_add::cublas_gemmex::fwd,
        "Self Multihead Attention Plus Layer Norm and Residual Add Forward.", py::call_guard<py::gil_scoped_release>());
  m.def("self_attn_norm_add_backward", &multihead_attn::self_norm_add::cublas_gemmex::bwd,
        "Self Multihead Attention Plus Layer Norm and Residual Add Backward.",
        py::call_guard<py::gil_scoped_release>());
}

#undef CHECK_CUDA
#undef CHECK_CONTIGUOUS
#undef CHECK_INPUT


================================================
FILE: apex/contrib/csrc/multihead_attn/philox.cuh
================================================
#pragma once
// Philox CUDA.

namespace {

class Philox {
 public:
  __device__ inline Philox(unsigned long long seed, unsigned long long subsequence, unsigned long long offset)
      : STATE(0) {
    // key.x = (unsigned int)seed;
    // key.y = (unsigned int)(seed >> 32);
    // counter = make_uint4(0, 0, 0, 0);
    // counter.z = (unsigned int)(subsequence);
    // counter.w = (unsigned int)(subsequence >> 32);
    // STATE = 0;
    // incr_n(offset / 4);

    key = reinterpret_cast<const uint2&>(seed);
    ull2* tmp = reinterpret_cast<ull2*>(&counter);
    tmp->x = offset / 4;
    tmp->y = subsequence;
  }
  __device__ inline uint4 operator()() {
    if (STATE == 0) {
      uint4 counter_ = counter;
      uint2 key_ = key;
      // 7-round philox
      for (int i = 0; i < 6; i++) {
        counter_ = single_round(counter_, key_);
        key_.x += (kPhilox10A);
        key_.y += (kPhilox10B);
      }
      output = single_round(counter_, key_);
      incr();
    }
    // return a float4 directly
    // unsigned long ret;
    // switch(STATE) {
    //  case 0: ret = output.x; break;
    //  case 1: ret = output.y; break;
    //  case 2: ret = output.z; break;
    //  case 3: ret = output.w; break;
    //}
    // STATE = (STATE + 1) % 4;
    return output;
  }

 private:
  struct ull2 {
    uint64_t x;
    uint64_t y;
  };
  uint4 counter;
  uint4 output;
  uint2 key;
  unsigned int STATE;
  __device__ inline void incr_n(unsigned long long n) {
    unsigned int nlo = (unsigned int)(n);
    unsigned int nhi = (unsigned int)(n >> 32);
    counter.x += nlo;
    if (counter.x < nlo) nhi++;
    counter.y += nhi;
    if (nhi <= counter.y) return;
    if (++counter.z) return;
    ++counter.w;
  }

  __device__ uint4 incr128(uint4 ctr) {
    uint4 res;
    asm("add.cc.u32      %0, %4, %8;\n\t"
        "addc.cc.u32     %1, %5, %9;\n\t"
        "addc.cc.u32     %2, %6, %10;\n\t"
        "addc.u32        %3, %7, %11;\n\t"
        : "=r"(res.x), "=r"(res.y), "=r"(res.z), "=r"(res.w)
        : "r"(ctr.x), "r"(ctr.y), "r"(ctr.z), "r"(ctr.w), "n"(1), "n"(0), "n"(0), "n"(0));
    return res;
  }

  __device__ inline void incr() { counter = incr128(counter); }
  __device__ unsigned int mulhilo32(unsigned int a, unsigned int b, unsigned int* result_high) {
    *result_high = __umulhi(a, b);
    return a * b;
  }
  __device__ uint2 mulhilo32_v2(unsigned int a, unsigned int b) {
    uint2* res;
    unsigned long long tmp;
    asm("mul.wide.u32      %0, %1, %2;\n\t" : "=l"(tmp) : "r"(a), "r"(b));
    res = (uint2*)(&tmp);
    return *res;
  }
  __device__ inline uint4 single_round(uint4 ctr, uint2 key) {
    // unsigned int hi0;
    // unsigned int hi1;
    // unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
    // unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
    // uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
    uint2 res0 = mulhilo32_v2(kPhiloxSA, ctr.x);
    uint2 res1 = mulhilo32_v2(kPhiloxSB, ctr.z);
    uint4 ret = {res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x};
    return ret;
  }
  static const unsigned long kPhilox10A = 0x9E3779B9;
  static const unsigned long kPhilox10B = 0xBB67AE85;
  static const unsigned long kPhiloxSA = 0xD2511F53;
  static const unsigned long kPhiloxSB = 0xCD9E8D57;
};
// Inverse of 2^32.
constexpr float M_RAN_INVM32 = 2.3283064e-10f;
__device__ __inline__ float4 uniform4(uint4 x) {
  return make_float4(x.x * M_RAN_INVM32, x.y * M_RAN_INVM32, x.z * M_RAN_INVM32, x.w * M_RAN_INVM32);
}

}  // namespace


================================================
FILE: apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>

#include <iostream>
#include <vector>

#include "dropout.cuh"
#include "softmax.cuh"
#include "strided_batched_gemm.cuh"

namespace multihead_attn {
namespace self_bias_additive_mask {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    torch::Tensor const& input_biases, torch::Tensor const& output_biases,
                                    const half* pad_mask, float dropout_prob) {
  const int embed_dim = inputs.size(2);
  const int sequences = inputs.size(1);
  const int q_seq_len = inputs.size(0);
  const int k_seq_len = q_seq_len;
  const int batches = sequences * q_seq_len;
  const int head_dim = embed_dim / heads;
  const int output_lin_dim = 3 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim = attn_batches * 3 * head_dim;
  const int batch_stride = 3 * head_dim;
  [[maybe_unused]] const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  const float beta_one = 1.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // There is no reason to use more than one stream as every kernel is
  // sequentially dependent
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // 3 Intermediate Results + Output (Note: dropout intermediates are generated
  // by ATen library code)
  auto act_options = inputs.options().requires_grad(false);
  auto mask_options = act_options.dtype(torch::kUInt8);

  torch::Tensor input_lin_results = torch::empty({q_seq_len, sequences, output_lin_dim}, act_options);
  torch::Tensor bmm1_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);
  torch::Tensor matmul2_results = torch::empty({q_seq_len, attn_batches, head_dim}, act_options);
  torch::Tensor outputs = torch::empty_like(inputs, act_options);

  // Input Linear Results Pointers to Q, K, and V of interviewed activations
  void* q_lin_results_ptr = static_cast<void*>(input_lin_results.data_ptr());
  void* k_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + head_dim);
  void* v_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + 2 * head_dim);

  // Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
  void* bmm1_results_ptr = static_cast<void*>(bmm1_results.data_ptr());
  void* dropout_results_ptr = static_cast<void*>(dropout_results.data_ptr());

  char a_layout_t{'t'};
  char a_layout_n{'n'};
  char b_layout_n{'n'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
  // Input Linear Fwd
  input_lin_results.copy_(input_biases);
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, output_lin_dim, batches, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(inputs.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta_one),
      q_lin_results_ptr, CUDA_R_16F, output_lin_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul1 of Dot-Product Attention Plus scaling by 1/Sqrt(head size)
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, scale,
                        static_cast<const half*>(k_lin_results_ptr), lead_dim, batch_stride,
                        static_cast<const half*>(q_lin_results_ptr), lead_dim, batch_stride, beta_zero,
                        static_cast<half*>(bmm1_results_ptr), k_seq_len, k_seq_len * q_seq_len, attn_batches);
  // Padded Softmax
  [[maybe_unused]] bool softmax_success = false;
  if (is_training) {
    softmax_success = dispatch_additive_masked_softmax_dropout<half, half, float>(
        reinterpret_cast<half*>(dropout_results_ptr),
        (is_training) ? reinterpret_cast<uint8_t*>(dropout_mask.data_ptr<uint8_t>()) : nullptr,
        reinterpret_cast<const half*>(bmm1_results_ptr), pad_mask, attn_batches * q_seq_len * q_seq_len, k_seq_len,
        k_seq_len, attn_batches * q_seq_len, attn_batches * q_seq_len / sequences, 1.0f - dropout_prob, stream);
  } else {
    softmax_success = dispatch_additive_masked_softmax<half, half, float>(
        reinterpret_cast<half*>(dropout_results_ptr),  // this is actually softmax results, but
                                                       // making it consistent for the next function
        reinterpret_cast<const half*>(bmm1_results_ptr), pad_mask, k_seq_len, k_seq_len, attn_batches * q_seq_len,
        attn_batches * q_seq_len / sequences);
  }

  // Matmul2
  gemm_switch_fp32accum(
      a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, alpha, static_cast<const half*>(v_lin_results_ptr),
      lead_dim, batch_stride, static_cast<const half*>(dropout_results.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
      beta_zero, static_cast<half*>(matmul2_results.data_ptr()), head_dim * attn_batches, head_dim, attn_batches);

  outputs.copy_(output_biases);

  // Output Linear
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, embed_dim, batches, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta_one),
      static_cast<void*>(outputs.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO1_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_lin_results, bmm1_results, dropout_results, dropout_mask, matmul2_results, outputs};
}

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& bmm1_results,
                                    torch::Tensor const& pad_mask, torch::Tensor const& input_lin_results,
                                    torch::Tensor const& inputs, torch::Tensor const& input_weights,
                                    torch::Tensor const& output_weights, torch::Tensor const& dropout_mask,
                                    float dropout_prob) {
  const int embed_dim = inputs.size(2);
  const int sequences = inputs.size(1);
  const int q_seq_len = inputs.size(0);
  const int k_seq_len = q_seq_len;
  const int batches = sequences * q_seq_len;
  const int head_dim = embed_dim / heads;
  const int output_lin_dim = 3 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim = attn_batches * 3 * head_dim;
  const int batch_stride = 3 * head_dim;
  [[maybe_unused]] const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // TODO: Streams can be used in Backprop but I haven't added more than one
  // in my first attempt to create the code
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // Output Tensor Allocations
  torch::Tensor input_grads = torch::empty_like(inputs);
  torch::Tensor input_weight_grads = torch::empty_like(input_weights);
  torch::Tensor output_weight_grads = torch::empty_like(output_weights);
  // Intermediate Tensor Allocations
  at::Tensor output_lin_grads = torch::empty_like(matmul2_results);
  at::Tensor matmul2_grads = torch::empty_like(dropout_results);
  at::Tensor input_lin_output_grads = torch::empty_like(input_lin_results);

  auto q_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr());
  auto k_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr()) + head_dim;
  auto v_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr()) + 2 * head_dim;

  auto q_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr());
  auto k_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr()) + head_dim;
  auto v_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr()) + 2 * head_dim;

  char a_layout_n{'n'};
  char a_layout_t{'t'};
  char b_layout_n{'n'};
  char b_layout_t{'t'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

  // Output Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches, embed_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(output_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_lin_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  // Output Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, embed_dim, batches,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(output_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_weight_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  auto output_bias_grads = output_grads.view({-1, embed_dim}).sum(0, false);
  // MatMul2 Dgrad1
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim, batch_stride,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim, beta,
                        static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Matmul2 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, alpha,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim,
                        static_cast<const half*>(dropout_results.data_ptr()), k_seq_len, k_seq_len * q_seq_len, beta,
                        v_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Apply Dropout Mask and Scale by Dropout Probability
  // Softmax Grad
  dispatch_masked_scale_softmax_backward_recompute<half, half, float, false>(
      static_cast<half*>(matmul2_grads.data_ptr()), static_cast<half* const>(matmul2_grads.data_ptr()),
      reinterpret_cast<half const*>(bmm1_results.data_ptr()), reinterpret_cast<half const*>(pad_mask.data_ptr()),
      static_cast<uint8_t const*>(dropout_mask.data_ptr()), 1.0 / (1.0 - dropout_prob), k_seq_len, k_seq_len,
      attn_batches * q_seq_len / sequences, attn_batches * q_seq_len, stream);

  // Matmul1 Dgrad1
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, scale, k_lin_results_ptr, lead_dim,
                        batch_stride, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, q_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Matmul1 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, scale, q_lin_results_ptr, lead_dim,
                        batch_stride, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, k_lin_grads_ptr, lead_dim, batch_stride, attn_batches);
  // Input Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches, output_lin_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(input_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(input_lin_output_grads.data_ptr()),
                                    // static_cast<const void*>(q_lin_grads_ptr),
                                    CUDA_R_16F, output_lin_dim, static_cast<const void*>(&beta),
                                    static_cast<void*>(input_grads.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
                                    // CUBLAS_GEMM_ALGO10_TENSOR_OP));
                                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, output_lin_dim, batches, static_cast<const void*>(&alpha),
      static_cast<const void*>(inputs.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(q_lin_grads_ptr),
      CUDA_R_16F, output_lin_dim, static_cast<const void*>(&beta), static_cast<void*>(input_weight_grads.data_ptr()),
      CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  auto input_bias_grads = input_lin_output_grads.view({-1, output_lin_dim}).sum(0, false);
  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_grads, input_weight_grads, output_weight_grads, input_bias_grads, output_bias_grads};
}

}  // end namespace cublas_gemmex
}  // namespace self_bias_additive_mask
}  // end namespace multihead_attn


================================================
FILE: apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>

#include <iostream>
#include <vector>

#include "dropout.cuh"
#include "softmax.cuh"
#include "strided_batched_gemm.cuh"

namespace multihead_attn {
namespace self_bias {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    torch::Tensor const& input_biases, torch::Tensor const& output_biases,
                                    const uint8_t* pad_mask, float dropout_prob) {
  const int embed_dim = inputs.size(2);
  const int sequences = inputs.size(1);
  const int q_seq_len = inputs.size(0);
  const int k_seq_len = q_seq_len;
  const int batches = sequences * q_seq_len;
  const int head_dim = embed_dim / heads;
  const int output_lin_dim = 3 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim = attn_batches * 3 * head_dim;
  const int batch_stride = 3 * head_dim;
  [[maybe_unused]] const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  const float beta_one = 1.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // There is no reason to use more than one stream as every kernel is
  // sequentially dependent
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // 3 Intermediate Results + Output (Note: dropout intermediates are generated
  // by ATen library code)
  auto act_options = inputs.options().requires_grad(false);
  auto mask_options = act_options.dtype(torch::kUInt8);

  torch::Tensor input_lin_results = torch::empty({q_seq_len, sequences, output_lin_dim}, act_options);
  torch::Tensor softmax_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);
  torch::Tensor matmul2_results = torch::empty({q_seq_len, attn_batches, head_dim}, act_options);
  torch::Tensor outputs = torch::empty_like(inputs, act_options);

  // Input Linear Results Pointers to Q, K, and V of interviewed activations
  void* q_lin_results_ptr = static_cast<void*>(input_lin_results.data_ptr());
  void* k_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + head_dim);
  void* v_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + 2 * head_dim);

  // Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  char a_layout_t{'t'};
  char a_layout_n{'n'};
  char b_layout_n{'n'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
  // Input Linear Fwd
  input_lin_results.copy_(input_biases);
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, output_lin_dim, batches, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(inputs.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta_one),
      q_lin_results_ptr, CUDA_R_16F, output_lin_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul1 of Dot-Product Attention Plus scaling by 1/Sqrt(head size)
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, scale,
                        static_cast<const half*>(k_lin_results_ptr), lead_dim, batch_stride,
                        static_cast<const half*>(q_lin_results_ptr), lead_dim, batch_stride, beta_zero,
                        static_cast<half*>(softmax_results_ptr), k_seq_len, k_seq_len * q_seq_len, attn_batches);
  // Padded Softmax
  [[maybe_unused]] bool softmax_success = false;
  if (pad_mask == nullptr) {
    softmax_success = dispatch_softmax<half, half, float>(reinterpret_cast<half*>(softmax_results_ptr),
                                                          reinterpret_cast<const half*>(softmax_results_ptr), k_seq_len,
                                                          k_seq_len, attn_batches * q_seq_len);
  } else {
    if (use_time_mask) {
      softmax_success = dispatch_time_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, q_seq_len);
    } else {
      softmax_success = dispatch_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, attn_batches * q_seq_len / sequences);
    }
  }

  if (is_training) {
    // use at:: function so that C++ version generates the same random mask as
    // python version
    auto dropout_tuple = at::_fused_dropout(softmax_results, 1.0f - dropout_prob);
    dropout_results = std::get<0>(dropout_tuple);
    dropout_mask = std::get<1>(dropout_tuple);
  }

  // Matmul2
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim, batch_stride,
                        (is_training) ? static_cast<const half*>(dropout_results.data_ptr())
                                      : static_cast<const half*>(softmax_results.data_ptr()),
                        k_seq_len, k_seq_len * q_seq_len, beta_zero, static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim * attn_batches, head_dim, attn_batches);

  outputs.copy_(output_biases);

  // Output Linear
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, embed_dim, batches, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta_one),
      static_cast<void*>(outputs.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO1_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_lin_results, softmax_results, dropout_results, dropout_mask, matmul2_results, outputs};
}

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_results, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    torch::Tensor const& dropout_mask, float dropout_prob) {
  const int embed_dim = inputs.size(2);
  const int sequences = inputs.size(1);
  const int q_seq_len = inputs.size(0);
  const int k_seq_len = q_seq_len;
  const int batches = sequences * q_seq_len;
  const int head_dim = embed_dim / heads;
  const int output_lin_dim = 3 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim = attn_batches * 3 * head_dim;
  const int batch_stride = 3 * head_dim;
  [[maybe_unused]] const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // TODO: Streams can be used in Backprop but I haven't added more than one
  // in my first attempt to create the code
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // Output Tensor Allocations
  torch::Tensor input_grads = torch::empty_like(inputs);
  torch::Tensor input_weight_grads = torch::empty_like(input_weights);
  torch::Tensor output_weight_grads = torch::empty_like(output_weights);
  // Intermediate Tensor Allocations
  at::Tensor output_lin_grads = torch::empty_like(matmul2_results);
  at::Tensor matmul2_grads = torch::empty_like(dropout_results);
  at::Tensor input_lin_output_grads = torch::empty_like(input_lin_results);

  auto q_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr());
  auto k_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr()) + head_dim;
  auto v_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr()) + 2 * head_dim;

  auto q_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr());
  auto k_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr()) + head_dim;
  auto v_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr()) + 2 * head_dim;

  char a_layout_n{'n'};
  char a_layout_t{'t'};
  char b_layout_n{'n'};
  char b_layout_t{'t'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

  // Output Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches, embed_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(output_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_lin_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  // Output Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, embed_dim, batches,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(output_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_weight_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  auto output_bias_grads = output_grads.view({-1, embed_dim}).sum(0, false);
  // MatMul2 Dgrad1
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim, batch_stride,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim, beta,
                        static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Matmul2 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, alpha,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim,
                        static_cast<const half*>(dropout_results.data_ptr()), k_seq_len, k_seq_len * q_seq_len, beta,
                        v_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Apply Dropout Mask and Scale by Dropout Probability
  // Softmax Grad
  dispatch_masked_scale_softmax_backward_stream<half, half, float, false>(
      static_cast<half*>(matmul2_grads.data_ptr()), static_cast<half*>(matmul2_grads.data_ptr()),
      reinterpret_cast<half const*>(softmax_results.data_ptr()), static_cast<uint8_t const*>(dropout_mask.data_ptr()),
      1.0 / (1.0 - dropout_prob), k_seq_len, k_seq_len, attn_batches * q_seq_len, stream);

  // Matmul1 Dgrad1
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, scale, k_lin_results_ptr, lead_dim,
                        batch_stride, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, q_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Matmul1 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, scale, q_lin_results_ptr, lead_dim,
                        batch_stride, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, k_lin_grads_ptr, lead_dim, batch_stride, attn_batches);
  // Input Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches, output_lin_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(input_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(input_lin_output_grads.data_ptr()),
                                    // static_cast<const void*>(q_lin_grads_ptr),
                                    CUDA_R_16F, output_lin_dim, static_cast<const void*>(&beta),
                                    static_cast<void*>(input_grads.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
                                    // CUBLAS_GEMM_ALGO10_TENSOR_OP));
                                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, output_lin_dim, batches, static_cast<const void*>(&alpha),
      static_cast<const void*>(inputs.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(q_lin_grads_ptr),
      CUDA_R_16F, output_lin_dim, static_cast<const void*>(&beta), static_cast<void*>(input_weight_grads.data_ptr()),
      CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  auto input_bias_grads = input_lin_output_grads.view({-1, output_lin_dim}).sum(0, false);
  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_grads, input_weight_grads, output_weight_grads, input_bias_grads, output_bias_grads};
}

}  // end namespace cublas_gemmex
}  // namespace self_bias
}  // end namespace multihead_attn


================================================
FILE: apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>

#include <iostream>
#include <vector>

#include "dropout.cuh"
#include "softmax.cuh"
#include "strided_batched_gemm.cuh"

namespace multihead_attn {
namespace self {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    const uint8_t* pad_mask, float dropout_prob) {
  const int embed_dim = inputs.size(2);
  const int sequences = inputs.size(1);
  const int q_seq_len = inputs.size(0);
  const int k_seq_len = q_seq_len;
  const int batches = sequences * q_seq_len;
  const int head_dim = embed_dim / heads;
  const int output_lin_dim = 3 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim = attn_batches * 3 * head_dim;
  const int batch_stride = 3 * head_dim;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // There is no reason to use more than one stream as every kernel is
  // sequentially dependent
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // 3 Intermediate Results + Output (Note: dropout intermediates are generated
  // by ATen library code)
  auto act_options = inputs.options().requires_grad(false);
  auto mask_options = act_options.dtype(torch::kUInt8);

  torch::Tensor input_lin_results = torch::empty({q_seq_len, sequences, output_lin_dim}, act_options);
  torch::Tensor softmax_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);
  torch::Tensor matmul2_results = torch::empty({q_seq_len, attn_batches, head_dim}, act_options);
  torch::Tensor outputs = torch::empty_like(inputs, act_options);

  // Input Linear Results Pointers to Q, K, and V of interviewed activations
  void* q_lin_results_ptr = static_cast<void*>(input_lin_results.data_ptr());
  void* k_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + head_dim);
  void* v_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + 2 * head_dim);

  // Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  char a_layout_t{'t'};
  char a_layout_n{'n'};
  char b_layout_n{'n'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
  // Input Linear Fwd
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, output_lin_dim, batches, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(inputs.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      q_lin_results_ptr, CUDA_R_16F, output_lin_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul1 of Dot-Product Attention Plus scaling by 1/Sqrt(head size)
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, scale,
                        static_cast<const half*>(k_lin_results_ptr), lead_dim, batch_stride,
                        static_cast<const half*>(q_lin_results_ptr), lead_dim, batch_stride, beta,
                        static_cast<half*>(softmax_results_ptr), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Padded Softmax
  bool softmax_success = false;
  if (pad_mask == nullptr) {
    softmax_success = dispatch_softmax<half, half, float>(reinterpret_cast<half*>(softmax_results_ptr),
                                                          reinterpret_cast<const half*>(softmax_results_ptr), k_seq_len,
                                                          k_seq_len, attn_batches * q_seq_len);
  } else {
    if (use_time_mask) {
      softmax_success = dispatch_time_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, q_seq_len);
    } else {
      softmax_success = dispatch_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, attn_batches * q_seq_len / sequences);
    }
  }
  assert(softmax_success);

  if (is_training) {
    apex_fused_dropout_cuda<at::Half, float, uint32_t>(
        static_cast<at::Half const*>(softmax_results.data_ptr()), static_cast<at::Half*>(dropout_results.data_ptr()),
        static_cast<uint8_t*>(dropout_mask.data_ptr()), dropout_elems, (1.0f - dropout_prob));
  }

  // Matmul2
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim, batch_stride,
                        (is_training) ? static_cast<const half*>(dropout_results.data_ptr())
                                      : static_cast<const half*>(softmax_results.data_ptr()),
                        k_seq_len, k_seq_len * q_seq_len, beta, static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim * attn_batches, head_dim, attn_batches);

  // Output Linear
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, embed_dim, batches, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      static_cast<void*>(outputs.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_lin_results, softmax_results, dropout_results, dropout_mask, matmul2_results, outputs};
}

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_results, torch::Tensor const& inputs,
                                    torch::Tensor const& input_weights, torch::Tensor const& output_weights,
                                    torch::Tensor const& dropout_mask, float dropout_prob) {
  const int embed_dim = inputs.size(2);
  const int sequences = inputs.size(1);
  const int q_seq_len = inputs.size(0);
  const int k_seq_len = q_seq_len;
  const int batches = sequences * q_seq_len;
  const int head_dim = embed_dim / heads;
  const int output_lin_dim = 3 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim = attn_batches * 3 * head_dim;
  const int batch_stride = 3 * head_dim;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // TODO: Streams can be used in Backprop but I haven't added more than one
  // in my first attempt to create the code
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // Output Tensor Allocations
  torch::Tensor input_grads = torch::empty_like(inputs);
  torch::Tensor input_weight_grads = torch::empty_like(input_weights);
  torch::Tensor output_weight_grads = torch::empty_like(output_weights);
  // Intermediate Tensor Allocations
  at::Tensor output_lin_grads = torch::empty_like(matmul2_results);
  at::Tensor matmul2_grads = torch::empty_like(dropout_results);
  at::Tensor input_lin_output_grads = torch::empty_like(input_lin_results);

  auto q_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr());
  auto k_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr()) + head_dim;
  auto v_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr()) + 2 * head_dim;

  auto q_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr());
  auto k_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr()) + head_dim;
  auto v_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr()) + 2 * head_dim;

  char a_layout_n{'n'};
  char a_layout_t{'t'};
  char b_layout_n{'n'};
  char b_layout_t{'t'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

  // Output Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches, embed_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(output_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_lin_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Output Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, embed_dim, batches,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(output_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_weight_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul2 Dgrad1
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim, batch_stride,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim, beta,
                        static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Matmul2 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, alpha,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim,
                        static_cast<const half*>(dropout_results.data_ptr()), k_seq_len, k_seq_len * q_seq_len, beta,
                        v_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Apply Dropout Mask and Scale by Dropout Probability
  apex_masked_scale_cuda<at::Half, float, uint32_t>(
      static_cast<at::Half const*>(matmul2_grads.data_ptr()), static_cast<at::Half*>(matmul2_grads.data_ptr()),
      static_cast<uint8_t const*>(dropout_mask.data_ptr()), dropout_elems, (1.0 / (1.0 - dropout_prob)));

  // Softmax Grad
  bool softmax_success = false;
  softmax_success = dispatch_softmax_backward<half, half, float>(
      static_cast<half*>(matmul2_grads.data_ptr()), static_cast<half*>(matmul2_grads.data_ptr()),
      reinterpret_cast<half const*>(softmax_results.data_ptr()), k_seq_len, k_seq_len, attn_batches * q_seq_len);
  assert(softmax_success);

  // Matmul1 Dgrad1
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, scale, k_lin_results_ptr, lead_dim,
                        batch_stride, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, q_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Matmul1 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, scale, q_lin_results_ptr, lead_dim,
                        batch_stride, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, k_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Input Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches, output_lin_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(q_lin_grads_ptr), CUDA_R_16F, output_lin_dim, static_cast<const void*>(&beta),
      static_cast<void*>(input_grads.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, output_lin_dim, batches, static_cast<const void*>(&alpha),
      static_cast<const void*>(inputs.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(q_lin_grads_ptr),
      CUDA_R_16F, output_lin_dim, static_cast<const void*>(&beta), static_cast<void*>(input_weight_grads.data_ptr()),
      CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_grads, input_weight_grads, output_weight_grads};
}

}  // end namespace cublas_gemmex
}  // end namespace self
}  // end namespace multihead_attn


================================================
FILE: apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>

#include <iostream>
#include <vector>

#include "dropout.cuh"
#include "layer_norm.cuh"
#include "softmax.cuh"
#include "strided_batched_gemm.cuh"

namespace multihead_attn {
namespace self_norm_add {
namespace cublas_gemmex {

std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training, int heads, torch::Tensor const& inputs,
                                    torch::Tensor const& lyr_nrm_gamma_weights,
                                    torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights,
                                    torch::Tensor const& output_weights, const uint8_t* pad_mask, float dropout_prob) {
  const int embed_dim = inputs.size(2);
  const int sequences = inputs.size(1);
  const int q_seq_len = inputs.size(0);
  const int k_seq_len = q_seq_len;
  const int batches = sequences * q_seq_len;
  const int total_tokens = batches * embed_dim;
  const int head_dim = embed_dim / heads;
  const int output_lin_dim = 3 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim = attn_batches * 3 * head_dim;
  const int batch_stride = 3 * head_dim;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // There is no reason to use more than one stream as every kernel is
  // sequentially dependent
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // 3 Intermediate Results + Output (Note: dropout intermediates are generated
  // by ATen library code)
  auto act_options = inputs.options().requires_grad(false);
  auto lyr_nrm_options = act_options.dtype(torch::kFloat32);
  auto mask_options = act_options.dtype(torch::kUInt8);

  torch::Tensor lyr_nrm_mean = torch::empty({batches}, lyr_nrm_options);
  torch::Tensor lyr_nrm_invvar = torch::empty({batches}, lyr_nrm_options);
  torch::Tensor lyr_nrm_results = torch::empty_like(inputs, act_options);

  torch::Tensor input_lin_results = torch::empty({q_seq_len, sequences, output_lin_dim}, act_options);
  torch::Tensor softmax_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
  torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);
  torch::Tensor matmul2_results = torch::empty({q_seq_len, attn_batches, head_dim}, act_options);
  torch::Tensor output_lin_results = torch::empty_like(inputs, act_options);
  torch::Tensor dropout_add_mask = torch::empty_like(inputs, mask_options);
  torch::Tensor outputs = torch::empty_like(inputs, act_options);

  // Input Linear Results Pointers to Q, K, and V of interviewed activations
  void* q_lin_results_ptr = static_cast<void*>(input_lin_results.data_ptr());
  void* k_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + head_dim);
  void* v_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + 2 * head_dim);

  // Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  char a_layout_t{'t'};
  char a_layout_n{'n'};
  char b_layout_n{'n'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
  // Layer Norm
  HostApplyLayerNorm<at::Half, float>(
      static_cast<at::Half*>(lyr_nrm_results.data_ptr()), static_cast<float*>(lyr_nrm_mean.data_ptr()),
      static_cast<float*>(lyr_nrm_invvar.data_ptr()), static_cast<const at::Half*>(inputs.data_ptr()),
      static_cast<int>(batches),    // n1
      static_cast<int>(embed_dim),  // n2
      1.0e-5, static_cast<const at::Half*>(lyr_nrm_gamma_weights.data_ptr()),
      static_cast<const at::Half*>(lyr_nrm_beta_weights.data_ptr()));

  // Input Linear Fwd
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_T, CUBLAS_OP_N, output_lin_dim, batches, embed_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights.data_ptr()), CUDA_R_16F, embed_dim,
      // static_cast<const void*>(inputs.data_ptr()),
      static_cast<const void*>(lyr_nrm_results.data_ptr()), CUDA_R_16F, embed_dim, static_cast<const void*>(&beta),
      q_lin_results_ptr, CUDA_R_16F, output_lin_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul1 of Dot-Product Attention Plus scaling by 1/Sqrt(head size)
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, scale,
                        static_cast<const half*>(k_lin_results_ptr), lead_dim, batch_stride,
                        static_cast<const half*>(q_lin_results_ptr), lead_dim, batch_stride, beta,
                        static_cast<half*>(softmax_results_ptr), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Padded Softmax
  bool softmax_success = false;
  if (pad_mask == nullptr) {
    softmax_success = dispatch_softmax<half, half, float>(reinterpret_cast<half*>(softmax_results_ptr),
                                                          reinterpret_cast<const half*>(softmax_results_ptr), k_seq_len,
                                                          k_seq_len, attn_batches * q_seq_len);
  } else {
    if (use_time_mask) {
      softmax_success = dispatch_time_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, q_seq_len);
    } else {
      softmax_success = dispatch_masked_softmax<half, half, float>(
          reinterpret_cast<half*>(softmax_results_ptr), reinterpret_cast<const half*>(softmax_results_ptr), pad_mask,
          k_seq_len, k_seq_len, attn_batches * q_seq_len, attn_batches * q_seq_len / sequences);
    }
  }
  assert(softmax_success);

  if (is_training) {
    apex_fused_dropout_cuda<at::Half, float, uint32_t>(
        static_cast<at::Half const*>(softmax_results.data_ptr()), static_cast<at::Half*>(dropout_results.data_ptr()),
        static_cast<uint8_t*>(dropout_mask.data_ptr()), dropout_elems, (1.0f - dropout_prob));
  }

  // Matmul2
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim, batch_stride,
                        (is_training) ? static_cast<const half*>(dropout_results.data_ptr())
                                      : static_cast<const half*>(softmax_results.data_ptr()),
                        // static_cast<const half*>(dropout_results.data_ptr()),
                        k_seq_len, k_seq_len * q_seq_len, beta, static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim * attn_batches, head_dim, attn_batches);

  // Output Linear
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, embed_dim, batches, embed_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_lin_results.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // End-of-block Dropout-Add
  if (is_training) {
    apex_dropout_add_cuda<at::Half, float, uint32_t>(
        static_cast<at::Half const*>(output_lin_results.data_ptr()), static_cast<at::Half const*>(inputs.data_ptr()),
        static_cast<at::Half*>(outputs.data_ptr()), static_cast<uint8_t*>(dropout_add_mask.data_ptr()), total_tokens,
        (1.0f - dropout_prob));
  } else {
    apex_add_cuda<at::Half, float, uint32_t>(static_cast<at::Half const*>(output_lin_results.data_ptr()),
                                             static_cast<at::Half const*>(inputs.data_ptr()),
                                             static_cast<at::Half*>(outputs.data_ptr()), total_tokens);
  }

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {lyr_nrm_results, lyr_nrm_mean, lyr_nrm_invvar,  input_lin_results, softmax_results,
          dropout_results, dropout_mask, matmul2_results, dropout_add_mask,  outputs};
}

std::vector<torch::Tensor> bwd_cuda(int heads, torch::Tensor const& output_grads, torch::Tensor const& matmul2_results,
                                    torch::Tensor const& dropout_results, torch::Tensor const& softmax_results,
                                    torch::Tensor const& input_lin_results, torch::Tensor const& lyr_nrm_results,
                                    torch::Tensor const& lyr_nrm_mean, torch::Tensor const& lyr_nrm_invvar,
                                    torch::Tensor const& inputs, torch::Tensor const& lyr_nrm_gamma_weights,
                                    torch::Tensor const& lyr_nrm_beta_weights, torch::Tensor const& input_weights,
                                    torch::Tensor const& output_weights, torch::Tensor const& dropout_mask,
                                    torch::Tensor const& dropout_add_mask, float dropout_prob) {
  const int embed_dim = inputs.size(2);
  const int sequences = inputs.size(1);
  const int q_seq_len = inputs.size(0);
  const int k_seq_len = q_seq_len;
  const int batches = sequences * q_seq_len;
  const int total_tokens = batches * embed_dim;
  const int head_dim = embed_dim / heads;
  const int output_lin_dim = 3 * embed_dim;
  const int attn_batches = heads * sequences;
  const int lead_dim = attn_batches * 3 * head_dim;
  const int batch_stride = 3 * head_dim;
  const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
  const float alpha = 1.0;
  const float beta = 0.0;
  const float scale = 1.0 / sqrt(static_cast<float>(head_dim));

  // TODO: Streams can be used in Backprop but I haven't added more than one
  // in my first attempt to create the code
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);

  // Output Tensor Allocations
  torch::Tensor input_grads = torch::empty_like(inputs);
  torch::Tensor lyr_nrm_gamma_grads = torch::empty_like(lyr_nrm_gamma_weights);
  torch::Tensor lyr_nrm_beta_grads = torch::empty_like(lyr_nrm_beta_weights);
  torch::Tensor input_weight_grads = torch::empty_like(input_weights);
  torch::Tensor output_weight_grads = torch::empty_like(output_weights);
  // Intermediate Tensor Allocations
  torch::Tensor dropout_add_grads = torch::empty_like(output_grads);
  torch::Tensor output_lin_grads = torch::empty_like(matmul2_results);
  torch::Tensor matmul2_grads = torch::empty_like(dropout_results);
  torch::Tensor input_lin_output_grads = torch::empty_like(input_lin_results);
  torch::Tensor input_lin_grads = torch::empty_like(inputs);

  auto q_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr());
  auto k_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr()) + head_dim;
  auto v_lin_results_ptr = static_cast<half*>(input_lin_results.data_ptr()) + 2 * head_dim;

  auto q_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr());
  auto k_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr()) + head_dim;
  auto v_lin_grads_ptr = static_cast<half*>(input_lin_output_grads.data_ptr()) + 2 * head_dim;

  char a_layout_n{'n'};
  char a_layout_t{'t'};
  char b_layout_n{'n'};
  char b_layout_t{'t'};

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

  // Dropout Add Backward
  apex_masked_scale_cuda<at::Half, float, uint32_t>(
      static_cast<at::Half const*>(output_grads.data_ptr()), static_cast<at::Half*>(dropout_add_grads.data_ptr()),
      static_cast<uint8_t const*>(dropout_add_mask.data_ptr()), total_tokens, (1.0 / (1.0 - dropout_prob)));

  // Output Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches, embed_dim,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(output_weights.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(dropout_add_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_lin_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Output Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, embed_dim, batches,
                                    static_cast<const void*>(&alpha),
                                    static_cast<const void*>(matmul2_results.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(dropout_add_grads.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(output_weight_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // MatMul2 Dgrad1
  gemm_switch_fp32accum(a_layout_t, b_layout_n, k_seq_len, q_seq_len, head_dim, alpha,
                        static_cast<const half*>(v_lin_results_ptr), lead_dim, batch_stride,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim, beta,
                        static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len, attn_batches);

  // Matmul2 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, alpha,
                        static_cast<const half*>(output_lin_grads.data_ptr()), head_dim * attn_batches, head_dim,
                        static_cast<const half*>(dropout_results.data_ptr()), k_seq_len, k_seq_len * q_seq_len, beta,
                        v_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Apply Dropout Mask and Scale by Dropout Probability
  apex_masked_scale_cuda<at::Half, float, uint32_t>(
      static_cast<at::Half const*>(matmul2_grads.data_ptr()), static_cast<at::Half*>(matmul2_grads.data_ptr()),
      static_cast<uint8_t const*>(dropout_mask.data_ptr()), dropout_elems, (1.0 / (1.0 - dropout_prob)));

  // Softmax Grad
  bool softmax_success = false;
  softmax_success = dispatch_softmax_backward<half, half, float>(
      static_cast<half*>(matmul2_grads.data_ptr()), static_cast<half*>(matmul2_grads.data_ptr()),
      reinterpret_cast<half const*>(softmax_results.data_ptr()), k_seq_len, k_seq_len, attn_batches * q_seq_len);
  assert(softmax_success);

  // Matmul1 Dgrad1
  gemm_switch_fp32accum(a_layout_n, b_layout_n, head_dim, q_seq_len, k_seq_len, scale, k_lin_results_ptr, lead_dim,
                        batch_stride, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, q_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Matmul1 Dgrad2
  gemm_switch_fp32accum(a_layout_n, b_layout_t, head_dim, k_seq_len, q_seq_len, scale, q_lin_results_ptr, lead_dim,
                        batch_stride, static_cast<half*>(matmul2_grads.data_ptr()), k_seq_len, k_seq_len * q_seq_len,
                        beta, k_lin_grads_ptr, lead_dim, batch_stride, attn_batches);

  // Input Linear Dgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_N, embed_dim, batches, output_lin_dim, static_cast<const void*>(&alpha),
      static_cast<const void*>(input_weights.data_ptr()), CUDA_R_16F, embed_dim,
      static_cast<const void*>(q_lin_grads_ptr), CUDA_R_16F, output_lin_dim, static_cast<const void*>(&beta),
      // static_cast<void*>(input_grads.data_ptr()),
      static_cast<void*>(input_lin_grads.data_ptr()), CUDA_R_16F, embed_dim, CUDA_R_32F,
      // CUBLAS_GEMM_ALGO10_TENSOR_OP));
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Input Linear Wgrad
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_T, embed_dim, output_lin_dim, batches,
                                    static_cast<const void*>(&alpha),
                                    // static_cast<const void*>(inputs.data_ptr()),
                                    static_cast<const void*>(lyr_nrm_results.data_ptr()), CUDA_R_16F, embed_dim,
                                    static_cast<const void*>(q_lin_grads_ptr), CUDA_R_16F, output_lin_dim,
                                    static_cast<const void*>(&beta), static_cast<void*>(input_weight_grads.data_ptr()),
                                    CUDA_R_16F, embed_dim, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  // Fused Layer Norm Bwd with Residual Add
  HostLayerNormGradient<half, float>(
      static_cast<const half*>(input_lin_grads.data_ptr()), static_cast<const half*>(output_grads.data_ptr()),
      static_cast<const float*>(lyr_nrm_mean.data_ptr()), static_cast<const float*>(lyr_nrm_invvar.data_ptr()), inputs,
      static_cast<int>(batches),    // n1
      static_cast<int>(embed_dim),  // n2
      static_cast<const half*>(lyr_nrm_gamma_weights.data_ptr()),
      static_cast<const half*>(lyr_nrm_beta_weights.data_ptr()), 1.0e-5, static_cast<half*>(input_grads.data_ptr()),
      static_cast<half*>(lyr_nrm_gamma_grads.data_ptr()), static_cast<half*>(lyr_nrm_beta_grads.data_ptr()));

  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

  return {input_grads, lyr_nrm_gamma_grads, lyr_nrm_beta_grads, input_weight_grads, output_weight_grads};
}

}  // end namespace cublas_gemmex
}  // end namespace self_norm_add
}  // end namespace multihead_attn


================================================
FILE: apex/contrib/csrc/multihead_attn/softmax.cuh
================================================
#pragma once
#include <curand_kernel.h>

#include <ATen/cuda/CUDAGraphsUtils.cuh>

#include "philox.cuh"

#ifdef OLD_GENERATOR_PATH
#include <ATen/CUDAGeneratorImpl.h>
#else
#include <ATen/cuda/CUDAGeneratorImpl.h>
#endif

#include <assert.h>
#include <cuda_fp16.h>
#include <stdint.h>

#include <cfloat>
#include <cmath>
#include <limits>

namespace {
template <typename Datatype, int ELEMENTS_PER_LDG>
__device__ __inline__ void copy_vector(Datatype* dst, const Datatype* src);

template <typename Datatype, int ELEMENTS_PER_LDG>
__device__ __inline__ void apply_mask(Datatype* dst, Datatype value, const uint8_t* src);

template <typename Datatype, int ELEMENTS_PER_LDG>
__device__ __inline__ void apply_additive_mask(Datatype* dst, const Datatype* additive_mask);

template <>
__device__ __inline__ void copy_vector<__half, 1>(__half* dst, const __half* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<float, 1>(float* dst, const float* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<__half, 4>(__half* dst, const __half* src) {
  *((float2*)dst) = *((float2*)src);
}
template <>
__device__ __inline__ void copy_vector<uint8_t, 1>(uint8_t* dst, const uint8_t* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<uint8_t, 4>(uint8_t* dst, const uint8_t* src) {
  *((half2*)dst) = *((half2*)src);
}

template <>
__device__ __inline__ void apply_mask<__half, 1>(__half* dst, __half value, const uint8_t* src) {
  if (*src == 1) {
    *dst = value;
  }
}

template <>
__device__ __inline__ void apply_additive_mask<__half, 1>(__half* dst, const __half* additive_mask) {
  *dst += *additive_mask;
}

template <>
__device__ __inline__ void apply_additive_mask<__half, 4>(__half* dst, const __half* additive_mask) {
  *dst += *additive_mask;
  *(dst + 1) += *(additive_mask + 1);
  *(dst + 2) += *(additive_mask + 2);
  *(dst + 3) += *(additive_mask + 3);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Warp Softmax forward
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE = 32,
          int ELEMENTS_PER_LDG_STG = 1>
__global__ void softmax_warp_forward(input_t* dst, const output_t* src, int batch_size, int stride, int element_count) {
  assert(ELEMENTS_PER_LDG_STG == 1);

  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x;

  src += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  dst += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;

  // load data from global memory
  input_t elements_input[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
#pragma unroll
      for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
        elements_input[i][it + element] = -std::numeric_limits<float>::infinity();
      }

      if (element_index < batch_element_count) {
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&elements_input[i][it], src + i * element_count + it * WARP_SIZE);
      }
    }
  }

  // convert input_t to acc_t
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = elements_input[i][it];
    }
  }

  constexpr uint32_t FULL_MASK = 0xffffffff;

  // compute local max_value

  // take the max_value of the first element to avoid one max call
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
  }

#pragma unroll
  for (int it = 1; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }

// reduction max_value
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float val[WARP_BATCH];
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      val[i] = __shfl_xor_sync(FULL_MASK, max_value[i], offset, WARP_SIZE);
    }
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = max_value[i] > val[i] ? max_value[i] : val[i];
    }
  }

  // compute local sum
  acc_t sum[WARP_BATCH]{0.0f};

#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      // elements[i][it] = expf(elements[i][it] - max_value[i]);
      elements[i][it] = std::exp(elements[i][it] - max_value[i]);
      sum[i] += elements[i][it];
    }
  }

// reduction sum
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // dst[i * element_count + it * WARP_SIZE] = elements[i][it] / sum[i];
        output_t out[ELEMENTS_PER_LDG_STG];
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = elements[i][it + element] / sum[i];
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);
      } else {
        break;
      }
    }
  }
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t>
using softmax_forward_func = void (*)(input_t* dst, const output_t* src, int batch_size, int stride, int element_count);

template <typename input_t, typename output_t, typename acc_t>
bool warp_softmax_kernel(int log2_elements, int& warp_size, int& batches_per_warp,
                         softmax_forward_func<input_t, output_t>& kernel) {
  // determine size of a warp
  const int next_power_of_two = 1 << log2_elements;
  warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

  // determine how many batches a warp should process.
  batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

  switch (log2_elements) {
    case 0:  // 1
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 1, 1>;
      break;
    case 1:  // 2
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 2, 1>;
      break;
    case 2:  // 4
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 4, 1>;
      break;
    case 3:  // 8
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 8, 1>;
      break;
    case 4:  // 16
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 16, 1>;
      break;
    case 5:  // 32
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 32, 1>;
      break;
    case 6:  // 64
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 2, 2, 32, 1>;
      break;
    case 7:  // 128
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 2, 4, 32, 1>;
      break;
    case 8:  // 256
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 1, 8, 32, 1>;
      break;
    case 9:  // 512
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 1, 16, 32, 1>;
      break;
    case 10:  // 1024
      kernel = &softmax_warp_forward<input_t, output_t, acc_t, 1, 32, 32, 1>;
      break;
    default:
      return false;
  }
  return true;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_softmax(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride,
                      int batch_count) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 1024) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;

    softmax_forward_func<input_t, output_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_softmax_kernel<input_t, output_t, acc_t>(log2_elements, warp_size, batches_per_warp, kernel)) {
      return false;
    }

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);

    // compute launch size
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);

    // launch
    kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, batch_count, softmax_elements_stride,
                                                                     softmax_elements);
    return true;
  }
  return false;
}

template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE,
          int ELEMENTS_PER_LDG_STG>
__global__ void additive_masked_softmax_dropout_warp_forward_vec4(output_t* dst, uint8_t* dropout_mask,
                                                                  const input_t* src, const input_t* pad_mask,
                                                                  int batch_size, int stride, int element_count,
                                                                  int pad_batch_stride, at::PhiloxCudaState philox_args,
                                                                  float p) {
  assert(ELEMENTS_PER_LDG_STG == 4);
  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
  int tid = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
  acc_t pinv = acc_t(1) / p;
  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x;
  // vectorize if element_count is multiple of 4, else don't vectorize
  input_t elements_input[WARP_BATCH][WARP_ITERATIONS];

  int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  src += thread_offset;
  dst += thread_offset;
  dropout_mask += thread_offset;

  // load data from global memory
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    int pad_thread_offset = ((first_batch + i) / pad_batch_stride) * stride + ELEMENTS_PER_LDG_STG * local_idx;
    const half* curr_mask = pad_mask + pad_thread_offset;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
#pragma unroll
      for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
        // masking_value is a large negative value
        elements_input[i][it + element] = -10000;
      }

      if (element_index < batch_element_count) {
        int itr_jmp = it * WARP_SIZE;
        int itr_idx = i * element_count + itr_jmp;
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&elements_input[i][it], src + itr_idx);
        apply_additive_mask<input_t, ELEMENTS_PER_LDG_STG>(
            &elements_input[i][it],
            curr_mask + itr_jmp);  //(__half)-std::numeric_limits<float>::infinity()
      }
    }
  }
  // convert input_t to acc_t
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = elements_input[i][it];
    }
  }

  constexpr uint32_t FULL_MASK = 0xffffffff;

  // compute local max_value

  // take the max_value of the first element to avoid one max call
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
  }

#pragma unroll
  for (int it = 1; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }

// reduction max_value
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float val[WARP_BATCH];
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      val[i] = __shfl_xor_sync(FULL_MASK, max_value[i], offset, WARP_SIZE);
    }
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = max_value[i] > val[i] ? max_value[i] : val[i];
    }
  }

  // compute local sum
  acc_t sum[WARP_BATCH]{0.0f};

#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = std::exp(elements[i][it] - max_value[i]);
      sum[i] += elements[i][it];
    }
  }

// reduction sum
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }
  auto seeds = at::cuda::philox::unpack(philox_args);
  Philox ph(std::get<0>(seeds), tid, std::get<1>(seeds));
  uint8_t rands[WARP_BATCH][WARP_ITERATIONS];
  float4 rand_num;
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        rand_num = uniform4(ph());
        rands[i][it] = (rand_num.x <= p) > 0.5;
        rands[i][it + 1] = (rand_num.y <= p) > 0.5;
        rands[i][it + 2] = (rand_num.z <= p) > 0.5;
        rands[i][it + 3] = (rand_num.w <= p) > 0.5;
        copy_vector<uint8_t, ELEMENTS_PER_LDG_STG>(dropout_mask + i * element_count + it * WARP_SIZE, &rands[i][it]);
      }
    }
  }

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        output_t out[ELEMENTS_PER_LDG_STG];
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = rands[i][it + element] * (pinv * (elements[i][it + element] / sum[i]));
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);

      } else {
        break;
      }
    }
  }
}

template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE,
          int ELEMENTS_PER_LDG_STG>
__global__ void additive_masked_softmax_dropout_warp_forward(output_t* dst, uint8_t* dropout_mask, const input_t* src,
                                                             const input_t* pad_mask, int batch_size, int stride,
                                                             int element_count, int pad_batch_stride,
                                                             at::PhiloxCudaState philox_args, float p) {
  assert(ELEMENTS_PER_LDG_STG == 1);
  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
  int tid = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
  acc_t pinv = acc_t(1) / p;
  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x;
  // vectorize if element_count is multiple of 4, else don't vectorize
  input_t elements_input[WARP_BATCH][WARP_ITERATIONS];

  int thread_offset = first_batch * stride + local_idx;
  src += thread_offset;
  dst += thread_offset;
  dropout_mask += thread_offset;

  // load data from global memory
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    int pad_thread_offset = ((first_batch + i) / pad_batch_stride) * stride + local_idx;
    const half* curr_mask = pad_mask + pad_thread_offset;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += 1) {
      int element_index = local_idx + it * WARP_SIZE;
#pragma unroll
      for (int element = 0; element < 1; ++element) {
        // masking_value is a large negative value
        elements_input[i][it + element] = -10000;
      }

      if (element_index < batch_element_count) {
        int itr_jmp = it * WARP_SIZE;
        int itr_idx = i * element_count + itr_jmp;
        copy_vector<input_t, 1>(&elements_input[i][it], src + itr_idx);
        apply_additive_mask<input_t, 1>(&elements_input[i][it], curr_mask + itr_jmp);
      }
    }
  }
  // convert input_t to acc_t
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = elements_input[i][it];
    }
  }

  constexpr uint32_t FULL_MASK = 0xffffffff;

  // compute local max_value

  // take the max_value of the first element to avoid one max call
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
  }

#pragma unroll
  for (int it = 1; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }

// reduction max_value
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float val[WARP_BATCH];
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      val[i] = __shfl_xor_sync(FULL_MASK, max_value[i], offset, WARP_SIZE);
    }
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = max_value[i] > val[i] ? max_value[i] : val[i];
    }
  }

  // compute local sum
  acc_t sum[WARP_BATCH]{0.0f};

#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = std::exp(elements[i][it] - max_value[i]);
      sum[i] += elements[i][it];
    }
  }

// reduction sum
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }
  curandStatePhilox4_32_10_t state;
  auto seeds = at::cuda::philox::unpack(philox_args);
  curand_init(std::get<0>(seeds), tid, std::get<1>(seeds), &state);

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += 1) {
      int element_index = local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        output_t out[1];
        acc_t softmax_out[1];
        uint8_t dropout_mask_temp[1];
        // generate a vector of random numbers here
        float rand = curand_uniform(&state);
        float* rand_ptr = (float*)(&rand);
#pragma unroll
        for (int element = 0; element < 1; ++element) {
          softmax_out[element] = (elements[i][it + element] / sum[i]);
          rand_ptr[element] = rand_ptr[element] <= p;
          out[element] = rand_ptr[element] * pinv * softmax_out[element];
          dropout_mask_temp[element] = rand_ptr[element] > 0.5;  // just to distinguish 0.0f and 1.0f
        }
        copy_vector<output_t, 1>(dst + i * element_count + it * WARP_SIZE, out);
        copy_vector<uint8_t, 1>(dropout_mask + i * element_count + it * WARP_SIZE, dropout_mask_temp);

      } else {
        break;
      }
    }
  }
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t, typename acc_t>
using additive_masked_softmax_dropout_forward_func = void (*)(output_t* dst, uint8_t* dropout_mask, const input_t* src,
                                                              const input_t* pad_mask, int batch_size, int stride,
                                                              int element_count, int pad_batch_stride,
                                                              at::PhiloxCudaState philox_args, float p);

template <typename input_t, typename output_t, typename acc_t>
bool warp_additive_masked_softmax_dropout_kernel(
    int element_count, int log2_elements, int& warp_size, int& batches_per_warp,
    additive_masked_softmax_dropout_forward_func<input_t, output_t, acc_t>& kernel) {
  // determine size of a warp
  const int next_power_of_two = 1 << log2_elements;
  warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

  // determine how many batches a warp should process.
  batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
  bool flag_vec4 = (element_count % 4 == 0);
  switch (log2_elements) {
    case 0:  // 1
      kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 2, 1, 1, 1>;
      break;
    case 1:  // 2
      kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 2, 1, 2, 1>;
      break;
    case 2:  // 4
      kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 2, 1, 4, 1>;
      break;
    case 3:  // 8
      kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 2, 1, 8, 1>;
      break;
    case 4:  // 16
      kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 2, 1, 16, 1>;
      break;
    case 5:  // 32
      kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 2, 1, 32, 1>;
      break;
    case 6:  // 64
      kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 2, 2, 32, 1>;
      break;
    case 7:  // 128
      if (flag_vec4)
        kernel = &additive_masked_softmax_dropout_warp_forward_vec4<input_t, output_t, acc_t, 2, 4, 32, 4>;
      else
        kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 2, 4, 32, 1>;
      break;
    case 8:  // 256
      if (flag_vec4)
        kernel = &additive_masked_softmax_dropout_warp_forward_vec4<input_t, output_t, acc_t, 1, 8, 32, 4>;
      else
        kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 1, 8, 32, 1>;
      break;
    case 9:  // 512
      if (flag_vec4)
        kernel = &additive_masked_softmax_dropout_warp_forward_vec4<input_t, output_t, acc_t, 1, 16, 32, 4>;
      else
        kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 1, 16, 32, 1>;
      break;
    case 10:  // 1024
      if (flag_vec4)
        kernel = &additive_masked_softmax_dropout_warp_forward_vec4<input_t, output_t, acc_t, 1, 32, 32, 4>;
      else
        kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 1, 32, 32, 1>;
      break;
    case 11:  // 2048
      if (flag_vec4)
        kernel = &additive_masked_softmax_dropout_warp_forward_vec4<input_t, output_t, acc_t, 1, 64, 32, 4>;
      else
        kernel = &additive_masked_softmax_dropout_warp_forward<input_t, output_t, acc_t, 1, 64, 32, 1>;
      break;
    default:
      return false;
  }
  return true;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_additive_masked_softmax_dropout(output_t* dst, uint8_t* dropout_mask, const input_t* src,
                                              const input_t* pad_mask, int totalElements, int softmax_elements,
                                              int softmax_elements_stride, int batch_count, int pad_batch_stride,
                                              float p,
                                              cudaStream_t streamid)  // p is the probability to keep, not drop
{
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 2048) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;

    additive_masked_softmax_dropout_forward_func<input_t, output_t, acc_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_additive_masked_softmax_dropout_kernel<input_t, output_t, acc_t>(softmax_elements, log2_elements,
                                                                               warp_size, batches_per_warp, kernel)) {
      return false;
    }

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    c10::optional<at::Generator> gen_;
    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(gen_, at::cuda::detail::getDefaultCUDAGenerator());
    int64_t counter_offset = (totalElements / (blocks * threads_per_block) + 1);
    at::PhiloxCudaState rng_engine_inputs;
    {
      std::lock_guard<std::mutex> lock(gen->mutex_);
      rng_engine_inputs = gen->philox_cuda_state(counter_offset);
    }

    // compute launch size
    dim3 threads(warp_size, warps_per_block, 1);

    // launch
    kernel<<<blocks, threads, 0, streamid>>>(dst, dropout_mask, src, pad_mask, batch_count, softmax_elements_stride,
                                             softmax_elements, pad_batch_stride, rng_engine_inputs, p);
    return true;
  }
  return false;
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE = 32,
          int ELEMENTS_PER_LDG_STG = 1>
__global__ void additive_masked_softmax_warp_forward(input_t* dst, const output_t* src, const input_t* pad_mask,
                                                     int batch_size, int stride, int element_count,
                                                     int pad_batch_stride) {
  assert(ELEMENTS_PER_LDG_STG == 1);

  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x;

  int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  src += thread_offset;
  dst += thread_offset;

  // load data from global memory
  input_t elements_input[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    int pad_thread_offset = ((first_batch + i) / pad_batch_stride) * stride + ELEMENTS_PER_LDG_STG * local_idx;
    const half* curr_mask = pad_mask + pad_thread_offset;
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
#pragma unroll
      for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
        // masking_value is a large negative value
        elements_input[i][it + element] = -10000;
      }

      if (element_index < batch_element_count) {
        int itr_jmp = it * WARP_SIZE;
        int itr_idx = i * element_count + itr_jmp;
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&elements_input[i][it], src + itr_idx);
        // apply_mask<input_t, ELEMENTS_PER_LDG_STG>(&elements_input[i][it],
        //                                          (__half)-std::numeric_limits<float>::infinity(),
        //                                          curr_mask + itr_jmp);
        elements_input[i][it] += *(curr_mask + itr_jmp);
      }
    }
  }

  // convert input_t to acc_t
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = elements_input[i][it];
    }
  }

  constexpr uint32_t FULL_MASK = 0xffffffff;

  // compute local max_value

  // take the max_value of the first element to avoid one max call
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
  }

#pragma unroll
  for (int it = 1; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }

// reduction max_value
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float val[WARP_BATCH];
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      val[i] = __shfl_xor_sync(FULL_MASK, max_value[i], offset, WARP_SIZE);
    }
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = max_value[i] > val[i] ? max_value[i] : val[i];
    }
  }

  // compute local sum
  acc_t sum[WARP_BATCH]{0.0f};

#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      // elements[i][it] = expf(elements[i][it] - max_value[i]);
      elements[i][it] = std::exp(elements[i][it] - max_value[i]);
      sum[i] += elements[i][it];
    }
  }

// reduction sum
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // dst[i * element_count + it * WARP_SIZE] = elements[i][it] / sum[i];
        output_t out[ELEMENTS_PER_LDG_STG];
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = elements[i][it + element] / sum[i];
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);
      } else {
        break;
      }
    }
  }
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t>
using additive_masked_softmax_forward_func = void (*)(input_t* dst, const output_t* src, const half* pad_mask,
                                                      int batch_size, int stride, int element_count,
                                                      int pad_batch_stride);

template <typename input_t, typename output_t, typename acc_t>
bool warp_additive_masked_softmax_kernel(int log2_elements, int& warp_size, int& batches_per_warp,
                                         additive_masked_softmax_forward_func<input_t, output_t>& kernel) {
  // determine size of a warp
  const int next_power_of_two = 1 << log2_elements;
  warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

  // determine how many batches a warp should process.
  batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

  switch (log2_elements) {
    case 0:  // 1
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 1, 1>;
      break;
    case 1:  // 2
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 2, 1>;
      break;
    case 2:  // 4
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 4, 1>;
      break;
    case 3:  // 8
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 8, 1>;
      break;
    case 4:  // 16
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 16, 1>;
      break;
    case 5:  // 32
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 32, 1>;
      break;
    case 6:  // 64
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 2, 32, 1>;
      break;
    case 7:  // 128
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 4, 32, 1>;
      break;
    case 8:  // 256
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 8, 32, 1>;
      break;
    case 9:  // 512
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 16, 32, 1>;
      break;
    case 10:  // 1024
      kernel = &additive_masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 32, 32, 1>;
      break;
    default:
      return false;
  }
  return true;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_additive_masked_softmax(output_t* dst, const input_t* src, const input_t* pad_mask, int softmax_elements,
                                      int softmax_elements_stride, int batch_count, int pad_batch_stride) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 1024) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;

    additive_masked_softmax_forward_func<input_t, output_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_additive_masked_softmax_kernel<input_t, output_t, acc_t>(log2_elements, warp_size, batches_per_warp,
                                                                       kernel)) {
      return false;
    }

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);

    // compute launch size
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);

    // launch
    kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        dst, src, pad_mask, batch_count, softmax_elements_stride, softmax_elements, pad_batch_stride);
    return true;
  }
  return false;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_additive_masked_softmax_stream(output_t* dst, const input_t* src, const input_t* pad_mask,
                                             int softmax_elements, int softmax_elements_stride, int batch_count,
                                             int pad_batch_stride, cudaStream_t streamid) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 1024) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;
    additive_masked_softmax_forward_func<input_t, output_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_additive_masked_softmax_kernel<input_t, output_t, acc_t>(log2_elements, warp_size, batches_per_warp,
                                                                       kernel)) {
      return false;
    }
    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);
    // compute launch size
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);
    // launch
    kernel<<<blocks, threads, 0, streamid>>>(dst, src, pad_mask, batch_count, softmax_elements_stride, softmax_elements,
                                             pad_batch_stride);
    return true;
  }
  return false;
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE = 32,
          int ELEMENTS_PER_LDG_STG = 1>
__global__ void masked_softmax_warp_forward(input_t* dst, const output_t* src, const uint8_t* pad_mask, int batch_size,
                                            int stride, int element_count, int pad_batch_stride) {
  assert(ELEMENTS_PER_LDG_STG == 1);

  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x;

  int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  src += thread_offset;
  dst += thread_offset;

  // load data from global memory
  input_t elements_input[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    int pad_thread_offset = ((first_batch + i) / pad_batch_stride) * stride + ELEMENTS_PER_LDG_STG * local_idx;
    const uint8_t* curr_mask = pad_mask + pad_thread_offset;
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
#pragma unroll
      for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
        elements_input[i][it + element] = -std::numeric_limits<float>::infinity();
      }

      if (element_index < batch_element_count) {
        int itr_jmp = it * WARP_SIZE;
        int itr_idx = i * element_count + itr_jmp;
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&elements_input[i][it], src + itr_idx);
        apply_mask<input_t, ELEMENTS_PER_LDG_STG>(
            &elements_input[i][it], __float2half(-std::numeric_limits<float>::infinity()), curr_mask + itr_jmp);
      }
    }
  }

  // convert input_t to acc_t
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = elements_input[i][it];
    }
  }

  constexpr uint32_t FULL_MASK = 0xffffffff;

  // compute local max_value

  // take the max_value of the first element to avoid one max call
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
  }

#pragma unroll
  for (int it = 1; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }

// reduction max_value
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float val[WARP_BATCH];
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      val[i] = __shfl_xor_sync(FULL_MASK, max_value[i], offset, WARP_SIZE);
    }
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = max_value[i] > val[i] ? max_value[i] : val[i];
    }
  }

  // compute local sum
  acc_t sum[WARP_BATCH]{0.0f};

#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      // elements[i][it] = expf(elements[i][it] - max_value[i]);
      elements[i][it] = std::exp(elements[i][it] - max_value[i]);
      sum[i] += elements[i][it];
    }
  }

// reduction sum
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // dst[i * element_count + it * WARP_SIZE] = elements[i][it] / sum[i];
        output_t out[ELEMENTS_PER_LDG_STG];
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = elements[i][it + element] / sum[i];
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);
      } else {
        break;
      }
    }
  }
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t>
using masked_softmax_forward_func = void (*)(input_t* dst, const output_t* src, const uint8_t* pad_mask, int batch_size,
                                             int stride, int element_count, int pad_batch_stride);

template <typename input_t, typename output_t, typename acc_t>
bool warp_masked_softmax_kernel(int log2_elements, int& warp_size, int& batches_per_warp,
                                masked_softmax_forward_func<input_t, output_t>& kernel) {
  // determine size of a warp
  const int next_power_of_two = 1 << log2_elements;
  warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

  // determine how many batches a warp should process.
  batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

  switch (log2_elements) {
    case 0:  // 1
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 1, 1>;
      break;
    case 1:  // 2
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 2, 1>;
      break;
    case 2:  // 4
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 4, 1>;
      break;
    case 3:  // 8
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 8, 1>;
      break;
    case 4:  // 16
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 16, 1>;
      break;
    case 5:  // 32
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 32, 1>;
      break;
    case 6:  // 64
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 2, 32, 1>;
      break;
    case 7:  // 128
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 4, 32, 1>;
      break;
    case 8:  // 256
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 8, 32, 1>;
      break;
    case 9:  // 512
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 16, 32, 1>;
      break;
    case 10:  // 1024
      kernel = &masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 32, 32, 1>;
      break;
    default:
      return false;
  }
  return true;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_masked_softmax(output_t* dst, const input_t* src, const uint8_t* pad_mask, int softmax_elements,
                             int softmax_elements_stride, int batch_count, int pad_batch_stride) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 1024) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;

    masked_softmax_forward_func<input_t, output_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_masked_softmax_kernel<input_t, output_t, acc_t>(log2_elements, warp_size, batches_per_warp, kernel)) {
      return false;
    }

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);

    // compute launch size
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);

    // launch
    kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        dst, src, pad_mask, batch_count, softmax_elements_stride, softmax_elements, pad_batch_stride);
    return true;
  }
  return false;
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE = 32,
          int ELEMENTS_PER_LDG_STG = 1>
__global__ void time_masked_softmax_warp_forward(input_t* dst, const output_t* src, const uint8_t* pad_mask,
                                                 int batch_size, int stride, int element_count, int mod_seq_len) {
  assert(ELEMENTS_PER_LDG_STG == 1);

  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x;

  int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  src += thread_offset;
  dst += thread_offset;

  // load data from global memory
  input_t elements_input[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    int pad_thread_offset = ((first_batch + i) % mod_seq_len) * stride + ELEMENTS_PER_LDG_STG * local_idx;
    const uint8_t* curr_mask = pad_mask + pad_thread_offset;
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
#pragma unroll
      for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
        elements_input[i][it + element] = -std::numeric_limits<float>::infinity();
      }

      if (element_index < batch_element_count) {
        int itr_jmp = it * WARP_SIZE;
        int itr_idx = i * element_count + itr_jmp;
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&elements_input[i][it], src + itr_idx);
        apply_mask<input_t, ELEMENTS_PER_LDG_STG>(
            &elements_input[i][it], __float2half(-std::numeric_limits<float>::infinity()), curr_mask + itr_jmp);
      }
    }
  }

  // convert input_t to acc_t
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = elements_input[i][it];
    }
  }

  constexpr uint32_t FULL_MASK = 0xffffffff;

  // compute local max_value

  // take the max_value of the first element to avoid one max call
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
  }

#pragma unroll
  for (int it = 1; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }

// reduction max_value
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float val[WARP_BATCH];
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      val[i] = __shfl_xor_sync(FULL_MASK, max_value[i], offset, WARP_SIZE);
    }
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = max_value[i] > val[i] ? max_value[i] : val[i];
    }
  }

  // compute local sum
  acc_t sum[WARP_BATCH]{0.0f};

#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      // elements[i][it] = expf(elements[i][it] - max_value[i]);
      elements[i][it] = std::exp(elements[i][it] - max_value[i]);
      sum[i] += elements[i][it];
    }
  }

// reduction sum
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // dst[i * element_count + it * WARP_SIZE] = elements[i][it] / sum[i];
        output_t out[ELEMENTS_PER_LDG_STG];
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = elements[i][it + element] / sum[i];
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);
      } else {
        break;
      }
    }
  }
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t>
using time_masked_softmax_forward_func = void (*)(input_t* dst, const output_t* src, const uint8_t* pad_mask,
                                                  int batch_size, int stride, int element_count, int mod_seq_len);

template <typename input_t, typename output_t, typename acc_t>
bool warp_time_masked_softmax_kernel(int log2_elements, int& warp_size, int& batches_per_warp,
                                     time_masked_softmax_forward_func<input_t, output_t>& kernel) {
  // determine size of a warp
  const int next_power_of_two = 1 << log2_elements;
  warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

  // determine how many batches a warp should process.
  batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

  switch (log2_elements) {
    case 0:  // 1
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 1, 1>;
      break;
    case 1:  // 2
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 2, 1>;
      break;
    case 2:  // 4
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 4, 1>;
      break;
    case 3:  // 8
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 8, 1>;
      break;
    case 4:  // 16
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 16, 1>;
      break;
    case 5:  // 32
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 1, 32, 1>;
      break;
    case 6:  // 64
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 2, 32, 1>;
      break;
    case 7:  // 128
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 2, 4, 32, 1>;
      break;
    case 8:  // 256
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 8, 32, 1>;
      break;
    case 9:  // 512
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 16, 32, 1>;
      break;
    case 10:  // 1024
      kernel = &time_masked_softmax_warp_forward<input_t, output_t, acc_t, 1, 32, 32, 1>;
      break;
    default:
      return false;
  }
  return true;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_time_masked_softmax(output_t* dst, const input_t* src, const uint8_t* pad_mask, int softmax_elements,
                                  int softmax_elements_stride, int batch_count, int mod_seq_len) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 1024) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;

    time_masked_softmax_forward_func<input_t, output_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_time_masked_softmax_kernel<input_t, output_t, acc_t>(log2_elements, warp_size, batches_per_warp,
                                                                   kernel)) {
      return false;
    }

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);

    // compute launch size
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);

    // launch
    kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        dst, src, pad_mask, batch_count, softmax_elements_stride, softmax_elements, mod_seq_len);
    return true;
  }
  return false;
}

int log2_ceil_native(int value) {
  int log2_value = 0;
  while ((1 << log2_value) < value) ++log2_value;
  return log2_value;
}

template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize,
                                                  unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  return __shfl_xor(value, laneMask, width);
#endif
}

template <typename acc_t, int WARP_BATCH, int WARP_SIZE>
__device__ __forceinline__ void warp_reduce_sum(acc_t* sum) {
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE);
      sum[i] = sum[i] + b;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Warp softmax backward functions as fused variants of
// at::softmax_backward_data function
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// softmax backward data function is taken from native pytorch, elementwise mul
// is fused in the epolog, as well as masking and scaling for fusing dropout

template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax>
__global__ void masked_scale_softmax_warp_backward_masked_dgrad(output_t* gradInput, const input_t* grad,
                                                                const input_t* output, const uint8_t* mask,
                                                                const uint8_t* pad_mask, acc_t scale, int batch_size,
                                                                int stride, int element_count, int heads) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
  // warp_size of method warp_softmax_backward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
  constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;

  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x % WARP_SIZE;

  // the first element to process by the current thread
  int thread_offset = first_batch * stride + local_idx;
  grad += thread_offset;
  output += thread_offset;
  gradInput += thread_offset;
  mask += thread_offset;

  // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified
  // to one loop, but I think doing so would obfuscate the logic of the
  // algorithm, thus I chose to keep the nested loops. This should have no
  // impact on performance because the loops are unrolled anyway.

  // load data from global memory
  acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
  acc_t output_reg[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      int element_index = local_idx + it * WARP_SIZE;
      if (element_index < batch_element_count) {
        grad_reg[i][it] = (input_t)((acc_t)mask[i * element_count + it * WARP_SIZE] *
                                    (acc_t)grad[i * element_count + it * WARP_SIZE] * (acc_t)scale) *
                          output[i * element_count + it * WARP_SIZE];
        output_reg[i][it] = output[i * element_count + it * WARP_SIZE];
      } else {
        grad_reg[i][it] = acc_t(0);
        output_reg[i][it] = acc_t(0);
      }
    }
  }

  acc_t sum[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    sum[i] = grad_reg[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      sum[i] += grad_reg[i][it];
    }
  }
  warp_reduce_sum<acc_t, WARP_BATCH, WARP_SIZE>(sum);

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      int element_index = local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // compute gradients
        int total_ind = thread_offset + i * element_count + it * WARP_SIZE;
        int pad_mask_ind =
            element_count * (total_ind / (heads * element_count * element_count)) + total_ind % element_count;
        uint8_t pad_mask_element = 1 - pad_mask[pad_mask_ind];
        if (pad_mask_element == 0)
          gradInput[i * element_count + it * WARP_SIZE] = 0;
        else {
          if (is_log_softmax) {
            gradInput[i * element_count + it * WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]);
          } else {
            gradInput[i * element_count + it * WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]);
          }
        }
      }
    }
  }
}
template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_masked_scale_softmax_backward_masked_out(output_t* grad_input, const input_t* grad, const input_t* output,
                                                       const uint8_t* mask, const uint8_t* pad_mask, acc_t scale,
                                                       int softmax_elements, int softmax_elements_stride,
                                                       int batch_count, int heads) {
  TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 1024);
  if (softmax_elements == 0) {
    return;
  } else {
    int log2_elements = log2_ceil_native(softmax_elements);
    const int next_power_of_two = 1 << log2_elements;

    // This value must match the WARP_SIZE constexpr value computed inside
    // softmax_warp_backward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside
    // softmax_warp_backward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 0, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 1:  // 2
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 1, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 2:  // 4
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 2, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 3:  // 8
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 3, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 4:  // 16
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 4, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 5:  // 32
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 5, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 6:  // 64
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 6, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 7:  // 128
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 7, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 8:  // 256
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 8, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 9:  // 512
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 9, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      case 10:  // 1024
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 10, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, mask, pad_mask, scale,
                                                                       batch_count, softmax_elements_stride,
                                                                       softmax_elements, heads);
        break;
      default:
        break;
    }
  }
}

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_masked_scale_softmax_backward_masked_out_stream(output_t* grad_input, const input_t* grad,
                                                              const input_t* output, const uint8_t* mask,
                                                              const uint8_t* pad_mask, acc_t scale,
                                                              int softmax_elements, int softmax_elements_stride,
                                                              int batch_count, int heads, cudaStream_t streamid) {
  TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 1024);
  if (softmax_elements == 0) {
    return;
  } else {
    int log2_elements = log2_ceil_native(softmax_elements);
    const int next_power_of_two = 1 << log2_elements;
    // This value must match the WARP_SIZE constexpr value computed inside
    // softmax_warp_backward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
    // This value must match the WARP_BATCH constexpr value computed inside
    // softmax_warp_backward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 0, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 1:  // 2
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 1, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 2:  // 4
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 2, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 3:  // 8
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 3, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 4:  // 16
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 4, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 5:  // 32
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 5, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 6:  // 64
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 6, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 7:  // 128
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 7, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 8:  // 256
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 8, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 9:  // 512
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 9, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      case 10:  // 1024
        masked_scale_softmax_warp_backward_masked_dgrad<input_t, output_t, acc_t, 10, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, pad_mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements, heads);
        break;
      default:
        break;
    }
  }
}

template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax>
__global__ void masked_scale_softmax_warp_backward(output_t* gradInput, const input_t* grad, const input_t* output,
                                                   const uint8_t* mask, acc_t scale, int batch_size, int stride,
                                                   int element_count) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
  // warp_size of method warp_softmax_backward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
  constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;

  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x % WARP_SIZE;

  // the first element to process by the current thread
  int thread_offset = first_batch * stride + local_idx;
  grad += thread_offset;
  output += thread_offset;
  gradInput += thread_offset;
  mask += thread_offset;

  // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified
  // to one loop, but I think doing so would obfuscate the logic of the
  // algorithm, thus I chose to keep the nested loops. This should have no
  // impact on performance because the loops are unrolled anyway.

  // load data from global memory
  acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
  acc_t output_reg[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      int element_index = local_idx + it * WARP_SIZE;
      if (element_index < batch_element_count) {
        grad_reg[i][it] = (input_t)((acc_t)mask[i * element_count + it * WARP_SIZE] *
                                    (acc_t)grad[i * element_count + it * WARP_SIZE] * (acc_t)scale) *
                          output[i * element_count + it * WARP_SIZE];
        output_reg[i][it] = output[i * element_count + it * WARP_SIZE];
      } else {
        grad_reg[i][it] = acc_t(0);
        output_reg[i][it] = acc_t(0);
      }
    }
  }

  acc_t sum[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    sum[i] = grad_reg[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      sum[i] += grad_reg[i][it];
    }
  }
  warp_reduce_sum<acc_t, WARP_BATCH, WARP_SIZE>(sum);

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      int element_index = local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // compute gradients
        if (is_log_softmax) {
          gradInput[i * element_count + it * WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]);
        } else {
          gradInput[i * element_count + it * WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]);
        }
      }
    }
  }
}

template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE = 32,
          int ELEMENTS_PER_LDG_STG, bool is_log_softmax>
__global__ void masked_scale_softmax_warp_backward_recompute(output_t* gradInput, const input_t* grad,
                                                             const input_t* softmax_input, const input_t* pad_mask,
                                                             const uint8_t* mask, acc_t scale, int batch_size,
                                                             int stride, int pad_batch_stride, int element_count) {
  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x % WARP_SIZE;
  // vectorize if a row length is multiple of 4
  int flag_vec4 = element_count & 3 == 0;
  acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
  input_t elements_input[WARP_BATCH][WARP_ITERATIONS];

  // the first element to process by the current thread
  int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;

  grad += thread_offset;
  softmax_input += thread_offset;
  gradInput += thread_offset;
  mask += thread_offset;

  // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified
  // to one loop, but I think doing so would obfuscate the logic of the
  // algorithm, thus I chose to keep the nested loops. This should have no
  // impact on performance because the loops are unrolled anyway.

  // load data from global memory
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    int pad_thread_offset = ((first_batch + i) / pad_batch_stride) * stride + ELEMENTS_PER_LDG_STG * local_idx;
    const input_t* curr_mask = pad_mask + pad_thread_offset;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;

#pragma unroll
      for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
        // masking_value is a large negative value
        elements_input[i][it + element] = -10000;
        grad_reg[i][it + element] = acc_t(0);
      }

      if (element_index < batch_element_count) {
        int itr_jmp = it * WARP_SIZE;
        int itr_idx = i * element_count + itr_jmp;
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&elements_input[i][it], softmax_input + itr_idx);
        apply_additive_mask<input_t, ELEMENTS_PER_LDG_STG>(
            &elements_input[i][it],
            curr_mask + itr_jmp);  //(__half)-std::numeric_limits<float>::infinity()
        uint8_t mask_temp[ELEMENTS_PER_LDG_STG];
        input_t grad_temp[ELEMENTS_PER_LDG_STG];
        copy_vector<uint8_t, ELEMENTS_PER_LDG_STG>(&mask_temp[0], mask + itr_idx);
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&grad_temp[0], grad + itr_idx);
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          grad_reg[i][it + element] = ((acc_t)mask_temp[element] * (acc_t)grad_temp[element] * (acc_t)scale);
        }
      }
    }
  }
  // load data from global memory

  // convert input_t to acc_t
  // TODO : remove this, input is already acc_t type in register
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = elements_input[i][it];
    }
  }

  constexpr uint32_t FULL_MASK = 0xffffffff;

  // compute local max_value

  // take the max_value of the first element to avoid one max call
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
  }

#pragma unroll
  for (int it = 1; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }

// reduction max_value
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float val[WARP_BATCH];
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      val[i] = __shfl_xor_sync(FULL_MASK, max_value[i], offset, WARP_SIZE);
    }
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      max_value[i] = max_value[i] > val[i] ? max_value[i] : val[i];
    }
  }

  // compute local sum
  acc_t sum[WARP_BATCH]{0.0f};

#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      // elements[i][it] = expf(elements[i][it] - max_value[i]);
      elements[i][it] = std::exp(elements[i][it] - max_value[i]);
      sum[i] += elements[i][it];
    }
  }

// reduction sum
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it++) {
      elements[i][it] = elements[i][it] / sum[i];
      grad_reg[i][it] = grad_reg[i][it] * elements[i][it];
    }
  }

  acc_t grad_sum[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    grad_sum[i] = grad_reg[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      grad_sum[i] += grad_reg[i][it];
    }
  }
  warp_reduce_sum<acc_t, WARP_BATCH, WARP_SIZE>(grad_sum);

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // compute gradients
        output_t grad_input_reg[ELEMENTS_PER_LDG_STG];
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; element++) {
          if (is_log_softmax) {
            grad_input_reg[element] = (grad_reg[i][it + element] - std::exp(elements[i][it + element]) * grad_sum[i]);
          } else {
            grad_input_reg[element] = (grad_reg[i][it + element] - elements[i][it + element] * grad_sum[i]);
          }
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(gradInput + i * element_count + it * WARP_SIZE, grad_input_reg);
      }
    }
  }
}

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
using masked_scale_softmax_warp_backward_recompute_func = void (*)(output_t* gradInput, const input_t* grad,
                                                                   const input_t* softmax_input,
                                                                   const input_t* pad_mask, const uint8_t* mask,
                                                                   acc_t scale, int batch_size, int stride,
                                                                   int pad_batch_stride, int element_count);

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
bool masked_scale_softmax_warp_backward_recompute_kernel(
    int element_count, int log2_elements, int& warp_size, int& batches_per_warp,
    masked_scale_softmax_warp_backward_recompute_func<input_t, output_t, acc_t, is_log_softmax>& kernel) {
  // determine size of a warp
  const int next_power_of_two = 1 << log2_elements;
  warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

  // determine how many batches a warp should process.
  batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
  bool flag_vec4 = (element_count % 4 == 0);
  switch (log2_elements) {
    case 0:  // 1
      kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 2, 1, 1, 1, is_log_softmax>;
      break;
    case 1:  // 2
      kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 2, 1, 2, 1, is_log_softmax>;
      break;
    case 2:  // 4
      kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 2, 1, 4, 1, is_log_softmax>;
      break;
    case 3:  // 8
      kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 2, 1, 8, 1, is_log_softmax>;
      break;
    case 4:  // 16
      kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 2, 1, 16, 1, is_log_softmax>;
      break;
    case 5:  // 32
      kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 2, 1, 32, 1, is_log_softmax>;
      break;
    case 6:  // 64
      kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 2, 2, 32, 1, is_log_softmax>;
      break;
    case 7:  // 128
      kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 2, 4, 32, 1, is_log_softmax>;
      break;
    case 8:  // 256
      if (flag_vec4)
        kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 1, 8, 32, 4, is_log_softmax>;
      else
        kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 1, 8, 32, 1, is_log_softmax>;
      break;
    case 9:  // 512
      if (flag_vec4)
        kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 1, 16, 32, 4, is_log_softmax>;
      else
        kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 1, 16, 32, 1, is_log_softmax>;
      break;
    case 10:  // 1024
      if (flag_vec4)
        kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 1, 32, 32, 4, is_log_softmax>;
      else
        kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 1, 32, 32, 1, is_log_softmax>;
      break;
    case 11:  // 2048
      if (flag_vec4)
        kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 1, 64, 32, 4, is_log_softmax>;
      else
        kernel = &masked_scale_softmax_warp_backward_recompute<input_t, output_t, acc_t, 1, 64, 32, 1, is_log_softmax>;
      break;
    default:
      return false;
  }
  return true;
}

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
bool dispatch_masked_scale_softmax_backward_recompute(output_t* grad_input, const input_t* grad,
                                                      const input_t* softmax_input, const input_t* pad_mask,
                                                      const uint8_t* mask, acc_t scale, int softmax_elements,
                                                      int softmax_elements_stride, int pad_batch_stride,
                                                      int batch_count, cudaStream_t streamid) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 2048) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;

    masked_scale_softmax_warp_backward_recompute_func<input_t, output_t, acc_t, is_log_softmax> kernel;
    int warp_size, batches_per_warp;
    if (!masked_scale_softmax_warp_backward_recompute_kernel<input_t, output_t, acc_t, is_log_softmax>(
            softmax_elements, log2_elements, warp_size, batches_per_warp, kernel)) {
      return false;
    }

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;

    // compute launch size
    dim3 threads(warp_size, warps_per_block, 1);

    // launch
    kernel<<<blocks, threads, 0, streamid>>>(grad_input, grad, softmax_input, pad_mask, mask, scale, batch_count,
                                             softmax_elements_stride, pad_batch_stride, softmax_elements);
    return true;
  }
  return false;
}

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_masked_scale_softmax_backward_stream(output_t* grad_input, const input_t* grad, const input_t* output,
                                                   const uint8_t* mask, acc_t scale, int softmax_elements,
                                                   int softmax_elements_stride, int batch_count,
                                                   cudaStream_t streamid) {
  TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 1024);
  if (softmax_elements == 0) {
    return;
  } else {
    int log2_elements = log2_ceil_native(softmax_elements);
    const int next_power_of_two = 1 << log2_elements;
    // This value must match the WARP_SIZE constexpr value computed inside
    // softmax_warp_backward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
    // This value must match the WARP_BATCH constexpr value computed inside
    // softmax_warp_backward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 0, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 1:  // 2
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 1, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 2:  // 4
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 2, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 3:  // 8
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 3, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 4:  // 16
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 4, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 5:  // 32
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 5, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 6:  // 64
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 6, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 7:  // 128
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 7, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 8:  // 256
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 8, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 9:  // 512
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 9, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      case 10:  // 1024
        masked_scale_softmax_warp_backward<input_t, output_t, acc_t, 10, is_log_softmax>
            <<<blocks, threads, 0, streamid>>>(grad_input, grad, output, mask, scale, batch_count,
                                               softmax_elements_stride, softmax_elements);
        break;
      default:
        break;
    }
  }
}

// elementwise multiplication called in at::softmax_backward_data is fused
// inside softmax dgrad kernel as a result of fusion, intermediate
// multiplication result is stored in fp32 in registers, instead of fp16
template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax>
__global__ void softmax_warp_backward_fused_native(output_t* gradInput, const input_t* grad, const input_t* output,
                                                   int batch_size, int stride, int element_count) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
  // warp_size of method warp_softmax_backward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
  constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;

  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x % WARP_SIZE;

  // the first element to process by the current thread
  int thread_offset = first_batch * stride + local_idx;
  grad += thread_offset;
  output += thread_offset;
  gradInput += thread_offset;

  // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified
  // to one loop, but I think doing so would obfuscate the logic of the
  // algorithm, thus I chose to keep the nested loops. This should have no
  // impact on performance because the loops are unrolled anyway.

  // load data from global memory
  acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
  acc_t output_reg[WARP_BATCH][WARP_ITERATIONS];
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      int element_index = local_idx + it * WARP_SIZE;
      if (element_index < batch_element_count) {
        grad_reg[i][it] = grad[i * element_count + it * WARP_SIZE] * output[i * element_count + it * WARP_SIZE];
        output_reg[i][it] = output[i * element_count + it * WARP_SIZE];
      } else {
        grad_reg[i][it] = acc_t(0);
        output_reg[i][it] = acc_t(0);
      }
    }
  }

  acc_t sum[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    sum[i] = grad_reg[i][0];  //* output_reg[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      sum[i] += grad_reg[i][it];  // * output_reg[i][it];
    }
  }
  warp_reduce_sum<acc_t, WARP_BATCH, WARP_SIZE>(sum);

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      int element_index = local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // compute gradients
        if (is_log_softmax) {
          gradInput[i * element_count + it * WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]);
        } else {
          gradInput[i * element_count + it * WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]);
        }
      }
    }
  }
}

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_softmax_backward_fused_native(output_t* grad_input, const input_t* grad, const input_t* output,
                                            int softmax_elements, int softmax_elements_stride, int batch_count) {
  TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 1024);
  if (softmax_elements == 0) {
    return;
  } else {
    int log2_elements = log2_ceil_native(softmax_elements);
    const int next_power_of_two = 1 << log2_elements;

    // This value must match the WARP_SIZE constexpr value computed inside
    // softmax_warp_backward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside
    // softmax_warp_backward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 0, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 1:  // 2
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 1, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 2:  // 4
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 2, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 3:  // 8
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 3, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 4:  // 16
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 4, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 5:  // 32
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 5, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 6:  // 64
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 6, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 7:  // 128
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 7, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 8:  // 256
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 8, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 9:  // 512
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 9, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 10:  // 1024
        softmax_warp_backward_fused_native<input_t, output_t, acc_t, 10, is_log_softmax>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      default:
        break;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Warp softmax backward
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE = 32,
          int ELEMENTS_PER_LDG_STG = 1>
__global__ void softmax_warp_backward(__half* gradInput, const __half* grad, const __half* output, int batch_size,
                                      int stride, int element_count) {
  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x;

  // the first element to process by the current thread
  int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  grad += thread_offset;
  output += thread_offset;
  gradInput += thread_offset;

  // load data from global memory
  input_t grad_reg_input[WARP_BATCH][WARP_ITERATIONS] = {0.0f};
  input_t output_reg_input[WARP_BATCH][WARP_ITERATIONS] = {0.0f};
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < batch_element_count) {
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&grad_reg_input[i][it], grad + i * element_count + it * WARP_SIZE);
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&output_reg_input[i][it],
                                                   output + i * element_count + it * WARP_SIZE);
      }
    }
  }

  // convert half to floating point
  acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
  acc_t output_reg[WARP_BATCH][WARP_ITERATIONS];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      grad_reg[i][it] = grad_reg_input[i][it];
      output_reg[i][it] = output_reg_input[i][it];
    }
  }

  // compute thread local sum
  acc_t sum[WARP_BATCH] = {0};
#pragma unroll
  for (int it = 0; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += grad_reg[i][it] * output_reg[i][it];
    }
  }

  // reduction sum
  constexpr uint32_t FULL_MASK = 0xffffffff;
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // compute gradients
        output_t out[ELEMENTS_PER_LDG_STG];
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = (output_reg[i][it + element] * (grad_reg[i][it + element] - sum[i]));
        }
        // store them in global memory
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(gradInput + i * element_count + it * WARP_SIZE, out);
      }
    }
  }
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t>
using softmax_backward_func = void (*)(output_t* gradInput, const input_t* grad, const input_t* output, int batch_size,
                                       int stride, int element_count);

template <typename input_t, typename output_t, typename acc_t>
bool warp_softmax_backward_kernel(int log2_elements, int& warp_size, int& batches_per_warp,
                                  softmax_backward_func<input_t, output_t>& kernel) {
  // determine size of a warp
  const int next_power_of_two = 1 << log2_elements;
  warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

  // determine how many batches a warp should process.
  batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

  switch (log2_elements) {
    case 0:  // 1
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 1, 1>;
      break;
    case 1:  // 2
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 2, 1>;
      break;
    case 2:  // 4
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 4, 1>;
      break;
    case 3:  // 8
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 8, 1>;
      break;
    case 4:  // 16
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 16, 1>;
      break;
    case 5:  // 32
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 32, 1>;
      break;
    case 6:  // 64
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 2, 2, 32, 1>;
      break;
    case 7:  // 128
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 2, 4, 32, 1>;
      break;
    case 8:  // 256
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 1, 8, 32, 1>;
      break;
    case 9:  // 512
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 1, 16, 32, 1>;
      break;
    case 10:  // 1024
      kernel = &softmax_warp_backward<input_t, output_t, acc_t, 1, 32, 32, 1>;
      break;
    default:
      return false;
  }
  return true;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements,
                               int softmax_elements_stride, int batch_count) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 1024) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;

    softmax_backward_func<input_t, output_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_softmax_backward_kernel<input_t, output_t, acc_t>(log2_elements, warp_size, batches_per_warp, kernel)) {
      return false;
    }

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);

    // compute launch size
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);

    // launch
    kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, batch_count,
                                                                     softmax_elements_stride, softmax_elements);
    return true;
  }
  return false;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_softmax_backward_stream(output_t* grad_input, const input_t* grad, const input_t* output,
                                      int softmax_elements, int softmax_elements_stride, int batch_count,
                                      cudaStream_t streamid) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 1024) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;
    softmax_backward_func<input_t, output_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_softmax_backward_kernel<input_t, output_t, acc_t>(log2_elements, warp_size, batches_per_warp, kernel)) {
      return false;
    }
    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);
    // compute launch size
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);
    // launch
    kernel<<<blocks, threads, 0, streamid>>>(grad_input, grad, output, batch_count, softmax_elements_stride,
                                             softmax_elements);
    return true;
  }
  return false;
}

template <typename input_t, typename output_t, typename acc_t, int WARP_BATCH, int WARP_ITERATIONS, int WARP_SIZE = 32,
          int ELEMENTS_PER_LDG_STG = 1>
__global__ void masked_softmax_warp_backward(__half* gradInput, const __half* grad, const __half* output,
                                             const uint8_t* pad_mask, int batch_size, int stride, int element_count,
                                             int pad_batch_stride) {
  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the
  // batch
  int local_idx = threadIdx.x;

  // the first element to process by the current thread
  int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  grad += thread_offset;
  output += thread_offset;
  gradInput += thread_offset;

  // load data from global memory
  input_t grad_reg_input[WARP_BATCH][WARP_ITERATIONS] = {0.0f};
  input_t output_reg_input[WARP_BATCH][WARP_ITERATIONS] = {0.0f};
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < batch_element_count) {
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&grad_reg_input[i][it], grad + i * element_count + it * WARP_SIZE);
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(&output_reg_input[i][it],
                                                   output + i * element_count + it * WARP_SIZE);
      }
    }
  }

  // convert half to floating point
  acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
  acc_t output_reg[WARP_BATCH][WARP_ITERATIONS];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      grad_reg[i][it] = grad_reg_input[i][it];
      output_reg[i][it] = output_reg_input[i][it];
    }
  }

  // compute thread local sum
  acc_t sum[WARP_BATCH] = {0};
#pragma unroll
  for (int it = 0; it < WARP_ITERATIONS; ++it) {
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += grad_reg[i][it] * output_reg[i][it];
    }
  }

  // reduction sum
  constexpr uint32_t FULL_MASK = 0xffffffff;
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      sum[i] += __shfl_xor_sync(FULL_MASK, sum[i], offset, WARP_SIZE);
    }
  }

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
    int pad_thread_offset = ((first_batch + i) / pad_batch_stride) * stride + ELEMENTS_PER_LDG_STG * local_idx;
    const uint8_t* curr_mask = pad_mask + pad_thread_offset;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // compute gradients
        output_t out[ELEMENTS_PER_LDG_STG];
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = (output_reg[i][it + element] * (grad_reg[i][it + element] - sum[i]));
        }
        // store them in global memory
        int itr_jmp = it * WARP_SIZE;
        int itr_idx = i * element_count + itr_jmp;
        // It is kind of unfortunate this has to be here to zero something out
        // that is close to zero in the first place
        apply_mask<input_t, ELEMENTS_PER_LDG_STG>(&out[0], 0.0, curr_mask + itr_jmp);
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(gradInput + itr_idx, out);
      }
    }
  }
}

// WARP_BATCH number of batches.
// WARP_ITERATOINS The number of iterations required for one warp to iterate
// over all data. WARP_SIZE number of elements working on a single batch, has to
// be a power of two. ELEMENTS_PER_LDG_STG has to be 1.
template <typename input_t, typename output_t>
using masked_softmax_backward_func = void (*)(output_t* gradInput, const input_t* grad, const input_t* output,
                                              const uint8_t* pad_mask, int batch_size, int stride, int element_count,
                                              int pad_batch_stride);

template <typename input_t, typename output_t, typename acc_t>
bool warp_masked_softmax_backward_kernel(int log2_elements, int& warp_size, int& batches_per_warp,
                                         masked_softmax_backward_func<input_t, output_t>& kernel) {
  // determine size of a warp
  const int next_power_of_two = 1 << log2_elements;
  warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

  // determine how many batches a warp should process.
  batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

  switch (log2_elements) {
    case 0:  // 1
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 1, 1>;
      break;
    case 1:  // 2
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 2, 1>;
      break;
    case 2:  // 4
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 4, 1>;
      break;
    case 3:  // 8
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 8, 1>;
      break;
    case 4:  // 16
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 16, 1>;
      break;
    case 5:  // 32
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 2, 1, 32, 1>;
      break;
    case 6:  // 64
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 2, 2, 32, 1>;
      break;
    case 7:  // 128
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 2, 4, 32, 1>;
      break;
    case 8:  // 256
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 1, 8, 32, 1>;
      break;
    case 9:  // 512
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 1, 16, 32, 1>;
      break;
    case 10:  // 1024
      kernel = &masked_softmax_warp_backward<input_t, output_t, acc_t, 1, 32, 32, 1>;
      break;
    default:
      return false;
  }
  return true;
}

template <typename input_t, typename output_t, typename acc_t>
bool dispatch_masked_softmax_backward(output_t* grad_input, const input_t* grad, const input_t* output,
                                      const uint8_t* pad_mask, int softmax_elements, int softmax_elements_stride,
                                      int batch_count, int pad_batch_stride) {
  if (softmax_elements == 0) {
    return true;
  } else if (softmax_elements <= 1024) {
    // compute function index. there's a function for each power of two size up
    // to 1024.
    int log2_elements = 0;
    while ((1 << log2_elements) < softmax_elements) ++log2_elements;

    masked_softmax_backward_func<input_t, output_t> kernel;
    int warp_size, batches_per_warp;
    if (!warp_masked_softmax_backward_kernel<input_t, output_t, acc_t>(log2_elements, warp_size, batches_per_warp,
                                                                       kernel)) {
      return false;
    }

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    // compute warps per block.
    int warps_per_block = (threads_per_block / warp_size);

    // compute launch size
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);

    // launch
    kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        grad_input, grad, output, pad_mask, batch_count, softmax_elements_stride, softmax_elements, pad_batch_stride);
    return true;
  }
  return false;
}
}  // namespace


================================================
FILE: apex/contrib/csrc/multihead_attn/strided_batched_gemm.cuh
================================================
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>

#include <iostream>
#include <vector>

// #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <cutlass/cutlass.h>
#include <cutlass/fast_math.h>
#include <cutlass/gemm/device/gemm_batched.h>
#include <cutlass/gemm/gemm.h>
#include <cutlass/layout/matrix.h>
#include <cutlass/matrix_coord.h>
#include <cutlass/pitch_linear_coord.h>

namespace {
cublasOperation_t convertTransToCublasOperation(char trans) {
  if (trans == 't')
    return CUBLAS_OP_T;
  else if (trans == 'n')
    return CUBLAS_OP_N;
  else if (trans == 'c')
    return CUBLAS_OP_C;
  else {
    TORCH_CHECK(false, "trans must be one of: t, n, c");
    return CUBLAS_OP_T;
  }
}

void CublasStridedBatchedGemm(char transa, char transb, long m, long n, long k, float alpha, const half* a, long lda,
                              long strideA, const half* b, long ldb, long strideB, float beta, half* c, long ldc,
                              long strideC, long batchCount, cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP) {
  cublasOperation_t opa = convertTransToCublasOperation(transa);
  cublasOperation_t opb = convertTransToCublasOperation(transb);

  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cublasSetStream(handle, stream);
  float fAlpha = alpha;
  float fBeta = beta;
  TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(
      handle, opa, opb, (int)m, (int)n, (int)k, (void*)&fAlpha, a, CUDA_R_16F, (int)lda, strideA, b, CUDA_R_16F,
      (int)ldb, strideB, (void*)&fBeta, c, CUDA_R_16F, (int)ldc, strideC, (int)batchCount, CUDA_R_32F, algo));
}

}  // namespace

// TODO(mkozuki): Make use of the int template parameters or discard them.
template <typename LayoutA, typename LayoutB, int SRC_A, int SRC_B, int DST_C>
void CutlassGemm_FP32Accum(cudaStream_t stream, long m, long n, long k, float alpha, const half* a, long lda,
                           long long int batch_stride_A, const half* b, long ldb, long long int batch_stride_B,
                           float beta, half* c, long ldc, long long int batch_stride_C, long batch_count) {
  using Gemm = cutlass::gemm::device::GemmBatched<
      /* Element type of A matrix */ half, /* Layout of A matrix */ LayoutA,
      /* Element type of B matrix */ half, /* Layout of B matrix */ LayoutB,
      /* Element type of C matrix */ half, /* Layout of C matrix */ cutlass::layout::ColumnMajor,
      /* Element Accumulator*/ float>;
  Gemm gemm_op;
  cutlass::Status status = gemm_op({{static_cast<int>(m), static_cast<int>(n), static_cast<int>(k)},
                                    {a, lda},
                                    batch_stride_A,
                                    {b, ldb},
                                    batch_stride_B,
                                    {c, ldc},
                                    batch_stride_C,
                                    {c, ldc},
                                    batch_stride_C,
                                    {alpha, beta},
                                    static_cast<int>(batch_count)},
                                   nullptr, stream);
  C10_CUDA_CHECK(status != cutlass::Status::kSuccess ? cudaErrorUnknown : cudaSuccess);
}

namespace {
void gemm_switch_fp32accum(char transa, char transb, long m, long n, long k, float alpha, const half* a, long lda,
                           long strideA, const half* b, long ldb, long strideB, float beta, half* c, long ldc,
                           long strideC, long batchCount) {
  auto stream = c10::cuda::getCurrentCUDAStream();
  // printf("GEMM   -> %c%c M: %i N: %i K: %i Alpha: %f Beta: %f\n", (transa ==
  // 't' ? 'T' : 'N'), (transb =='t' ? 'T' : 'N'), m, n, k, alpha, beta);
  if ((transa == 't') && (transb == 'n')) {
    if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CublasStridedBatchedGemm(transa, transb, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC,
                               batchCount, CUBLAS_GEMM_ALGO0_TENSOR_OP);
    } else if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 8, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 8, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 8, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 8, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 8, 4, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 8, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 8, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 8, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 8, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x3) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 4, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 4, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 8, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 4, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::RowMajor, cutlass::layout::ColumnMajor, 2, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else {
      CublasStridedBatchedGemm(transa, transb, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC,
                               batchCount);
    }
  } else if ((transa == 'n') && (transb == 'n')) {
    if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CublasStridedBatchedGemm(transa, transb, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC,
                               batchCount, CUBLAS_GEMM_ALGO0_TENSOR_OP);
    } else if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 8, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 8, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 8, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 8, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 8, 4, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 8, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 8, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 8, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 8, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x3) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 4, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 4, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 8, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 4, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::ColumnMajor, 2, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else {
      CublasStridedBatchedGemm(transa, transb, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC,
                               batchCount);
    }
  } else if ((transa == 'n') && (transb == 't')) {
    if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CublasStridedBatchedGemm(transa, transb, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC,
                               batchCount, CUBLAS_GEMM_ALGO0_TENSOR_OP);
    } else if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 8, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 8, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 8, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 8, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x3) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 8, 4, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 8, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 8, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x7) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 8, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 4, 8, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 4, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 4, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 4, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 4, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 4, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 4, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x3) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 4, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 8, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 8, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x7) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 8, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 4, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 4, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x3) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 4, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x7)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 2, 8>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x3)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 2, 4>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else if (!(lda & 0x1) && !(ldb & 0x1) && !(ldc & 0x1)) {
      CutlassGemm_FP32Accum<cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, 2, 2, 2>(
          stream, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount);
    } else {
      CublasStridedBatchedGemm(transa, transb, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC,
                               batchCount);
    }
  } else {
    TORCH_CHECK(false, "TransA and TransB are invalid");
  }
}

void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t* lda, int64_t* ldb,
                    int64_t* ldc) {
  int transa_ = ((transa == 't') || (transa == 'T'));
  int transb_ = ((transb == 't') || (transb == 'T'));

  // Note: leading dimensions generally are checked that they are > 0 and at
  // least as big the result requires (even if the value won't be used).
  if (n <= 1) *ldc = std::max<int64_t>(m, 1);

  if (transa_) {
    if (m <= 1) *lda = std::max<int64_t>(k, 1);
  } else {
    if (k <= 1) *lda = std::max<int64_t>(m, 1);
  }

  if (transb_) {
    if (k <= 1) *ldb = std::max<int64_t>(n, 1);
  } else {
    if (n <= 1) *ldb = std::max<int64_t>(k, 1);
  }
}

void HgemmStridedBatched(char transa, char transb, long m, long n, long k, float alpha, const half* a, long lda,
                         long strideA, const half* b, long ldb, long strideB, float beta, half* c, long ldc,
                         long strideC, long batchCount) {
  if ((m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) ||
      (batchCount >= INT_MAX))

  {
    TORCH_CHECK(false,
                "Cublas_SgemmStridedBatched only supports m, n, k, lda, ldb, ldc, "
                "batchCount"
                "with the bound [val] <= %d",
                INT_MAX);
  }

  adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc);

  gemm_switch_fp32accum(transa, transb, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC,
                        batchCount);
}

}  // namespace


================================================
FILE: apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp
================================================
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/Exception.h>
#include <nccl.h>
#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
#include <torch/extension.h>

#define NCCL_CHECK(cmd)                                                                                     \
  do {                                                                                                      \
    ncclResult_t result = cmd;                                                                              \
    if (result != ncclSuccess) {                                                                            \
      std::string err = "NCCL error in: " + std::string(__FILE__) + ":" + std::to_string(__LINE__) + ", " + \
                        std::string(ncclGetErrorString(result));                                            \
      TORCH_CHECK(false, err);                                                                              \
    }                                                                                                       \
  } while (0)

void* nccl_alloc_plug(size_t size, int device, void* stream) {
  void* ptr;
  NCCL_CHECK(ncclMemAlloc(&ptr, size));
  return ptr;
}

void nccl_free_plug(void* ptr, std::size_t size, int device, void* stream) { NCCL_CHECK(ncclMemFree(ptr)); }

std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator> nccl_allocator;

void maybe_init() {
  if (!nccl_allocator) {
    nccl_allocator =
        std::make_shared<torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator>(nccl_alloc_plug, nccl_free_plug);
  }
}

std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator> get_nccl_allocator() {
  maybe_init();
  return nccl_allocator;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_nccl_allocator", []() { return get_nccl_allocator(); });
};


================================================
FILE: apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp
================================================
/**
 * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nccl_p2p_cuda.cuh"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_unique_nccl_id", &apex::contrib::nccl_p2p::get_unique_nccl_id, "get_unique_nccl_id",
        py::call_guard<py::gil_scoped_release>());
  m.def("init_nccl_comm", &apex::contrib::nccl_p2p::init_nccl_comm, "init_nccl_comm",
        py::call_guard<py::gil_scoped_release>());
  m.def("left_right_halo_exchange_inplace", &apex::contrib::nccl_p2p::left_right_halo_exchange_inplace,
        "left_right_halo_exchange_inplace", py::call_guard<py::gil_scoped_release>());
  m.def("left_right_halo_exchange", &apex::contrib::nccl_p2p::left_right_halo_exchange, "left_right_halo_exchange",
        py::call_guard<py::gil_scoped_release>());
  m.def("add_delay", &apex::contrib::nccl_p2p::add_delay, "add_delay", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu
================================================
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <torch/extension.h>

#include <cassert>
#include <cstdio>
#include <ctime>
#include <list>

#include "nccl.h"

/*
 * This file implements a crude but effective mechanism for copying data between tenors owned by different ranks
 * on the same machine using cudaMemcpyAsync peer-to-peer transfers.
 */

namespace {

__global__ void AddDelay_kernel(const int delay, int* counter) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    // waste time while doing something compiler can't predict, thus preventing it from optimizing away this code.
    int new_counter = 0;
    double elapsed = 0;
    clock_t start = clock();
    do {
      clock_t now = clock();
      elapsed = (double)(now - start) * 1e9 / CLOCKS_PER_SEC;
      ++new_counter;
    } while (elapsed < (double)delay);
    *counter = new_counter;
  }
}

class NcclCommWrapper {
 private:
  ncclComm_t comm;
  int rank, world_size;

  ncclDataType_t get_nccl_type(at::Tensor input) {
    switch (input.scalar_type()) {
      case at::ScalarType::Half:
        return ncclFloat16;
      case at::ScalarType::Float:
        return ncclFloat32;
      case at::ScalarType::Double:
        return ncclFloat64;
      case at::ScalarType::Byte:
        return ncclUint8;
      case at::ScalarType::Char:
        return ncclInt8;
      case at::ScalarType::Int:
        return ncclInt32;
      case at::ScalarType::Long:
        return ncclInt64;
      case at::ScalarType::BFloat16:
        return ncclBfloat16;
      default:
        assert(false);
    }
  }

 public:
  NcclCommWrapper() {
    memset(&comm, 0, sizeof(ncclComm_t));
    rank = 0;
    world_size = 0;
  }
  NcclCommWrapper(ncclUniqueId id, int my_rank, int num_ranks) {
    ncclCommInitRank(&comm, num_ranks, id, my_rank);
    rank = my_rank;
    world_size = num_ranks;
  }

  ~NcclCommWrapper() {
    printf("ncclCommDestroy()\n");
    ncclCommDestroy(comm);
  }

  void left_right_halo_exchange_inplace(int left_rank, int right_rank, at::Tensor left_output_halo,
                                        at::Tensor right_output_halo, at::Tensor left_input_halo,
                                        at::Tensor right_input_halo) {
    auto stream = at::cuda::getCurrentCUDAStream();
    ncclGroupStart();
    ncclDataType_t ncclType = get_nccl_type(left_output_halo);
    bool left_zero = (left_rank < 0);
    bool right_zero = (right_rank < 0);
    size_t left_n = torch::numel(left_output_halo);
    size_t right_n = torch::numel(right_output_halo);
    assert(left_n > 0 && left_n == right_n);
    if (left_zero) {
      left_input_halo.zero_();
    } else {
      AT_DISPATCH_ALL_TYPES_AND3(
          at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, left_output_halo.scalar_type(),
          "left_halo_exch", [&]() {
            // send left (to my_rank - 1)
            ncclSend(left_output_halo.data_ptr<scalar_t>(), left_n, ncclType, left_rank, comm, stream);
            // receive left (from my_rank - 1)
            ncclRecv(left_input_halo.data_ptr<scalar_t>(), right_n, ncclType, left_rank, comm, stream);
          });
    }
    if (right_zero) {
      right_input_halo.zero_();
    } else {
      AT_DISPATCH_ALL_TYPES_AND3(
          at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, right_output_halo.scalar_type(),
          "right_halo_exch", [&]() {
            // send right (to my_rank + 1 )
            ncclSend(right_output_halo.data_ptr<scalar_t>(), right_n, ncclType, right_rank, comm, stream);
            // receive right (from my_rank + 1)
            ncclRecv(right_input_halo.data_ptr<scalar_t>(), left_n, ncclType, right_rank, comm, stream);
          });
    }
    ncclGroupEnd();
  }

  std::vector<at::Tensor> left_right_halo_exchange(int left_rank, int right_rank, at::Tensor left_output_halo,
                                                   at::Tensor right_output_halo) {
    // after halo exchange:
    // left_output_halo of rank+1 ends up in right_input_halo of rank
    // right_output_halo of rank-1 ends up in left_input_halo of rank
    auto right_input_halo = torch::empty_like(left_output_halo);
    auto left_input_halo = torch::empty_like(right_output_halo);
    left_right_halo_exchange_inplace(left_rank, right_rank, left_output_halo, right_output_halo, left_input_halo,
                                     right_input_halo);
    return {left_input_halo, right_input_halo};
  }
};

class ManagedObjects {
 public:
  ManagedObjects() {}
  ~ManagedObjects() {
    for (auto it = _nccl_comms.begin(); it != _nccl_comms.end(); ++it) {
      delete *it;
    }
  }

  int add_comm(NcclCommWrapper* comm) {
    int handle = _nccl_comms.size();
    _nccl_comms.push_back(comm);
    return handle;
  }

  NcclCommWrapper& get_comm(int handle) {
    assert(handle >= 0 && handle < _nccl_comms.size());
    return *_nccl_comms[handle];
  }

 private:
  std::vector<NcclCommWrapper*> _nccl_comms;
};
class ManagedObjects mo;

}  // end anonymous namespace

namespace apex {
namespace contrib {
namespace nccl_p2p {

at::Tensor get_unique_nccl_id(int n) {
  ncclUniqueId id;
  ncclGetUniqueId(&id);
  auto id_tensor = torch::empty({n, (int)sizeof(ncclUniqueId)},
                                torch::dtype(torch::kUInt8).device(torch::kCPU).requires_grad(false));
  auto id_ptr = id_tensor.data_ptr<uint8_t>();
  size_t offset = 0;
  for (int i = 0; i < n; ++i) {
    ncclUniqueId id;
    ncclGetUniqueId(&id);
    memcpy(id_ptr + offset, &id, sizeof(ncclUniqueId));
    offset += sizeof(ncclUniqueId);
  }
  return id_tensor;
}

int init_nccl_comm(at::Tensor unique_nccl_id, int my_rank, int num_ranks) {
  ncclUniqueId id;
  auto unique_nccl_id_ptr = unique_nccl_id.data_ptr<uint8_t>();
  memcpy(&id, unique_nccl_id_ptr, sizeof(ncclUniqueId));
  NcclCommWrapper* comm = new NcclCommWrapper(id, my_rank, num_ranks);
  int handle = mo.add_comm(comm);
  comm = 0L;
  return handle;
}

void left_right_halo_exchange_inplace(int handle, int left_rank, int right_rank, at::Tensor left_output_halo,
                                      at::Tensor right_output_halo, at::Tensor left_input_halo,
                                      at::Tensor right_input_halo) {
  class NcclCommWrapper& communicator = mo.get_comm(handle);
  return communicator.left_right_halo_exchange_inplace(left_rank, right_rank, left_output_halo, right_output_halo,
                                                       left_input_halo, right_input_halo);
}

std::vector<at::Tensor> left_right_halo_exchange(int handle, int left_rank, int right_rank, at::Tensor left_output_halo,
                                                 at::Tensor right_output_halo) {
  class NcclCommWrapper& communicator = mo.get_comm(handle);
  return communicator.left_right_halo_exchange(left_rank, right_rank, left_output_halo, right_output_halo);
}

void add_delay(int delay) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto t = torch::empty({1}, torch::dtype(torch::kInt32).device(torch::kCUDA));
  AddDelay_kernel<<<1, 1, 0, stream>>>(delay, t.data_ptr<int>());
}

}  // namespace nccl_p2p
}  // namespace contrib
}  // namespace apex


================================================
FILE: apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cuh
================================================
/**
 * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include <torch/extension.h>
#ifndef _nccl_p2p_h_
#define _nccl_p2p_h_

namespace apex {
namespace contrib {
namespace nccl_p2p {
at::Tensor get_unique_nccl_id(int n);
int init_nccl_comm(at::Tensor unique_nccl_id, int my_rank, int num_ranks);
void left_right_halo_exchange_inplace(int handle, int left_rank, int right_rank, at::Tensor left_output_halo,
                                      at::Tensor right_output_halo, at::Tensor left_input_halo,
                                      at::Tensor right_input_halo);
std::vector<at::Tensor> left_right_halo_exchange(int handle, int left_rank, int right_rank, at::Tensor left_output_halo,
                                                 at::Tensor right_output_halo);
void add_delay(int delay);
}  // namespace nccl_p2p
}  // namespace contrib
}  // namespace apex
#endif


================================================
FILE: apex/contrib/csrc/nccl_p2p/nccl_version.cpp
================================================
// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
// This file is used to check the version of NCCL detected.
#include <torch/extension.h>

#include <tuple>

std::tuple<int, int> get_nccl_version();

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("get_nccl_version", &get_nccl_version); }


================================================
FILE: apex/contrib/csrc/nccl_p2p/nccl_version_check.cu
================================================
// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

// This file is used to check the version of NCCL detected.
#include <nccl.h>

#include <tuple>

std::tuple<int, int> get_nccl_version() { return {int(NCCL_MAJOR), int(NCCL_MINOR)}; }


================================================
FILE: apex/contrib/csrc/optimizers/fused_adam_cuda.cpp
================================================
#include <torch/extension.h>

// CUDA forward declaration
void fused_strided_check_finite(at::Tensor& overflow_flag, at::Tensor& p_copy, int stride, int clear_overflow_first);

void fused_adam_cuda(at::Tensor& p, at::Tensor& p_copy, at::Tensor& m, at::Tensor& v, at::Tensor& g, float lr,
                     float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction,
                     float decay);
void fused_reversible_adam_cuda(at::Tensor& p, at::Tensor& p_copy, at::Tensor& m, at::Tensor& v, at::Tensor& g,
                                float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode,
                                int bias_correction, float decay);
void fused_maybe_adam_undo_cuda(at::Tensor& overflow_flag, at::Tensor& p, at::Tensor& m, at::Tensor& v, at::Tensor& g,
                                float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode,
                                int bias_correction, float decay);

void fused_adam_cuda_mt(int chunk_size, at::Tensor overflow_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                        float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode,
                        int bias_correction, float decay);

void maybe_cast_cuda(at::Tensor& overflow_flag, at::Tensor& p_in, at::Tensor& p_out);
void maybe_cast_cuda_mt(int chunk_size, at::Tensor overflow_flag, std::vector<std::vector<at::Tensor>> tensor_lists);

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

// C++ interface
void strided_check_finite(at::Tensor& overflow_flag, at::Tensor& p_copy, int stride, int clear_overflow_first) {
  CHECK_INPUT(p_copy);
  fused_strided_check_finite(overflow_flag, p_copy, stride, clear_overflow_first);
}
void adam(at::Tensor& p, at::Tensor& p_copy, at::Tensor& m, at::Tensor& v, at::Tensor& g, float lr, float beta1,
          float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay) {
  CHECK_INPUT(p);
  if (p_copy.numel() > 0) CHECK_INPUT(p_copy);
  CHECK_INPUT(m);
  CHECK_INPUT(v);
  CHECK_INPUT(g);
  int64_t num_elem = p.numel();
  TORCH_CHECK(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
  TORCH_CHECK(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
  TORCH_CHECK(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
  TORCH_CHECK(p_copy.numel() == num_elem || p_copy.numel() == 0,
              "number of elements in p_copy and p tensors should be equal, or p_copy should be empty");

  fused_adam_cuda(p, p_copy, m, v, g, lr, beta1, beta2, eps, grad_scale, step, mode, bias_correction, decay);
}
void reversible_adam(at::Tensor& p, at::Tensor& p_copy, at::Tensor& m, at::Tensor& v, at::Tensor& g, float lr,
                     float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction,
                     float decay) {
  CHECK_INPUT(p);
  if (p_copy.numel() > 0) CHECK_INPUT(p_copy);
  CHECK_INPUT(m);
  CHECK_INPUT(v);
  CHECK_INPUT(g);
  int64_t num_elem = p.numel();
  TORCH_CHECK(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
  TORCH_CHECK(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
  TORCH_CHECK(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
  TORCH_CHECK(p_copy.numel() == num_elem || p_copy.numel() == 0,
              "number of elements in p_copy and p tensors should be equal, or p_copy should be empty");

  fused_reversible_adam_cuda(p, p_copy, m, v, g, lr, beta1, beta2, eps, grad_scale, step, mode, bias_correction, decay);
}
void maybe_adam_undo(at::Tensor& overflow_flag, at::Tensor& p, at::Tensor& m, at::Tensor& v, at::Tensor& g, float lr,
                     float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction,
                     float decay) {
  CHECK_INPUT(p);
  CHECK_INPUT(m);
  CHECK_INPUT(v);
  CHECK_INPUT(g);
  int64_t num_elem = p.numel();
  TORCH_CHECK(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
  TORCH_CHECK(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
  TORCH_CHECK(g.numel() == num_elem, "number of elements in g and p tensors should be equal");

  fused_maybe_adam_undo_cuda(overflow_flag, p, m, v, g, lr, beta1, beta2, eps, grad_scale, step, mode, bias_correction,
                             decay);
}
void maybe_cast(at::Tensor& overflow_flag, at::Tensor& p_in, at::Tensor& p_out) {
  CHECK_INPUT(p_in);
  CHECK_INPUT(p_out);
  int64_t num_elem = p_in.numel();
  TORCH_CHECK(p_out.numel() == num_elem, "number of elements in p_in and p_out should be equal");

  maybe_cast_cuda(overflow_flag, p_in, p_out);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("strided_check_finite", &strided_check_finite, "Strided finite check.",
        py::call_guard<py::gil_scoped_release>());
  m.def("adam", &adam, "Adam optimized CUDA implementation.", py::call_guard<py::gil_scoped_release>());
  m.def("reversible_adam", &reversible_adam, "Reversible Adam optimized CUDA implementation.",
        py::call_guard<py::gil_scoped_release>());
  m.def("adam_mt", &fused_adam_cuda_mt, "Multi tensor Adam optimized CUDA implementation.",
        py::call_guard<py::gil_scoped_release>());
  m.def("maybe_adam_undo", &maybe_adam_undo, "Undo function for Adam optimized CUDA implementation.",
        py::call_guard<py::gil_scoped_release>());
  m.def("maybe_cast", &maybe_cast, "Unpack byte tensor containing e5m2 floats.",
        py::call_guard<py::gil_scoped_release>());
  m.def("maybe_cast_mt", &maybe_cast_cuda_mt, "Unpack byte tensor containing e5m2 floats.",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu
================================================
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

#include <cmath>

#include "ATen/ATen.h"
#include "ATen/TensorUtils.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/detail/IndexUtils.cuh"
// #include "ATen/Type.h"
#include "ATen/AccumulateType.h"
#include "multi_tensor_apply.cuh"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

#include "type_shim.h"

typedef enum {
  ADAM_MODE_0 = 0,  // eps under square root
  ADAM_MODE_1 = 1   // eps outside square root
} adamMode_t;

template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(T* __restrict__ p,
                                 GRAD_T* __restrict__ p_copy,  // For mixed precision training, pass NULL if not needed
                                 T* __restrict__ m, T* __restrict__ v, const GRAD_T* __restrict__ g, const float b1,
                                 const float b2, const float eps, const float grad_scale, const float step_size,
                                 const size_t tsize, adamMode_t mode, const float decay) {
  // Assuming 2D grids and 2D blocks
  const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
  const int threadsPerBlock = blockDim.x * blockDim.y;
  const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
  const int i = (blockId * threadsPerBlock + threadIdInBlock);
  const int totThreads = gridDim.x * gridDim.y * threadsPerBlock;

  for (int j = i; j < tsize; j += totThreads) {
    T scaled_grad = g[j] / grad_scale;
    m[j] = b1 * m[j] + (1 - b1) * scaled_grad;
    v[j] = b2 * v[j] + (1 - b2) * scaled_grad * scaled_grad;
    float denom;
    if (mode == ADAM_MODE_0)
      denom = sqrtf(v[j] + eps);
    else  // Mode 1
      denom = sqrtf(v[j]) + eps;
    float update = (m[j] / denom) + (decay * p[j]);
    p[j] = p[j] - (step_size * update);
    if (p_copy != NULL) p_copy[j] = (GRAD_T)p[j];
  }
}

template <int DEPTH, typename T, typename GRAD_T>
struct AdamFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<DEPTH>& tl,
                                             const float b1, const float b2, const float eps, const float grad_scale,
                                             const float step_size, adamMode_t mode, const float decay) {
    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    T* p = (T*)tl.addresses[0][tensor_loc];
    p += chunk_idx * chunk_size;
    T* m = (T*)tl.addresses[1][tensor_loc];
    m += chunk_idx * chunk_size;
    T* v = (T*)tl.addresses[2][tensor_loc];
    v += chunk_idx * chunk_size;
    GRAD_T* g = (GRAD_T*)tl.addresses[3][tensor_loc];
    g += chunk_idx * chunk_size;
    GRAD_T* p_copy = NULL;
    if (DEPTH == 5) {
      p_copy = (GRAD_T*)tl.addresses[4][tensor_loc];
      p_copy += chunk_idx * chunk_size;
    }

    n -= chunk_idx * chunk_size;

    T incoming_p[ILP];
    T incoming_m[ILP];
    T incoming_v[ILP];
    T incoming_g[ILP];

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(m) && is_aligned(v) && is_aligned(g) &&
        is_aligned(p_copy)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        GRAD_T tmp_g[ILP];
        load_store(incoming_p, p, 0, i_start);
        load_store(incoming_m, m, 0, i_start);
        load_store(incoming_v, v, 0, i_start);
        load_store(tmp_g, g, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          incoming_g[ii] = static_cast<T>(tmp_g[ii]);
          T scaled_grad = incoming_g[ii] / grad_scale;
          incoming_m[ii] = b1 * incoming_m[ii] + (1 - b1) * scaled_grad;
          incoming_v[ii] = b2 * incoming_v[ii] + (1 - b2) * scaled_grad * scaled_grad;
          float denom;
          if (mode == ADAM_MODE_0)
            denom = sqrtf(incoming_v[ii] + eps);
          else  // Mode 1
            denom = sqrtf(incoming_v[ii]) + eps;
          float update = (incoming_m[ii] / denom) + (decay * incoming_p[ii]);
          incoming_p[ii] = incoming_p[ii] - (step_size * update);
          if (DEPTH == 5) tmp_g[ii] = static_cast<GRAD_T>(incoming_p[ii]);
        }
        load_store(p, incoming_p, i_start, 0);
        load_store(m, incoming_m, i_start, 0);
        load_store(v, incoming_v, i_start, 0);
        if (DEPTH == 5) load_store(p_copy, tmp_g, i_start, 0);
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          incoming_p[ii] = 0;
          incoming_m[ii] = 0;
          incoming_v[ii] = 0;
          incoming_g[ii] = 0;

          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            incoming_p[ii] = p[i];
            incoming_m[ii] = m[i];
            incoming_v[ii] = v[i];
            incoming_g[ii] = static_cast<T>(g[i]);
          }
        }

        // note for clarification to future michael:
        // From a pure memory dependency perspective, there's likely no point unrolling
        // the write loop, since writes just fire off once their LDGs arrive.
        // Put another way, the STGs are dependent on the LDGs, but not on each other.
        // There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int j = i_start + threadIdx.x + ii * blockDim.x;

          if (j < n && j < chunk_size) {
            T scaled_grad = incoming_g[ii] / grad_scale;
            m[j] = b1 * incoming_m[ii] + (1 - b1) * scaled_grad;
            v[j] = b2 * incoming_v[ii] + (1 - b2) * scaled_grad * scaled_grad;
            float denom;
            if (mode == ADAM_MODE_0)
              denom = sqrtf(v[j] + eps);
            else  // Mode 1
              denom = sqrtf(v[j]) + eps;
            float update = (m[j] / denom) + (decay * incoming_p[ii]);
            p[j] = incoming_p[ii] - (step_size * update);
            if (DEPTH == 5) p_copy[j] = (GRAD_T)p[j];
          }
        }
      }
    }
  }
};

void fused_adam_cuda(at::Tensor& p, at::Tensor& p_copy, at::Tensor& m, at::Tensor& v, at::Tensor& g, float lr,
                     float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction,
                     float decay) {
  //        using namespace at;

  // Get tensor size
  int tsize = p.numel();
  // Determine #threads and #blocks
  const int threadsPerBlock = 512;
  const dim3 blocks((tsize + threadsPerBlock - 1) / threadsPerBlock);
  TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
  // Constants
  float step_size = 0;
  if (bias_correction == 1) {
    const float bias_correction1 = 1 - std::pow(beta1, step);
    const float bias_correction2 = 1 - std::pow(beta2, step);
    step_size = lr * std::sqrt(bias_correction2) / bias_correction1;
  } else {
    step_size = lr;
  }
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (g.scalar_type() == at::ScalarType::Half) {
    // all other values should be fp32 for half gradients
    TORCH_CHECK(p.scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
    // dispatch is done on the gradient type
    using namespace at;  // prevents "toString is undefined" errors
    DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel", using accscalar_t = at::acc_type<scalar_t_0, true>;
                            adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
                                p.data_ptr<accscalar_t>(), p_copy.numel() ? p_copy.data_ptr<scalar_t_0>() : NULL,
                                m.data_ptr<accscalar_t>(), v.data_ptr<accscalar_t>(), g.data_ptr<scalar_t_0>(), beta1,
                                beta2, eps, grad_scale, step_size, tsize, (adamMode_t)mode, decay););
  } else {
    using namespace at;
    DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
                              adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
                                  p.data_ptr<scalar_t_0>(),
                                  NULL,  // don't output p_copy for fp32, it's wasted write
                                  m.data_ptr<scalar_t_0>(), v.data_ptr<scalar_t_0>(), g.data_ptr<scalar_t_0>(), beta1,
                                  beta2, eps, grad_scale, step_size, tsize, (adamMode_t)mode, decay););
  }
  C10_CUDA_CHECK(cudaGetLastError());
}

void fused_adam_cuda_mt(int chunk_size, at::Tensor noop_flag,
                        std::vector<std::vector<at::Tensor>> tensor_lists,  // p, m, v, g, p_copy
                        float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode,
                        int bias_correction, float decay) {
  // Constants
  float step_size = 0;
  if (bias_correction == 1) {
    const float bias_correction1 = 1 - std::pow(beta1, step);
    const float bias_correction2 = 1 - std::pow(beta2, step);
    step_size = lr * std::sqrt(bias_correction2) / bias_correction1;
  } else {
    step_size = lr;
  }
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  size_t tl_sz = tensor_lists.size();
  TORCH_CHECK(tl_sz == 4 || tl_sz == 5, "expected tensor lists of size 4 or 5");

  if (tensor_lists[3][0].scalar_type() == at::ScalarType::Half) {
    // alher values should be fp32 for half gradients
    TORCH_CHECK(tensor_lists[0][0].scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
    // dich is done on the gradient type
    if (tl_sz == 5) {
      DISPATCH_FLOAT_AND_HALF(tensor_lists[3][0].scalar_type(), 0, "adam_cuda_mt_kernel",
                              using accscalar_t = at::acc_type<scalar_t_0, true>;
                              multi_tensor_apply<5>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                                    AdamFunctor<5, accscalar_t, scalar_t_0>(), beta1, beta2, eps,
                                                    grad_scale, step_size, (adamMode_t)mode, decay););
    } else {
      DISPATCH_FLOAT_AND_HALF(tensor_lists[3][0].scalar_type(), 0, "adam_cuda_mt_kernel",
                              using accscalar_t = at::acc_type<scalar_t_0, true>;
                              multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                                    AdamFunctor<4, accscalar_t, scalar_t_0>(), beta1, beta2, eps,
                                                    grad_scale, step_size, (adamMode_t)mode, decay););
    }
  } else {
    if (tl_sz == 5) {
      DISPATCH_DOUBLE_AND_FLOAT(tensor_lists[3][0].scalar_type(), 0, "adam_cuda_mt_kernel",
                                multi_tensor_apply<5>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                                      AdamFunctor<5, scalar_t_0, scalar_t_0>(), beta1, beta2, eps,
                                                      grad_scale, step_size, (adamMode_t)mode, decay););
    } else {
      DISPATCH_DOUBLE_AND_FLOAT(tensor_lists[3][0].scalar_type(), 0, "adam_cuda_mt_kernel",
                                multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                                      AdamFunctor<4, scalar_t_0, scalar_t_0>(), beta1, beta2, eps,
                                                      grad_scale, step_size, (adamMode_t)mode, decay););
    }
  }
  C10_CUDA_CHECK(cudaGetLastError());
}

template <typename FROM_T, typename TO_T>
__device__ void convert(const FROM_T vi, TO_T& vo) {
  vo = static_cast<TO_T>(vi);
}

template <>
__device__ void convert(const float vi, uint8_t& vo) {
  union S {
    float as_float;
    int as_int;
  };
  S s;
  s.as_float = vi;
  s.as_int = s.as_int & 0xFF800000;
  union T {
    at::Half as_half;
    uint8_t as_byte[2];
  };
  T t;
  t.as_half = static_cast<at::Half>(vi + s.as_float / 8.0f);
  vo = t.as_byte[1];
}

template <>
__device__ void convert(const uint8_t vi, float& vo) {
  union T {
    at::Half as_half;
    uint8_t as_byte[2];
  };
  T t;
  t.as_byte[0] = 0;
  t.as_byte[1] = vi;
  vo = static_cast<float>(t.as_half);
}

template <>
__device__ void convert(const at::Half vi, uint8_t& vo) {
  union S {
    float as_float;
    int as_int;
  };
  S s;
  s.as_float = static_cast<float>(vi);
  s.as_int = s.as_int & 0xFF800000;
  union T {
    at::Half as_half;
    uint8_t as_byte[2];
  };
  T t;
  t.as_half = static_cast<at::Half>(vi + s.as_float / 8.0f);
  vo = t.as_byte[1];
}

template <>
__device__ void convert(const uint8_t vi, at::Half& vo) {
  union T {
    at::Half as_half;
    uint8_t as_byte[2];
  };
  T t;
  t.as_byte[0] = 0;
  t.as_byte[1] = vi;
  vo = t.as_half;
}

template <typename GRAD_T>
__global__ void strided_check_finite_cuda_kernel(volatile int* noop_gmem, GRAD_T* __restrict__ p_copy,
                                                 const size_t tsize, int stride, int clear_overflow_first) {
  // Assuming 2D grids and 2D blocks
  const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
  const int threadsPerBlock = blockDim.x * blockDim.y;
  const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
  const int i = (blockId * threadsPerBlock + threadIdInBlock) * stride;
  const int totThreads = gridDim.x * gridDim.y * threadsPerBlock * stride;

  if (clear_overflow_first) {
    if (i == 0) {
      *noop_gmem = 0;
    }
    __syncthreads();
  }

  for (int j = i; j < tsize; j += totThreads) {
    GRAD_T pi = p_copy[j];
    if (!isfinite(pi)) {
      *noop_gmem = 1;
    }
  }
}
template <>
__global__ void strided_check_finite_cuda_kernel(volatile int* noop_gmem, uint8_t* __restrict__ p_copy,
                                                 const size_t tsize, int stride, int clear_overflow_first) {
  // Assuming 2D grids and 2D blocks
  const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
  const int threadsPerBlock = blockDim.x * blockDim.y;
  const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
  const int i = (blockId * threadsPerBlock + threadIdInBlock) * stride;
  const int totThreads = gridDim.x * gridDim.y * threadsPerBlock * stride;

  if (clear_overflow_first) {
    if (i == 0) {
      *noop_gmem = 0;
    }
    __syncthreads();
  }

  for (int j = i; j < tsize; j += totThreads) {
    at::Half pi;
    convert(p_copy[j], pi);
    if (!isfinite(pi)) {
      *noop_gmem = 1;
    }
  }
}

template <typename FROM_T, typename TO_T>
__global__ void maybe_cast_kernel(volatile int* overflow_flag, const FROM_T* p_in, TO_T* p_out, const size_t tsize) {
  if (overflow_flag && *overflow_flag != 0) return;

  // Assuming 2D grids and 2D blocks
  const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
  const int threadsPerBlock = blockDim.x * blockDim.y;
  const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
  const int i = (blockId * threadsPerBlock + threadIdInBlock);
  const int totThreads = gridDim.x * gridDim.y * threadsPerBlock;

  FROM_T pi[ILP];
  TO_T po[ILP];

  for (int j_start = 0; j_start < tsize; j_start += totThreads * ILP) {
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      pi[ii] = 0;

      int j = j_start + i + totThreads * ii;
      if (j < tsize) {
        pi[ii] = p_in[j];
      }
    }

#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      convert(pi[ii], po[ii]);
    }

#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      int j = j_start + i + totThreads * ii;
      if (j < tsize) {
        p_out[j] = po[ii];
      }
    }
  }
}

template <typename T, typename GRAD_T, typename REDU_T>
__global__ void reversible_adam_cuda_kernel(
    T* __restrict__ p,
    REDU_T* __restrict__ p_copy,  // For mixed precision training, pass NULL if not needed
    T* __restrict__ m, T* __restrict__ v, const GRAD_T* __restrict__ g, const float b1, const float b2, const float eps,
    const float grad_scale, const float step_size, const size_t tsize, adamMode_t mode, const float decay) {
  // Assuming 2D grids and 2D blocks
  const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
  const int threadsPerBlock = blockDim.x * blockDim.y;
  const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
  const int i = (blockId * threadsPerBlock + threadIdInBlock);
  const int totThreads = gridDim.x * gridDim.y * threadsPerBlock;

  T mi[ILP];
  T vi[ILP];
  T pi[ILP];
  T gi[ILP];

  bool overflow = false;
  for (int j_start = 0; j_start < tsize; j_start += totThreads * ILP) {
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      mi[ii] = T(0);
      vi[ii] = T(0);
      pi[ii] = T(0);
      gi[ii] = GRAD_T(0);

      int j = j_start + i + totThreads * ii;
      if (j < tsize) {
        pi[ii] = p[j];
        mi[ii] = m[j];
        vi[ii] = v[j];
        gi[ii] = static_cast<T>(g[j]);
      }
    }

#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      T scaled_grad = gi[ii] / grad_scale;
      if (isfinite(scaled_grad)) {
        mi[ii] = b1 * mi[ii] + (1 - b1) * scaled_grad;
        vi[ii] = b2 * vi[ii] + (1 - b2) * scaled_grad * scaled_grad;
        float denom;
        if (mode == ADAM_MODE_0)
          denom = sqrtf(vi[ii] + eps);
        else  // Mode 1
          denom = sqrtf(vi[ii]) + eps;
        float update = (mi[ii] / denom) + (decay * pi[ii]);
        pi[ii] = pi[ii] - (step_size * update);
      } else {
        overflow = true;
      }
    }

#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      int j = j_start + i + totThreads * ii;
      if (j < tsize) {
        m[j] = mi[ii];
        v[j] = vi[ii];
        p[j] = pi[ii];
        if (p_copy != NULL) {
          convert(pi[ii], p_copy[j]);
        }
      }
    }
  }

  if (p_copy != NULL) {
    __syncthreads();
    if (overflow) {
      convert(float(INFINITY), p_copy[0]);
    }
  }
}

template <typename T, typename GRAD_T>
__global__ void maybe_adam_undo_cuda_kernel(volatile int* overflow_flag, T* __restrict__ p, T* __restrict__ m,
                                            T* __restrict__ v, const GRAD_T* __restrict__ g, const float b1,
                                            const float b2, const float eps, const float grad_scale,
                                            const float step_size, const size_t tsize, adamMode_t mode,
                                            const float decay) {
  // NB! Skip undo kernel when overflow flag is NOT set
  if (overflow_flag && *overflow_flag == 0) return;

  // Assuming 2D grids and 2D blocks
  const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
  const int threadsPerBlock = blockDim.x * blockDim.y;
  const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
  const int i = (blockId * threadsPerBlock + threadIdInBlock);
  const int totThreads = gridDim.x * gridDim.y * threadsPerBlock;

  T mi[ILP];
  T vi[ILP];
  T pi[ILP];
  T gi[ILP];

  for (int j_start = 0; j_start < tsize; j_start += totThreads * ILP) {
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      mi[ii] = T(0);
      vi[ii] = T(0);
      pi[ii] = T(0);
      gi[ii] = GRAD_T(0);

      int j = j_start + i * ILP;
      if (j < tsize) {
        pi[ii] = p[j];
        mi[ii] = m[j];
        vi[ii] = v[j];
        gi[ii] = static_cast<T>(g[j]);
      }
    }

#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      T scaled_grad = gi[ii] / grad_scale;
      if (isfinite(scaled_grad)) {
        float denom;
        if (mode == ADAM_MODE_0)
          denom = sqrtf(vi[ii] + eps);
        else  // Mode 1
          denom = sqrtf(vi[ii]) + eps;
        pi[ii] = (pi[ii] + step_size * (mi[ii] / denom)) / (1.0f - step_size * decay);
        mi[ii] = (mi[ii] - (1 - b1) * scaled_grad) / b1;
        vi[ii] = (vi[ii] - (1 - b2) * scaled_grad * scaled_grad) / b2;
        // Make sure round off errors don't create (small) negative value.
        // This can happen if we have to revert the very first step.
        vi[ii] = vi[ii] >= 0.0f ? vi[ii] : 0.0f;
      }
    }

#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      int j = j_start + i * ILP;
      if (j < tsize) {
        m[j] = mi[ii];
        v[j] = vi[ii];
        p[j] = pi[ii];
      }
    }
  }
}

template <int DEPTH, typename FROM_T, typename TO_T>
struct MaybeCastFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* overflow_flag,
                                             TensorListMetadata<DEPTH>& tl) {
    if (overflow_flag && *overflow_flag != 0) return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    FROM_T* p_in = (FROM_T*)tl.addresses[0][tensor_loc];
    p_in += chunk_idx * chunk_size;
    TO_T* p_out = (TO_T*)tl.addresses[1][tensor_loc];
    p_out += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;
    int dim = chunk_size < n ? chunk_size : n;

    FROM_T pi[ILP];
    TO_T po[ILP];

    for (int j_start = 0; j_start < dim; j_start += blockDim.x * ILP) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        pi[ii] = FROM_T(0);
        int j = j_start + threadIdx.x + ii * blockDim.x;
        if (j < dim) {
          pi[ii] = p_in[j];
        }
      }

#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        convert(pi[ii], po[ii]);
      }

#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int j = j_start + threadIdx.x + ii * blockDim.x;
        if (j < dim) {
          p_out[j] = po[ii];
        }
      }
    }
  }
};

void fused_strided_check_finite(at::Tensor& overflow_flag, at::Tensor& p_copy, int stride, int clear_overflow_first) {
  // Get tensor size
  int tsize = p_copy.numel();
  int niter = (tsize + stride - 1) / stride;

  // Determine #threads and #blocks
  const int threadsPerBlock = 512;
  // In order to avoid race condition, blocks must be 1 when clear_overflow_first flag is set.
  const dim3 blocks(clear_overflow_first ? 1 : (niter + threadsPerBlock - 1) / threadsPerBlock);
  TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(p_copy), "parameter tensor is too large to be indexed with int32");

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  using namespace at;  // prevents "toString is undefined" errors
  DISPATCH_FLOAT_HALF_AND_BYTE(
      p_copy.scalar_type(), 0, "check_finite_cuda_kernel",
      strided_check_finite_cuda_kernel<scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
          overflow_flag.data_ptr<int>(), p_copy.data_ptr<scalar_t_0>(), tsize, stride, clear_overflow_first););
  C10_CUDA_CHECK(cudaGetLastError());
}

void fused_reversible_adam_cuda(at::Tensor& p, at::Tensor& p_copy, at::Tensor& m, at::Tensor& v, at::Tensor& g,
                                float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode,
                                int bias_correction, float decay) {
  //      using namespace at;

  // Get tensor size
  int tsize = p.numel();
  // Determine #threads and #blocks
  const int threadsPerBlock = 512;
  const dim3 blocks((tsize + threadsPerBlock - 1) / threadsPerBlock);
  TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
  // Constants
  float step_size = 0;
  if (bias_correction == 1) {
    const float bias_correction1 = 1 - std::pow(beta1, step);
    const float bias_correction2 = 1 - std::pow(beta2, step);
    step_size = lr * std::sqrt(bias_correction2) / bias_correction1;
  } else {
    step_size = lr;
  }
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (g.scalar_type() == at::ScalarType::Half) {
    // all other values should be fp32 for half gradients
    TORCH_CHECK(p.scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
    // dispatch is done on the gradient type
    using namespace at;  // prevents "toString is undefined" errors
    if (p_copy.numel() == 0 || p_copy.scalar_type() == g.scalar_type()) {
      DISPATCH_FLOAT_AND_HALF(
          g.scalar_type(), 0, "adam_cuda_kernel", using accscalar_t = at::acc_type<scalar_t_0, true>;
          reversible_adam_cuda_kernel<accscalar_t, scalar_t_0, scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
              p.data_ptr<accscalar_t>(), p_copy.numel() ? p_copy.data_ptr<scalar_t_0>() : NULL,
              m.data_ptr<accscalar_t>(), v.data_ptr<accscalar_t>(), g.data_ptr<scalar_t_0>(), beta1, beta2, eps,
              grad_scale, step_size, tsize, (adamMode_t)mode, decay););
    } else {
      TORCH_CHECK(p_copy.scalar_type() == at::ScalarType::Byte, "expected parameter to be of byte type");
      DISPATCH_FLOAT_AND_HALF(
          g.scalar_type(), 0, "adam_cuda_e5m2_kernel", using accscalar_t = at::acc_type<scalar_t_0, true>;
          reversible_adam_cuda_kernel<accscalar_t, scalar_t_0, uint8_t><<<blocks, threadsPerBlock, 0, stream>>>(
              p.data_ptr<accscalar_t>(), p_copy.data_ptr<uint8_t>(), m.data_ptr<accscalar_t>(),
              v.data_ptr<accscalar_t>(), g.data_ptr<scalar_t_0>(), beta1, beta2, eps, grad_scale, step_size, tsize,
              (adamMode_t)mode, decay););
    }
  } else {
    using namespace at;
    DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
                              reversible_adam_cuda_kernel<scalar_t_0, scalar_t_0, scalar_t_0>
                              <<<blocks, threadsPerBlock, 0, stream>>>(
                                  p.data_ptr<scalar_t_0>(),
                                  NULL,  // don't output p_copy for fp32, it's wasted write
                                  m.data_ptr<scalar_t_0>(), v.data_ptr<scalar_t_0>(), g.data_ptr<scalar_t_0>(), beta1,
                                  beta2, eps, grad_scale, step_size, tsize, (adamMode_t)mode, decay););
  }
  C10_CUDA_CHECK(cudaGetLastError());
}

void maybe_cast_cuda(at::Tensor& overflow_flag, at::Tensor& p_in, at::Tensor& p_out) {
  // Get tensor size
  int tsize = p_in.numel();
  TORCH_CHECK(tsize == p_out.numel(), "p_in.numel() must equal p_out.numel()");
  // Determine #threads and #blocks
  const int threadsPerBlock = 512;
  const dim3 blocks((tsize + threadsPerBlock - 1) / threadsPerBlock);
  TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(p_in), "parameter tensor is too large to be indexed with int32");
  // Constants
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  DISPATCH_FLOAT_HALF_AND_BYTE(p_in.scalar_type(), 0,
                               "maybe_cast_cuda" DISPATCH_FLOAT_HALF_AND_BYTE(
                                   p_out.scalar_type(), 1, "maybe_cast_cuda",
                                   maybe_cast_kernel<scalar_t_0, scalar_t_1><<<blocks, threadsPerBlock, 0, stream>>>(
                                       overflow_flag.numel() ? overflow_flag.data_ptr<int>() : NULL,
                                       p_in.data_ptr<scalar_t_0>(), p_out.data_ptr<scalar_t_1>(), tsize);))
  C10_CUDA_CHECK(cudaGetLastError());
}

void maybe_cast_cuda_mt(int chunk_size, at::Tensor overflow_flag,
                        std::vector<std::vector<at::Tensor>> tensor_lists)  // p_in, p_out
{
  // Constants
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  size_t tl_sz = tensor_lists.size();
  TORCH_CHECK(tl_sz == 2, "expected tensor lists of size 2");

  DISPATCH_FLOAT_HALF_AND_BYTE(
      tensor_lists[0][0].scalar_type(), 0, "maybe_cast_cuda_mt_kernel",
      DISPATCH_FLOAT_HALF_AND_BYTE(tensor_lists[1][0].scalar_type(), 1, "maybe_cast_cuda_mt_kernel",
                                   multi_tensor_apply<2>(BLOCK_SIZE, chunk_size, overflow_flag, tensor_lists,
                                                         MaybeCastFunctor<2, scalar_t_0, scalar_t_1>());))
  C10_CUDA_CHECK(cudaGetLastError());
}

void fused_maybe_adam_undo_cuda(at::Tensor& overflow_flag, at::Tensor& p, at::Tensor& m, at::Tensor& v, at::Tensor& g,
                                float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode,
                                int bias_correction, float decay) {
  // Get tensor size
  int tsize = p.numel();
  // Determine #threads and #blocks
  const int threadsPerBlock = 512;
  const dim3 blocks((tsize + threadsPerBlock - 1) / threadsPerBlock);
  TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
  // Constants
  float step_size = 0;
  if (bias_correction == 1) {
    const float bias_correction1 = 1 - std::pow(beta1, step);
    const float bias_correction2 = 1 - std::pow(beta2, step);
    step_size = lr * std::sqrt(bias_correction2) / bias_correction1;
  } else {
    step_size = lr;
  }
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (g.scalar_type() == at::ScalarType::Half) {
    // all other values should be fp32 for half gradients
    TORCH_CHECK(p.scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
    // dispatch is done on the gradient type
    using namespace at;  // prevents "toString is undefined" errors
    DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel", using accscalar_t = at::acc_type<scalar_t_0, true>;
                            maybe_adam_undo_cuda_kernel<accscalar_t, scalar_t_0>
                            <<<blocks, threadsPerBlock, 0, stream>>>(
                                overflow_flag.numel() ? overflow_flag.data_ptr<int>() : NULL, p.data_ptr<accscalar_t>(),
                                m.data_ptr<accscalar_t>(), v.data_ptr<accscalar_t>(), g.data_ptr<scalar_t_0>(), beta1,
                                beta2, eps, grad_scale, step_size, tsize, (adamMode_t)mode, decay););
  } else {
    using namespace at;
    DISPATCH_DOUBLE_AND_FLOAT(
        g.scalar_type(), 0, "adam_cuda_kernel",
        maybe_adam_undo_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
            overflow_flag.numel() ? overflow_flag.data_ptr<int>() : NULL, p.data_ptr<scalar_t_0>(),
            m.data_ptr<scalar_t_0>(), v.data_ptr<scalar_t_0>(), g.data_ptr<scalar_t_0>(), beta1, beta2, eps, grad_scale,
            step_size, tsize, (adamMode_t)mode, decay););
  }
  C10_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp
================================================
#include <torch/extension.h>

void multi_tensor_lamb_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                            const float lr, const float beta1, const float beta2, const float epsilon, const int step,
                            const int bias_correction, const float weight_decay, const int grad_averaging,
                            const int mode, const float global_grad_norm, const float max_grad_norm);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

typedef enum {
  MOMENT_MODE_0 = 0,  // L2 regularization mode
  MOMENT_MODE_1 = 1   // Decoupled weight decay mode
} adamMode_t;

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(int chunk_size, at::Tensor noop_flag,
                                                            std::vector<std::vector<at::Tensor>> tensor_lists,
                                                            at::optional<bool> per_tensor_python);

using MATH_T = float;

template <typename T>
struct LAMBStage1Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<4>& tl,
                                             const float beta1, const float beta2, const float beta3,
                                             const float beta1_correction, const float beta2_correction,
                                             const float epsilon, adamMode_t mode, const float decay,
                                             const float global_grad_norm, const float max_global_grad_norm) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float clipped_global_grad_norm =
        global_grad_norm > max_global_grad_norm ? global_grad_norm / max_global_grad_norm : 1.0f;

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    T* m = (T*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    T* v = (T*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      MATH_T r_g[ILP];
      MATH_T r_p[ILP];
      MATH_T r_m[ILP];
      MATH_T r_v[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_g[ii] = g[i];
          // special ?optimization? for lamb stage 1
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          } else {
            r_p[ii] = p[i];
          }
          r_m[ii] = m[i];
          r_v[ii] = v[i];
        } else {
          r_g[ii] = MATH_T(0);
          r_p[ii] = MATH_T(0);
          r_m[ii] = MATH_T(0);
          r_v[ii] = MATH_T(0);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if (mode == MOMENT_MODE_0) {
          MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
          // L2 on scaled grad
          scaled_grad = scaled_grad + decay * r_p[ii];
          r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
          r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          r_p[ii] = next_m_unbiased / denom;
        } else {
          MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
          r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
          r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          g[i] = r_p[ii];
          m[i] = r_m[ii];
          v[i] = r_v[ii];
        }
      }
    }
  }
};

// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
template <typename T>
struct LAMBStage2Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<2>& tl,
                                             const float* per_tensor_param_norm, const float* per_tensor_update_norm,
                                             const float learning_rate, const float decay) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    MATH_T ratio = learning_rate;
    // apply adaptive learning rate to parameters with non-zero weight decay
    if (decay != 0.0) {
      float param_norm = per_tensor_param_norm[tensor_num];
      float update_norm = per_tensor_update_norm[tensor_num];
      ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
    }

    T* update = (T*)tl.addresses[0][tensor_loc];
    update += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      MATH_T r_p[ILP];
      MATH_T r_update[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_p[ii] = p[i];
          r_update[ii] = update[i];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          p[i] = r_p[ii];
        }
      }
    }
  }
};

void multi_tensor_lamb_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                            const float lr, const float beta1, const float beta2, const float epsilon, const int step,
                            const int bias_correction, const float weight_decay, const int grad_averaging,
                            const int mode, const float global_grad_norm, const float max_grad_norm) {
  using namespace at;
  // Master weight and 32bit momentum(potentially changing) is not handled by this
  // So we assume every tensor are all in the same type

  // Handle bias correction mode
  float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  if (bias_correction == 1) {
    bias_correction1 = 1 - std::pow(beta1, step);
    bias_correction2 = 1 - std::pow(beta2, step);
  }

  // Handle grad averaging mode
  float beta3 = 1.0f;
  if (grad_averaging == 1) beta3 = 1 - beta1;

  std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin() + 1);
  std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin() + 1, tensor_lists.begin() + 2);

  // Compute per tensor param norm
  auto param_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, param_list, true);

  // We now in-place modify grad to store update before compute its norm
  // Generally this is not a issue since people modify grad in step() method all the time
  // We can also grab list of empty tensor to avoid this, but I'd like to save space/cpu code
  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
                          multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                                LAMBStage1Functor<scalar_t_0>(), beta1, beta2,
                                                beta3,  // 1-beta1 or 1 depends on averaging mode
                                                bias_correction1, bias_correction2, epsilon, (adamMode_t)mode,
                                                weight_decay, global_grad_norm, max_grad_norm);)

  // Compute update norms
  auto update_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, grad_list, true);

  std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin() + 2);

  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
      multi_tensor_apply<2>(BLOCK_SIZE, chunk_size, noop_flag, grad_param_list, LAMBStage2Functor<scalar_t_0>(),
                            std::get<1>(param_norm_tuple).data_ptr<float>(),
                            std::get<1>(update_norm_tuple).data_ptr<float>(), lr, weight_decay);)

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp
================================================
#include <torch/extension.h>

void multi_tensor_fused_adam_cuda(int chunk_size, at::Tensor noop_flag,
                                  std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor grad_scale, float lr,
                                  float beta1, float beta2, float eps, int step, int mode, int bias_correction,
                                  float weight_decay);

void multi_tensor_fused_adam_capturable_cuda(int chunk_size, at::Tensor noop_flag,
                                             std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor grad_scale,
                                             at::Tensor lr, float beta1, float beta2, float eps, at::Tensor step,
                                             int mode, int bias_correction, float weight_decay);

void multi_tensor_fused_adam_with_param_remainders_cuda(int chunk_size, at::Tensor noop_flag,
                                                        std::vector<std::vector<at::Tensor>> tensor_lists,
                                                        at::Tensor grad_scale, float lr, float beta1, float beta2,
                                                        float eps, int step, int mode, int bias_correction,
                                                        float weight_decay);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("multi_tensor_fused_adam", &multi_tensor_fused_adam_cuda,
        "CUDA kernels for multi-tensor Adam, "
        "with param copy",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_fused_adam_capturable", &multi_tensor_fused_adam_capturable_cuda,
        "CUDA kernels for multi-tensor Adam, "
        "with param copy, capturable for CUDA graph",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_fused_adam_with_param_remainders", &multi_tensor_fused_adam_with_param_remainders_cuda,
        "CUDA kernel for multi-tensor Adam, "
        "with stored param remainders and param copy",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include <cmath>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(const T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, const T* src, int dst_offset = 0, int src_offset = 0) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((const LT*)src)[src_offset];
}

// (1-t)*x + t*y
// Note: Named _lerp to avoid ambiguity with std::lerp under C++20.
__device__ __forceinline__ float _lerp(float t, float x, float y) {
  // See https://developer.nvidia.com/blog/lerp-faster-cuda/
  return fma(t, y, fma(-t, x, x));
}

typedef enum {
  ADAM_MODE_0 = 0,  // L2 regularization mode
  ADAM_MODE_1 = 1   // Decoupled weight decay mode(AdamW)
} adamMode_t;

/* Multi-tensor Adam
 *
 * Updates params in-place and outputs a copy with a desired datatype.
 */
template <typename T, typename GRAD_T, typename PARAM_OUT_T>
struct DistAdamFunctor {
  // Vectorized local compute
  __device__ __forceinline__ static void local_step(T p[ILP], T m[ILP], T v[ILP], const GRAD_T g[ILP],
                                                    const float grad_scale, const float beta1, const float beta2,
                                                    const float beta1_correction, const float beta2_correction,
                                                    const float eps, const float lr, adamMode_t mode,
                                                    const float weight_decay) {
    if (mode == ADAM_MODE_0) {  // L2
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float scaled_grad = (g[ii] * grad_scale) + (weight_decay * p[ii]);
        float next_m = _lerp(beta1, scaled_grad, m[ii]);
        float next_v = _lerp(beta2, scaled_grad * scaled_grad, v[ii]);
        float next_m_unbiased = next_m / beta1_correction;
        float next_v_unbiased = next_v / beta2_correction;
        float denom = sqrtf(next_v_unbiased) + eps;
        float update = next_m_unbiased / denom;
        m[ii] = next_m;
        v[ii] = next_v;
        p[ii] -= lr * update;
      }
    } else {  // weight decay
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float scaled_grad = g[ii] * grad_scale;
        float next_m = _lerp(beta1, scaled_grad, m[ii]);
        float next_v = _lerp(beta2, scaled_grad * scaled_grad, v[ii]);
        float next_m_unbiased = next_m / beta1_correction;
        float next_v_unbiased = next_v / beta2_correction;
        float denom = sqrtf(next_v_unbiased) + eps;
        float update = (next_m_unbiased / denom) + (weight_decay * p[ii]);
        m[ii] = next_m;
        v[ii] = next_v;
        p[ii] -= lr * update;
      }
    }
  }

  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<5>& tl,
                                             const float* grad_scale_ptr, const float beta1, const float beta2,
                                             const float beta1_correction, const float beta2_correction,
                                             const float eps, const float lr, adamMode_t mode,
                                             const float weight_decay) const {
    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    const float grad_scale = *grad_scale_ptr;

    T* p_in = (T*)tl.addresses[0][tensor_loc];
    p_in += chunk_idx * chunk_size;
    T* m = (T*)tl.addresses[1][tensor_loc];
    m += chunk_idx * chunk_size;
    T* v = (T*)tl.addresses[2][tensor_loc];
    v += chunk_idx * chunk_size;
    const GRAD_T* g = (GRAD_T*)tl.addresses[3][tensor_loc];
    g += chunk_idx * chunk_size;
    PARAM_OUT_T* p_out = (PARAM_OUT_T*)tl.addresses[4][tensor_loc];
    p_out += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;
    n = chunk_size < n ? chunk_size : n;

    const bool aligned =
        (n % ILP == 0 && is_aligned(p_in) && is_aligned(m) && is_aligned(v) && is_aligned(g) && is_aligned(p_out));

    for (int i_start = threadIdx.x * ILP; i_start < n; i_start += blockDim.x * ILP) {
      T local_p[ILP];
      T local_m[ILP];
      T local_v[ILP];
      GRAD_T local_g[ILP];
      PARAM_OUT_T local_p_out[ILP];

      // Load
      if (aligned) {
        load_store(local_p, p_in + i_start);
        load_store(local_m, m + i_start);
        load_store(local_v, v + i_start);
        load_store(local_g, g + i_start);
      } else {
#pragma unroll
        for (int ii = 0, i = i_start; ii < ILP; ii++, i++) {
          if (i < n) {
            local_p[ii] = p_in[i];
            local_m[ii] = m[i];
            local_v[ii] = v[i];
            local_g[ii] = g[i];
          } else {
            local_p[ii] = 0;
            local_m[ii] = 0;
            local_v[ii] = 0;
            local_g[ii] = 0;
          }
        }
      }

      // Local compute
      local_step(local_p, local_m, local_v, local_g, grad_scale, beta1, beta2, beta1_correction, beta2_correction, eps,
                 lr, mode, weight_decay);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        local_p_out[ii] = static_cast<PARAM_OUT_T>(local_p[ii]);
      }

      // Store
      if (aligned) {
        load_store(p_in + i_start, local_p);
        load_store(m + i_start, local_m);
        load_store(v + i_start, local_v);
        load_store(p_out + i_start, local_p_out);
      } else {
#pragma unroll
        for (int ii = 0, i = i_start; ii < ILP; ii++, i++) {
          if (i < n) {
            p_in[i] = local_p[ii];
            m[i] = local_m[ii];
            v[i] = local_v[ii];
            p_out[i] = local_p_out[ii];
          }
        }
      }
    }
  }
};

/* Multi-tensor Adam with CUDA Graph Support
 *
 * Updates params in-place and outputs a copy with a desired datatype.
 */
template <typename T, typename GRAD_T, typename PARAM_OUT_T>
struct DistAdamCapturableFunctor {
  // Vectorized local compute
  __device__ __forceinline__ static void local_step(T p[ILP], T m[ILP], T v[ILP], const GRAD_T g[ILP],
                                                    const float grad_scale, const float beta1, const float beta2,
                                                    const float beta1_correction, const float beta2_correction,
                                                    const float eps, const float lr, adamMode_t mode,
                                                    const float weight_decay) {
    if (mode == ADAM_MODE_0) {  // L2
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float scaled_grad = (g[ii] * grad_scale) + (weight_decay * p[ii]);
        float next_m = _lerp(beta1, scaled_grad, m[ii]);
        float next_v = _lerp(beta2, scaled_grad * scaled_grad, v[ii]);
        float next_m_unbiased = next_m / beta1_correction;
        float next_v_unbiased = next_v / beta2_correction;
        float denom = sqrtf(next_v_unbiased) + eps;
        float update = next_m_unbiased / denom;
        m[ii] = next_m;
        v[ii] = next_v;
        p[ii] -= lr * update;
      }
    } else {  // weight decay
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float scaled_grad = g[ii] * grad_scale;
        float next_m = _lerp(beta1, scaled_grad, m[ii]);
        float next_v = _lerp(beta2, scaled_grad * scaled_grad, v[ii]);
        float next_m_unbiased = next_m / beta1_correction;
        float next_v_unbiased = next_v / beta2_correction;
        float denom = sqrtf(next_v_unbiased) + eps;
        float update = (next_m_unbiased / denom) + (weight_decay * p[ii]);
        m[ii] = next_m;
        v[ii] = next_v;
        p[ii] -= lr * update;
      }
    }
  }

  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<5>& tl,
                                             const float* grad_scale_ptr, const float beta1, const float beta2,
                                             const int* step, const int bias_correction, const float eps,
                                             const float* lr, adamMode_t mode, const float weight_decay) const {
    assert(noop_gmem);
    assert(grad_scale_ptr);
    assert(step);
    assert(lr);

    if (*noop_gmem == 1) return;

    float beta1_correction = 1.0f, beta2_correction = 1.0f;
    if (bias_correction == 1) {
      beta1_correction = 1 - pow(beta1, *step);
      beta2_correction = 1 - pow(beta2, *step);
    }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    const float grad_scale = *grad_scale_ptr;

    T* p_in = (T*)tl.addresses[0][tensor_loc];
    p_in += chunk_idx * chunk_size;
    T* m = (T*)tl.addresses[1][tensor_loc];
    m += chunk_idx * chunk_size;
    T* v = (T*)tl.addresses[2][tensor_loc];
    v += chunk_idx * chunk_size;
    const GRAD_T* g = (GRAD_T*)tl.addresses[3][tensor_loc];
    g += chunk_idx * chunk_size;
    PARAM_OUT_T* p_out = (PARAM_OUT_T*)tl.addresses[4][tensor_loc];
    p_out += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;
    n = chunk_size < n ? chunk_size : n;

    const bool aligned =
        (n % ILP == 0 && is_aligned(p_in) && is_aligned(m) && is_aligned(v) && is_aligned(g) && is_aligned(p_out));

    for (int i_start = threadIdx.x * ILP; i_start < n; i_start += blockDim.x * ILP) {
      T local_p[ILP];
      T local_m[ILP];
      T local_v[ILP];
      GRAD_T local_g[ILP];
      PARAM_OUT_T local_p_out[ILP];

      // Load
      if (aligned) {
        load_store(local_p, p_in + i_start);
        load_store(local_m, m + i_start);
        load_store(local_v, v + i_start);
        load_store(local_g, g + i_start);
      } else {
#pragma unroll
        for (int ii = 0, i = i_start; ii < ILP; ii++, i++) {
          if (i < n) {
            local_p[ii] = p_in[i];
            local_m[ii] = m[i];
            local_v[ii] = v[i];
            local_g[ii] = g[i];
          } else {
            local_p[ii] = 0;
            local_m[ii] = 0;
            local_v[ii] = 0;
            local_g[ii] = 0;
          }
        }
      }

      // Local compute
      local_step(local_p, local_m, local_v, local_g, grad_scale, beta1, beta2, beta1_correction, beta2_correction, eps,
                 *lr, mode, weight_decay);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        local_p_out[ii] = static_cast<PARAM_OUT_T>(local_p[ii]);
      }

      // Store
      if (aligned) {
        load_store(p_in + i_start, local_p);
        load_store(m + i_start, local_m);
        load_store(v + i_start, local_v);
        load_store(p_out + i_start, local_p_out);
      } else {
#pragma unroll
        for (int ii = 0, i = i_start; ii < ILP; ii++, i++) {
          if (i < n) {
            p_in[i] = local_p[ii];
            m[i] = local_m[ii];
            v[i] = local_v[ii];
            p_out[i] = local_p_out[ii];
          }
        }
      }
    }
  }
};

/* Functor for multi-tensor Adam with implicit main params
 *
 * If params are BF16 and optimizer state is FP32, it is not necessary
 * to store FP32 main params. Instead, store 16-bit param remainder
 * and combine with BF16 param to reconstruct the FP32 main param.
 */
template <typename GRAD_T>
struct DistAdamWithParamRemaindersFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<6>& tl,
                                             const float* grad_scale_ptr, const float beta1, const float beta2,
                                             const float beta1_correction, const float beta2_correction,
                                             const float eps, const float lr, adamMode_t mode,
                                             const float weight_decay) const {
    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    const float grad_scale = *grad_scale_ptr;

    int16_t* p_in = (int16_t*)tl.addresses[0][tensor_loc];
    p_in += chunk_idx * chunk_size;
    int16_t* p_rem = (int16_t*)tl.addresses[1][tensor_loc];
    p_rem += chunk_idx * chunk_size;
    float* m = (float*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;
    float* v = (float*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;
    const GRAD_T* g = (GRAD_T*)tl.addresses[4][tensor_loc];
    g += chunk_idx * chunk_size;
    int16_t* p_out = (int16_t*)tl.addresses[5][tensor_loc];
    p_out += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;
    n = chunk_size < n ? chunk_size : n;

    const bool aligned = (n % ILP == 0 && is_aligned(p_in) && is_aligned(p_rem) && is_aligned(m) && is_aligned(v) &&
                          is_aligned(g) && is_aligned(p_out));

    for (int i_start = threadIdx.x * ILP; i_start < n; i_start += blockDim.x * ILP) {
      union fp32_or_int162 {
        float fp32;
        int16_t int16[2];
      };
      fp32_or_int162 local_p[ILP];
      int16_t local_p_bf16[ILP];
      int16_t local_p_rem[ILP];
      float local_m[ILP];
      float local_v[ILP];
      GRAD_T local_g[ILP];

      // Load
      if (aligned) {
        load_store(local_p_bf16, p_in + i_start);
        load_store(local_p_rem, p_rem + i_start);
        load_store(local_m, m + i_start);
        load_store(local_v, v + i_start);
        load_store(local_g, g + i_start);
      } else {
#pragma unroll
        for (int ii = 0, i = i_start; ii < ILP; ii++, i++) {
          if (i < n) {
            local_p_bf16[ii] = p_in[i];
            local_p_rem[ii] = p_rem[i];
            local_m[ii] = m[i];
            local_v[ii] = v[i];
            local_g[ii] = g[i];
          } else {
            local_p_bf16[ii] = 0;
            local_p_rem[ii] = 0;
            local_m[ii] = 0;
            local_v[ii] = 0;
            local_g[ii] = 0;
          }
        }
      }

      // Reconstruct FP32 params
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if (local_p_rem[ii] < 0) local_p_bf16[ii]--;  // Undo rounding
        local_p[ii].int16[1] = local_p_bf16[ii];
        local_p[ii].int16[0] = local_p_rem[ii];
      }

      // Local compute
      using LocalFunctor = DistAdamFunctor<float, GRAD_T, void>;
      LocalFunctor::local_step(reinterpret_cast<float*>(local_p), local_m, local_v, local_g, grad_scale, beta1, beta2,
                               beta1_correction, beta2_correction, eps, lr, mode, weight_decay);

      // Split into BF16 params (rounded-to-nearest) and remainders
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        local_p_bf16[ii] = local_p[ii].int16[1];
        local_p_rem[ii] = local_p[ii].int16[0];
        if (local_p_rem[ii] < 0) local_p_bf16[ii]++;  // Round up
      }

      // Store
      if (aligned) {
        load_store(p_rem + i_start, local_p_rem);
        load_store(m + i_start, local_m);
        load_store(v + i_start, local_v);
        load_store(p_out + i_start, local_p_bf16);
      } else {
#pragma unroll
        for (int ii = 0, i = i_start; ii < ILP; ii++, i++) {
          if (i < n) {
            p_rem[i] = local_p_rem[ii];
            m[i] = local_m[ii];
            v[i] = local_v[ii];
            p_out[i] = local_p_bf16[ii];
          }
        }
      }
    }
  }
};

void multi_tensor_fused_adam_cuda(int chunk_size, at::Tensor noop_flag,
                                  std::vector<std::vector<at::Tensor>> tensor_lists,  // p_in, m, v, g, p_out
                                  at::Tensor grad_scale, float lr, float beta1, float beta2, float eps, int step,
                                  int mode, int bias_correction, float weight_decay) {
  using namespace at;

  // Expect p_in, m, v, g, p_out
  size_t tl_sz = tensor_lists.size();
  TORCH_CHECK(tl_sz == 5, "expected tensor lists of size 5");
  const auto p_in_type = tensor_lists[0][0].scalar_type();
  const auto g_type = tensor_lists[3][0].scalar_type();
  const auto p_out_type = tensor_lists[4][0].scalar_type();

  float beta1_correction = 1.0f, beta2_correction = 1.0f;
  if (bias_correction == 1) {
    beta1_correction = 1 - std::pow(beta1, step);
    beta2_correction = 1 - std::pow(beta2, step);
  }

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      p_in_type, 0, "dist_adam_cuda_kernel",
      DISPATCH_FLOAT_HALF_AND_BFLOAT(
          g_type, 1, "dist_adam_cuda_kernel",
          DISPATCH_FLOAT_HALF_AND_BFLOAT(
              p_out_type, 2, "dist_adam_cuda_kernel",
              multi_tensor_apply<5>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                    DistAdamFunctor<scalar_t_0, scalar_t_1, scalar_t_2>(), grad_scale.data_ptr<float>(),
                                    beta1, beta2, beta1_correction, beta2_correction, eps, lr, (adamMode_t)mode,
                                    weight_decay);)));
  C10_CUDA_CHECK(cudaGetLastError());
}

void multi_tensor_fused_adam_capturable_cuda(int chunk_size, at::Tensor noop_flag,
                                             std::vector<std::vector<at::Tensor>> tensor_lists,  // p_in, m, v, g, p_out
                                             at::Tensor grad_scale, at::Tensor lr, float beta1, float beta2, float eps,
                                             at::Tensor step, int mode, int bias_correction, float weight_decay) {
  using namespace at;

  // Expect p_in, m, v, g, p_out
  size_t tl_sz = tensor_lists.size();
  TORCH_CHECK(tl_sz == 5, "expected tensor lists of size 5");
  const auto p_in_type = tensor_lists[0][0].scalar_type();
  const auto g_type = tensor_lists[3][0].scalar_type();
  const auto p_out_type = tensor_lists[4][0].scalar_type();

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      p_in_type, 0, "dist_adam_capturable_cuda_kernel",
      DISPATCH_FLOAT_HALF_AND_BFLOAT(
          g_type, 1, "dist_adam_capturable_cuda_kernel",
          DISPATCH_FLOAT_HALF_AND_BFLOAT(
              p_out_type, 2, "dist_adam_capturable_cuda_kernel",
              multi_tensor_apply<5>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                    DistAdamCapturableFunctor<scalar_t_0, scalar_t_1, scalar_t_2>(),
                                    grad_scale.data_ptr<float>(), beta1, beta2, step.data_ptr<int>(), bias_correction,
                                    eps, lr.data_ptr<float>(), (adamMode_t)mode, weight_decay);)));
  C10_CUDA_CHECK(cudaGetLastError());
}

void multi_tensor_fused_adam_with_param_remainders_cuda(
    int chunk_size, at::Tensor noop_flag,
    std::vector<std::vector<at::Tensor>> tensor_lists,  // p_in, p_rem, m, v, g, p_out
    at::Tensor grad_scale, float lr, float beta1, float beta2, float eps, int step, int mode, int bias_correction,
    float weight_decay) {
  using namespace at;

  // Expect p_in, p_rem, m, v, g, p_out
  size_t tl_sz = tensor_lists.size();
  TORCH_CHECK(tl_sz == 6, "expected tensor lists of size 6");
  const auto g_type = tensor_lists[4][0].scalar_type();

  float beta1_correction = 1.0f, beta2_correction = 1.0f;
  if (bias_correction == 1) {
    beta1_correction = 1 - std::pow(beta1, step);
    beta2_correction = 1 - std::pow(beta2, step);
  }

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      g_type, 0, "dist_adam_with_param_remainders_cuda_kernel",
      multi_tensor_apply<6>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                            DistAdamWithParamRemaindersFunctor<scalar_t_0>(), grad_scale.data_ptr<float>(), beta1,
                            beta2, beta1_correction, beta2_correction, eps, lr, (adamMode_t)mode, weight_decay););
  C10_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp
================================================
#include <torch/extension.h>

void multi_tensor_lamb_compute_update_term_cuda(int chunk_size, at::Tensor noop_flag,
                                                std::vector<std::vector<at::Tensor>> tensor_lists,
                                                at::Tensor per_tensor_beta1, at::Tensor per_tensor_beta2,
                                                at::Tensor per_tensor_beta3, at::Tensor per_tensor_bias_correction,
                                                at::Tensor step, at::Tensor per_tensor_epsilon, const int mode,
                                                at::Tensor per_tensor_decay, at::Tensor global_scale,
                                                at::Tensor global_grad_norm, const float max_grad_norm);

void multi_tensor_lamb_update_weights_cuda(int chunk_size, at::Tensor noop_flag,
                                           std::vector<std::vector<at::Tensor>> tensor_lists,
                                           at::Tensor per_tensor_param_norm, at::Tensor per_tensor_update_norm,
                                           at::Tensor update_norm_offset, at::Tensor learning_rate,
                                           at::Tensor per_tensor_decay, at::Tensor global_grad_norm, bool use_nvlamb);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("multi_tensor_lamb_compute_update_term", &multi_tensor_lamb_compute_update_term_cuda,
        "Computes update term for LAMB optimizer", py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_lamb_update_weights", &multi_tensor_lamb_update_weights_cuda,
        "Applies update term for LAMB optimizer", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template <typename FROM_T, typename TO_T>
__device__ void convert(const FROM_T vi, TO_T& vo) {
  vo = static_cast<TO_T>(vi);
}

template <>
__device__ void convert(const float vi, uint8_t& vo) {
  union S {
    float as_float;
    int as_int;
  };
  S s;
  s.as_float = vi;
  s.as_int = s.as_int & 0xFF800000;
  union T {
    at::Half as_half;
    uint8_t as_byte[2];
  };
  T t;
  t.as_half = static_cast<at::Half>(vi + s.as_float / 8.0f);
  vo = t.as_byte[1];
}

template <>
__device__ void convert(const uint8_t vi, float& vo) {
  union T {
    at::Half as_half;
    uint8_t as_byte[2];
  };
  T t;
  t.as_byte[0] = 0;
  t.as_byte[1] = vi;
  vo = static_cast<float>(t.as_half);
}

template <>
__device__ void convert(const at::Half vi, uint8_t& vo) {
  union S {
    float as_float;
    int as_int;
  };
  S s;
  s.as_float = static_cast<float>(vi);
  s.as_int = s.as_int & 0xFF800000;
  union T {
    at::Half as_half;
    uint8_t as_byte[2];
  };
  T t;
  t.as_half = static_cast<at::Half>(vi + s.as_float / 8.0f);
  vo = t.as_byte[1];
}

template <>
__device__ void convert(const uint8_t vi, at::Half& vo) {
  union T {
    at::Half as_half;
    uint8_t as_byte[2];
  };
  T t;
  t.as_byte[0] = 0;
  t.as_byte[1] = vi;
  vo = t.as_half;
}

typedef enum {
  MOMENT_MODE_0 = 0,  // L2 regularization mode
  MOMENT_MODE_1 = 1   // Decoupled weight decay mode
} adamMode_t;

template <typename T, typename GRAD_T, typename MATH_T>
struct DistOptLAMBStage1Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<5>& tl,
                                             const MATH_T* per_tensor_beta1, const MATH_T* per_tensor_beta2,
                                             const MATH_T* per_tensor_beta3, const int* per_tensor_bias_correction,
                                             const int* step, const MATH_T* per_tensor_epsilon, adamMode_t mode,
                                             const MATH_T* per_tensor_decay, const MATH_T* global_scale,
                                             const MATH_T* global_grad_norm, const float max_grad_norm) {
    // I'd like this kernel to propagate infs/nans.
    if (*noop_gmem == 1) return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float combined_scale = *global_scale;
    if (max_grad_norm > 0) {
      combined_scale = max_grad_norm / (*global_grad_norm / *global_scale + 1e-6);
      combined_scale = *global_scale / std::min((float)1.0, combined_scale);
    }

    MATH_T beta1 = per_tensor_beta1[tensor_num];
    MATH_T beta2 = per_tensor_beta2[tensor_num];
    MATH_T beta3 = 1 - beta1;
    MATH_T beta1_correction, beta2_correction;
    if (per_tensor_bias_correction[tensor_num] == 1) {
      beta1_correction = 1 - pow(beta1, *step);
      beta2_correction = 1 - pow(beta2, *step);
    } else {
      beta1_correction = (MATH_T)1.0;
      beta2_correction = (MATH_T)1.0;
    }
    MATH_T epsilon = per_tensor_epsilon[tensor_num];
    MATH_T decay = per_tensor_decay[tensor_num];

    GRAD_T* g = (GRAD_T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    T* m = (T*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    T* v = (T*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;

    MATH_T* u = (MATH_T*)tl.addresses[4][tensor_loc];
    u += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    MATH_T r_g[ILP];
    MATH_T r_p[ILP];
    MATH_T r_m[ILP];
    MATH_T r_v[ILP];
    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(g) && is_aligned(p) && is_aligned(m) && is_aligned(v)) {
      GRAD_T l_g[ILP];
      T l_p[ILP];
      T l_m[ILP];
      T l_v[ILP];
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(l_g, g, 0, i_start);
        if (decay != 0) load_store(l_p, p, 0, i_start);
        load_store(l_m, m, 0, i_start);
        load_store(l_v, v, 0, i_start);
        // unpack
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_g[ii] = l_g[ii];
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          } else {
            r_p[ii] = l_p[ii];
          }
          r_m[ii] = l_m[ii];
          r_v[ii] = l_v[ii];
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / combined_scale;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay * r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          } else {
            MATH_T scaled_grad = r_g[ii] / combined_scale;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          l_m[ii] = r_m[ii];
          l_v[ii] = r_v[ii];
        }
        // store
        load_store(u, r_p, i_start, 0);
        load_store(m, l_m, i_start, 0);
        load_store(v, l_v, i_start, 0);
      }
    } else {
      // see note in multi_tensor_scale_kernel.cu
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
        MATH_T r_g[ILP];
        MATH_T r_p[ILP];
        MATH_T r_m[ILP];
        MATH_T r_v[ILP];
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            r_g[ii] = g[i];
            // special ?optimization? for lamb stage 1
            if (decay == 0) {
              r_p[ii] = MATH_T(0);
            } else {
              r_p[ii] = p[i];
            }
            r_m[ii] = m[i];
            r_v[ii] = v[i];
          } else {
            r_g[ii] = MATH_T(0);
            r_p[ii] = MATH_T(0);
            r_m[ii] = MATH_T(0);
            r_v[ii] = MATH_T(0);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / combined_scale;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay * r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          } else {
            MATH_T scaled_grad = r_g[ii] / combined_scale;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            u[i] = r_p[ii];
            m[i] = r_m[ii];
            v[i] = r_v[ii];
          }
        }
      }
    }
  }
};

// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
template <typename T, typename GRAD_T, typename MATH_T>
struct DistOptLAMBStage2Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<3>& tl,
                                             const MATH_T* per_tensor_param_norm, const MATH_T* per_tensor_update_norm,
                                             const long* update_norm_offset, const MATH_T* learning_rate,
                                             const MATH_T* per_tensor_decay, const MATH_T* global_grad_norm,
                                             bool use_nvlamb) {
    // I'd like this kernel to propagate infs/nans.
    if (*noop_gmem == 1) return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    MATH_T decay = per_tensor_decay[tensor_num];

    MATH_T ratio = *learning_rate;
    // nvlamb: apply adaptive learning rate to all parameters
    // otherwise, only apply to those with non-zero weight decay
    if (use_nvlamb || (decay != (MATH_T)0.0)) {
      MATH_T param_norm = per_tensor_param_norm[tensor_num];
      MATH_T update_norm = per_tensor_update_norm[update_norm_offset[tensor_num]];
      ratio =
          (update_norm != 0.0 && param_norm != 0.0) ? (*learning_rate) * (param_norm / update_norm) : (*learning_rate);
    }

    MATH_T* update = (MATH_T*)tl.addresses[0][tensor_loc];
    update += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    GRAD_T* p_copy = (GRAD_T*)tl.addresses[2][tensor_loc];
    p_copy += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(update)) {
      T r_p[ILP];
      MATH_T r_update[ILP];
      GRAD_T r_p_copy[ILP];
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_p, p, 0, i_start);
        load_store(r_update, update, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * r_update[ii]);
          convert(r_p[ii], r_p_copy[ii]);
        }
        load_store(p, r_p, i_start, 0);
        load_store(p_copy, r_p_copy, i_start, 0);
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
        MATH_T r_p[ILP];
        MATH_T r_update[ILP];
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            r_p[ii] = p[i];
            r_update[ii] = update[i];
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            p[i] = r_p[ii];
            convert(r_p[ii], p_copy[i]);
          }
        }
      }
    }
  }
};

void multi_tensor_lamb_compute_update_term_cuda(int chunk_size, at::Tensor noop_flag,
                                                std::vector<std::vector<at::Tensor>> tensor_lists,
                                                at::Tensor per_tensor_beta1, at::Tensor per_tensor_beta2,
                                                at::Tensor per_tensor_beta3, at::Tensor per_tensor_bias_correction,
                                                at::Tensor step, at::Tensor per_tensor_epsilon, const int mode,
                                                at::Tensor per_tensor_decay, at::Tensor global_scale,
                                                at::Tensor global_grad_norm, const float max_grad_norm) {
  using namespace at;

  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[1][0].scalar_type(), 0, "lamb_stage_1",
      DISPATCH_FLOAT_AND_HALF(
          tensor_lists[0][0].scalar_type(), 1, "lamb_stage_1",
          DISPATCH_FLOAT_AND_HALF(
              tensor_lists[4][0].scalar_type(), 2, "lamb_stage_1",
              multi_tensor_apply<5>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                    DistOptLAMBStage1Functor<scalar_t_0, scalar_t_1, scalar_t_2>(),
                                    per_tensor_beta1.data_ptr<scalar_t_2>(), per_tensor_beta2.data_ptr<scalar_t_2>(),
                                    per_tensor_beta3.data_ptr<scalar_t_2>(), per_tensor_bias_correction.data_ptr<int>(),
                                    step.data_ptr<int>(), per_tensor_epsilon.data_ptr<scalar_t_2>(), (adamMode_t)mode,
                                    per_tensor_decay.data_ptr<scalar_t_2>(), global_scale.data_ptr<scalar_t_2>(),
                                    global_grad_norm.data_ptr<scalar_t_2>(), max_grad_norm);)))

  AT_CUDA_CHECK(cudaGetLastError());
}

void multi_tensor_lamb_update_weights_cuda(int chunk_size, at::Tensor noop_flag,
                                           std::vector<std::vector<at::Tensor>> tensor_lists,
                                           at::Tensor per_tensor_param_norm, at::Tensor per_tensor_update_norm,
                                           at::Tensor update_norm_offset, at::Tensor learning_rate,
                                           at::Tensor per_tensor_decay, at::Tensor global_grad_norm, bool use_nvlamb) {
  using namespace at;

  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[1][0].scalar_type(), 0, "lamb_stage_2",
      DISPATCH_FLOAT_HALF_AND_BYTE(
          tensor_lists[2][0].scalar_type(), 1, "lamb_stage_2",
          DISPATCH_FLOAT_AND_HALF(
              tensor_lists[0][0].scalar_type(), 2, "lamb_stage_2",
              multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                    DistOptLAMBStage2Functor<scalar_t_0, scalar_t_1, scalar_t_2>(),
                                    per_tensor_param_norm.data_ptr<scalar_t_2>(),
                                    per_tensor_update_norm.data_ptr<scalar_t_2>(), update_norm_offset.data_ptr<long>(),
                                    learning_rate.data_ptr<scalar_t_2>(), per_tensor_decay.data_ptr<scalar_t_2>(),
                                    global_grad_norm.data_ptr<scalar_t_2>(), use_nvlamb);)))

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: apex/contrib/csrc/peer_memory/peer_memory.cpp
================================================
/**
 * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "peer_memory_cuda.cuh"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("allocate_raw", &apex::contrib::peer_memory::allocate_raw, "allocate_raw",
        py::call_guard<py::gil_scoped_release>());
  m.def("free_raw", &apex::contrib::peer_memory::free_raw, "free_raw", py::call_guard<py::gil_scoped_release>());
  m.def("zero", &apex::contrib::peer_memory::zero, "zero", py::call_guard<py::gil_scoped_release>());
  m.def("get_raw_ipc_address", &apex::contrib::peer_memory::get_raw_ipc_address, "get_raw_ipc_address",
        py::call_guard<py::gil_scoped_release>());
  m.def("get_raw_peers", &apex::contrib::peer_memory::get_raw_peers, "get_raw_peers",
        py::call_guard<py::gil_scoped_release>());
  m.def("blob_view_half", &apex::contrib::peer_memory::blob_view_half, "blob_view_half",
        py::call_guard<py::gil_scoped_release>());
  m.def("blob_view_float", &apex::contrib::peer_memory::blob_view_float, "blob_view_float",
        py::call_guard<py::gil_scoped_release>());
  m.def("blob_view_int", &apex::contrib::peer_memory::blob_view_int, "blob_view_int",
        py::call_guard<py::gil_scoped_release>());
  m.def("push_pull_halos_1d", &apex::contrib::peer_memory::push_pull_halos_1d, "push_pull_halos_1d",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/peer_memory/peer_memory_cuda.cu
================================================
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>

#include <cassert>
#include <cstdio>
#include <list>

#include "nccl.h"

#define CUDACHECK(cmd)                                                                                \
  do {                                                                                                \
    cudaError_t err = cmd;                                                                            \
    if (err != cudaSuccess) {                                                                         \
      char hostname[1024];                                                                            \
      gethostname(hostname, 1024);                                                                    \
      printf("%s: CUDA failure %s:%d '%s'\n", hostname, __FILE__, __LINE__, cudaGetErrorString(err)); \
    }                                                                                                 \
  } while (0)

namespace {

constexpr int THREADS_PER_CTA = 128;

/* Basic deleter function for from_blob function.
void deleter(void* ptr)
{
    printf("deleter(ptr=%p)\n",ptr);
    cudaFree(ptr);
}
*/

template <class T>
at::Tensor blob_view(T* raw_ptr, std::vector<int64_t> shape, const at::TensorOptions& options, bool channels_last) {
  size_t size = 1;
  std::vector<int64_t> strides(shape.size());
  if (channels_last) {
    assert(shape.size() == 4);
    strides[0] = shape[1] * shape[2] * shape[3];
    strides[1] = 1;
    strides[2] = shape[1] * shape[3];
    strides[3] = shape[1];
  } else {
    int idx = strides.size();
    for (auto it = shape.rbegin(); it != shape.rend(); ++it) {
      strides[--idx] = size;
      size *= *it;
    }
  }
  size *= sizeof(T);
  // TODO: Implement dynamic reuse of pooled peer memory.
  // We provide no deleter function because all peer memory allocations are static in this implementation.
  return torch::from_blob((void*)raw_ptr, shape, strides, 0L, options);
}

void tensor_shape(at::Tensor t, bool explicit_nhwc, int& N, int& C, int& H, int& W) {
  if (t.dim() == 3) {
    N = 1;
    if (explicit_nhwc) {
      C = t.size(2);
      H = t.size(0);
      W = t.size(1);
    } else {
      C = t.size(0);
      H = t.size(1);
      W = t.size(2);
    }
  } else if (t.dim() == 4) {
    if (explicit_nhwc) {
      N = t.size(0);
      C = t.size(3);
      H = t.size(1);
      W = t.size(2);
    } else {
      N = t.size(0);
      C = t.size(1);
      H = t.size(2);
      W = t.size(3);
    }
  } else {
    printf("%s;%d - t.dim() must be either 3 or 4 (was %d)\n", __FILE__, __LINE__, int(t.dim()));
    assert(t.dim() == 3 || t.dim() == 4);
  }
}

void tensor_strides(at::Tensor t, bool explicit_nhwc, int& stride_N, int& stride_C, int& stride_H, int& stride_W) {
  if (t.dim() == 3) {
    if (explicit_nhwc) {
      stride_C = t.stride(2);
      stride_H = t.stride(0);
      stride_W = t.stride(1);
    } else {
      stride_C = t.stride(0);
      stride_H = t.stride(1);
      stride_W = t.stride(2);
    }
    stride_N = t.size(0) * t.size(1) * t.size(2);
  } else if (t.dim() == 4) {
    if (explicit_nhwc) {
      stride_N = t.stride(0);
      stride_C = t.stride(3);
      stride_H = t.stride(1);
      stride_W = t.stride(2);
    } else {
      stride_N = t.stride(0);
      stride_C = t.stride(1);
      stride_H = t.stride(2);
      stride_W = t.stride(3);
    }
  } else {
    printf("%s;%d - t.dim() must be either 3 or 4 (was %d)\n", __FILE__, __LINE__, t.dim());
    assert(t.dim() == 3 || t.dim() == 4);
  }
}

template <class T>
inline __device__ void __zero(T* dst) {
  *dst = T(0);
}

inline __device__ void __zero(int2* dst) { *dst = {0, 0}; }

template <class T, bool contiguous>
inline __device__ void zero_tensor(const int dim0, const int dim1, const int dim2, T* __restrict__ data,
                                   const int data_stride0, const int data_stride1, const int data_stride2,
                                   const int thread_id, const int block_id, const int num_blocks) {
  const int global_id = thread_id + block_id * THREADS_PER_CTA;
  const int num_threads = num_blocks * THREADS_PER_CTA;
  const int count = dim0 * dim1 * dim2;
  for (int i = global_id; i < count; i += num_threads) {
    int offset;
    if (contiguous) {
      offset = i;
    } else {
      const int j2 = i % dim2;
      const int k = i / dim2;
      const int j1 = k % dim1;
      const int j0 = k / dim1;
      offset = j0 * data_stride0 + j1 * data_stride1 + j2 * data_stride2;
    }
    __zero(data + offset);
  }
}

template <class T, bool contiguous>
inline __device__ void push_pull_tensor(const int dim0, const int dim1, const int dim2, const T* __restrict__ data_in,
                                        const int data_in_stride0, const int data_in_stride1, const int data_in_stride2,
                                        T* __restrict__ data_out, const int data_out_stride0,
                                        const int data_out_stride1, const int data_out_stride2, int4* local_peer,
                                        int4* remote_peer, const int thread_id, const int block_id,
                                        const int num_blocks) {
  // 128b=16B NVLink flit
  // Note: Use last 4B as a semaphore
  static_assert(sizeof(T) <= 12);
  union Flit {
    T payload;
    uint uints[4];
  };
  // Communication bit indicates whether flit has been received from
  // a remote GPU
  constexpr uint communication_mask = 1 << 0;
  // Status bit is used to choose the active peer buffer in an
  // alternating double buffer scheme. We use buffer 1 if the bits
  // match, use buffer 2 if the bits differ, and invert the bit
  // after finishing with a buffer.
  constexpr uint status_mask = 1 << 1;

  // Split peer memory into two sets of buffers
  // Note: Each block owns a THREADS_PER_CTA*2*16B chunk of peer
  // memory
  const int peer_offset1 = block_id * THREADS_PER_CTA * 2 + thread_id;
  const int peer_offset2 = peer_offset1 + THREADS_PER_CTA;
  volatile int* local_peer1 = reinterpret_cast<volatile int*>(local_peer + peer_offset1);
  volatile int* local_peer2 = reinterpret_cast<volatile int*>(local_peer + peer_offset2);
  volatile int* remote_peer1 = reinterpret_cast<volatile int*>(remote_peer + peer_offset1);
  volatile int* remote_peer2 = reinterpret_cast<volatile int*>(remote_peer + peer_offset2);

  // Iterate through tensor entries
  const int num_threads = num_blocks * THREADS_PER_CTA;
  const int count = dim0 * dim1 * dim2;
  for (int i0 = block_id * THREADS_PER_CTA; i0 < count; i0 += num_threads) {
    const int i = i0 + thread_id;
    const bool has_data = i < count;

    // Calculate buffer positions
    int data_in_offset, data_out_offset;
    if (contiguous) {
      data_in_offset = i;
      data_out_offset = i;
    } else {
      const int j2 = i % dim2;
      const int k = i / dim2;
      const int j1 = k % dim1;
      const int j0 = k / dim1;
      data_in_offset = j0 * data_in_stride0 + j1 * data_in_stride1 + j2 * data_in_stride2;
      data_out_offset = j0 * data_out_stride0 + j1 * data_out_stride1 + j2 * data_out_stride2;
    }

    // Determine which peer memory buffer to use
    // Note: The status bit is not affected by asynchronous
    // communication from the remote GPU.
    Flit local_message1, local_message2;
    asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
                 : "=r"(local_message1.uints[0]), "=r"(local_message1.uints[1]), "=r"(local_message1.uints[2]),
                   "=r"(local_message1.uints[3])
                 : "l"(local_peer1)
                 : "memory");
    asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
                 : "=r"(local_message2.uints[0]), "=r"(local_message2.uints[1]), "=r"(local_message2.uints[2]),
                   "=r"(local_message2.uints[3])
                 : "l"(local_peer2)
                 : "memory");
    const uint status1 = local_message1.uints[3] & status_mask;
    const uint status2 = local_message2.uints[3] & status_mask;
    const bool peer1_is_active = (status1 ^ status2) == 0;
    volatile int* ox = peer1_is_active ? remote_peer1 : remote_peer2;
    volatile int* ix = peer1_is_active ? local_peer1 : local_peer2;
    const uint status = peer1_is_active ? status1 : status2;
    Flit recv_message = peer1_is_active ? local_message1 : local_message2;

    // Send flit to remote GPU
    // Note: Set communication bit and keep status bit
    Flit send_message;
    if (has_data) {
      send_message.payload = data_in[data_in_offset];
    }
    send_message.uints[3] = communication_mask | status;
    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(ox), "r"(send_message.uints[0]),
                 "r"(send_message.uints[1]), "r"(send_message.uints[2]), "r"(send_message.uints[3])
                 : "memory");

    // Recieve flit from peer
    while ((recv_message.uints[3] & communication_mask) == 0) {
      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
                   : "=r"(recv_message.uints[0]), "=r"(recv_message.uints[1]), "=r"(recv_message.uints[2]),
                     "=r"(recv_message.uints[3])
                   : "l"(ix)
                   : "memory");
    }
    if (has_data) {
      data_out[data_out_offset] = recv_message.payload;
    }

    // Reset semaphore
    // Note: Clear communication bit and invert status bit
    uint flag = ~status & status_mask;
    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(ix), "n"(0), "n"(0), "n"(0), "r"(flag)
                 : "memory");
    if (i0 + num_threads < count) {
      __threadfence_system();
    }
  }
}

template <class T, bool contiguous, bool top_zero, bool btm_zero>
#if __CUDA_ARCH__ >= 700
__launch_bounds__(THREADS_PER_CTA)
#endif
    __global__ void push_pull_halos_1d_kernel(
        // top halo,
        T* toh, int toh_stride0, int toh_stride1, int toh_stride2,        // top output halo (local)
        const T* tih, int tih_stride0, int tih_stride1, int tih_stride2,  // top input halo (local)
        int4* tox,                                                        // top output transfer buffer (remote peer)
        int4* tix,                                                        // top input transfer buffer (local peer)
        // btm halo
        T* boh, int boh_stride0, int boh_stride1, int boh_stride2,        // btm output halo (local)
        const T* bih, int bih_stride0, int bih_stride1, int bih_stride2,  // btm input halo (local)
        int4* box,                                                        // btm output transfer buffer (remote peer)
        int4* bix,                                                        // btm input transfer buffer (local peer)
        // dimensions
        int dim0, int dim1, int dim2,
        bool top_first  // whether to launch communicate top halo first
    ) {
  const int num_blocks_side = gridDim.x / 2;
  const int block_id_side = (blockIdx.x < num_blocks_side ? blockIdx.x : blockIdx.x - num_blocks_side);
  const bool in_top_block = top_first == (blockIdx.x < num_blocks_side);
  if (in_top_block) {
    if (top_zero) {
      zero_tensor<T, contiguous>(dim0, dim1, dim2, toh, toh_stride0, toh_stride1, toh_stride2, threadIdx.x,
                                 block_id_side, num_blocks_side);
    } else {
      push_pull_tensor<T, contiguous>(dim0, dim1, dim2, tih, tih_stride0, tih_stride1, tih_stride2, toh, toh_stride0,
                                      toh_stride1, toh_stride2, tix, tox, threadIdx.x, block_id_side, num_blocks_side);
    }
  } else {
    if (btm_zero) {
      zero_tensor<T, contiguous>(dim0, dim1, dim2, boh, boh_stride0, boh_stride1, boh_stride2, threadIdx.x,
                                 block_id_side, num_blocks_side);
    } else {
      push_pull_tensor<T, contiguous>(dim0, dim1, dim2, bih, bih_stride0, bih_stride1, bih_stride2, boh, boh_stride0,
                                      boh_stride1, boh_stride2, bix, box, threadIdx.x, block_id_side, num_blocks_side);
    }
  }
}

__global__ void delay_kernel(int delay_nanoseconds, int* counter) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    // waste time while doing something compiler can't predict, thus preventing it from optimizing away this code.
    int new_counter = 0;
    double elapsed = 0;
    clock_t start = clock();
    do {
      clock_t now = clock();
      elapsed = (double)(now - start) * 1e9 / CLOCKS_PER_SEC;
      ++new_counter;
    } while (elapsed < (double)delay_nanoseconds);
    *counter = new_counter;
  }
}

}  // namespace

namespace apex {
namespace contrib {
namespace peer_memory {

int64_t allocate_raw(int64_t size) {
  float* ptr = 0L;
  cudaMalloc(&ptr, size);
  cudaMemset(ptr, 0, size);
  return (int64_t)ptr;
}

void free_raw(int64_t raw) { cudaFree((void*)raw); }

void zero(int64_t raw, int64_t size) { cudaMemset((void*)raw, 0, size); }

at::Tensor get_raw_ipc_address(int64_t raw) {
  cudaIpcMemHandle_t mem_handle;
  CUDACHECK(cudaIpcGetMemHandle(&mem_handle, (void*)raw));
  const int n = sizeof(cudaIpcMemHandle_t);
  auto address_tensor = torch::empty({n}, torch::dtype(torch::kUInt8));
  auto address_tensor_p = address_tensor.data_ptr<uint8_t>();
  memcpy(address_tensor_p, (uint8_t*)&mem_handle, n);
  return address_tensor;
}

std::vector<int64_t> get_raw_peers(at::Tensor ipc_addresses, int peer_rank, int64_t raw) {
  int peer_group_size = ipc_addresses.size(0);
  std::vector<int64_t> results(peer_group_size);
  for (int i = 0; i < peer_group_size; ++i) {
    if (i != peer_rank) {
      cudaIpcMemHandle_t mem_handle;
      memcpy(&mem_handle, ipc_addresses.index({i}).data_ptr<uint8_t>(), sizeof(cudaIpcMemHandle_t));
      void* p = 0L;
      CUDACHECK(cudaIpcOpenMemHandle((void**)&p, mem_handle, cudaIpcMemLazyEnablePeerAccess));
      results[i] = (int64_t)p;
    } else {
      results[i] = (int64_t)raw;
    }
  }
  return results;
}

at::Tensor blob_view_half(int64_t raw, std::vector<int64_t> shape, bool channels_last) {
  return blob_view<at::Half>((at::Half*)raw, shape, torch::dtype(torch::kFloat16).device(torch::kCUDA), channels_last);
}

at::Tensor blob_view_float(int64_t raw, std::vector<int64_t> shape, bool channels_last) {
  return blob_view<float>((float*)raw, shape, torch::dtype(torch::kFloat32).device(torch::kCUDA), channels_last);
}

at::Tensor blob_view_int(int64_t raw, std::vector<int64_t> shape, bool channels_last) {
  return blob_view<int>((int*)raw, shape, torch::dtype(torch::kInt32).device(torch::kCUDA), channels_last);
}

void push_pull_halos_1d(
    bool diagnostics, bool explicit_nhwc,
    int numSM,                    // number of SMs to use (zero corresponds to all SMs)
    int rank,                     // rank in spatial parallel group
    bool top_zero,                // if top halo should be zeroed
    at::Tensor top_in_halo,       // top input halo buffer (in local device memory, sent to top neighbor)
    at::Tensor top_in_transfer,   // top input transfer buffer (in local peer memory)
    at::Tensor top_out_transfer,  // top output transfer buffer (in top neighbor peer memory)
    at::Tensor top_out_halo,      // top output halo buffer (in local device memory, received from top neighbor)
    bool btm_zero,                // if btm halo should be zeroed
    at::Tensor btm_in_halo,       // btm input halo buffer (in local device memory, sent to btm neighbor)
    at::Tensor btm_in_transfer,   // btm input transfer buffer (in local peer memory)
    at::Tensor btm_out_transfer,  // btm output transfer buffer (in btm neighbor peer memory)
    at::Tensor btm_out_halo       // btm output halo buffer (in local device memory, received from btm neighbor)
) {
  // basic checks of inputs
  TORCH_CHECK(!(top_zero && btm_zero));
  TORCH_CHECK(top_in_halo.is_cuda());
  TORCH_CHECK(top_out_transfer.is_cuda());
  TORCH_CHECK(top_in_transfer.is_cuda());
  TORCH_CHECK(top_out_halo.is_cuda());
  TORCH_CHECK(btm_in_halo.is_cuda());
  TORCH_CHECK(btm_out_transfer.is_cuda());
  TORCH_CHECK(btm_in_transfer.is_cuda());
  TORCH_CHECK(btm_out_halo.is_cuda());

  // tensor shapes
  int tih_N, tih_C, tih_H, tih_W;
  tensor_shape(top_in_halo, explicit_nhwc, tih_N, tih_C, tih_H, tih_W);
  int toh_N, toh_C, toh_H, toh_W;
  tensor_shape(top_out_halo, explicit_nhwc, toh_N, toh_C, toh_H, toh_W);
  int bih_N, bih_C, bih_H, bih_W;
  tensor_shape(btm_in_halo, explicit_nhwc, bih_N, bih_C, bih_H, bih_W);
  int boh_N, boh_C, boh_H, boh_W;
  tensor_shape(btm_out_halo, explicit_nhwc, boh_N, boh_C, boh_H, boh_W);
  TORCH_CHECK(toh_N == tih_N && tih_N == boh_N && boh_N == bih_N && toh_C == tih_C && tih_C == boh_C &&
              boh_C == bih_C && toh_H == tih_H && tih_H == boh_H && boh_H == bih_H && toh_W == tih_W &&
              tih_W == boh_W && boh_W == bih_W);
  int NN = toh_N, NC = toh_C, NH = toh_H, NW = toh_W;
  if (diagnostics) {
    printf("rank %d: NN=%d, NC=%d, NH=%d, NW=%d\n", rank, NN, NC, NH, NW);
  }
  TORCH_CHECK(NN == 1);

  // tensor strides
  int tih_stride_N, tih_stride_C, tih_stride_H, tih_stride_W;
  tensor_strides(top_in_halo, explicit_nhwc, tih_stride_N, tih_stride_C, tih_stride_H, tih_stride_W);
  int toh_stride_N, toh_stride_C, toh_stride_H, toh_stride_W;
  tensor_strides(top_out_halo, explicit_nhwc, toh_stride_N, toh_stride_C, toh_stride_H, toh_stride_W);
  int bih_stride_N, bih_stride_C, bih_stride_H, bih_stride_W;
  tensor_strides(btm_in_halo, explicit_nhwc, bih_stride_N, bih_stride_C, bih_stride_H, bih_stride_W);
  int boh_stride_N, boh_stride_C, boh_stride_H, boh_stride_W;
  tensor_strides(btm_out_halo, explicit_nhwc, boh_stride_N, boh_stride_C, boh_stride_H, boh_stride_W);
  if (diagnostics) {
    printf("rank %d: tih_stride :: N=%d, C=%d, H=%d, W=%d\n", rank, tih_stride_N, tih_stride_C, tih_stride_H,
           tih_stride_W);
    printf("rank %d: toh_stride :: N=%d, C=%d, H=%d, W=%d\n", rank, toh_stride_N, toh_stride_C, toh_stride_H,
           toh_stride_W);
    printf("rank %d: bih_stride :: N=%d, C=%d, H=%d, W=%d\n", rank, bih_stride_N, bih_stride_C, bih_stride_H,
           bih_stride_W);
    printf("rank %d: boh_stride :: N=%d, C=%d, H=%d, W=%d\n", rank, boh_stride_N, boh_stride_C, boh_stride_H,
           boh_stride_W);
  }

  // determine if nhwc
  bool is_nhwc = (toh_stride_C == 1);
  if (diagnostics) {
    printf("rank %d: is_nhwc = %s\n", rank, is_nhwc ? "true" : "false");
  }

  // determine if contiguous
  bool contiguous = true;
  if ((NN - 1) * toh_stride_N + (NC - 1) * toh_stride_C + (NH - 1) * toh_stride_H + (NW - 1) * toh_stride_W !=
      NN * NC * NH * NW - 1) {
    contiguous = false;
  }
  if ((NN - 1) * boh_stride_N + (NC - 1) * boh_stride_C + (NH - 1) * boh_stride_H + (NW - 1) * boh_stride_W !=
      NN * NC * NH * NW - 1) {
    contiguous = false;
  }
  if (!top_zero) {
    if (toh_stride_N != tih_stride_N || toh_stride_C != tih_stride_C || toh_stride_H != tih_stride_H ||
        toh_stride_W != tih_stride_W) {
      contiguous = false;
    }
  }
  if (!btm_zero) {
    if (boh_stride_N != bih_stride_N || boh_stride_C != bih_stride_C || boh_stride_H != bih_stride_H ||
        boh_stride_W != bih_stride_W) {
      contiguous = false;
    }
  }
  if (diagnostics) {
    printf("rank %d: contiguous = %s\n", rank, contiguous ? "true" : "false");
  }

  // determine whether to communicate top halo first
  bool top_first = rank % 2 != 0;
  if (diagnostics) {
    printf("rank %d: top_first = %s\n", rank, top_first ? "true" : "false");
  }

  // peer memory buffers
  int tox_size = top_out_transfer.numel() * top_out_transfer.element_size();
  int tix_size = top_in_transfer.numel() * top_in_transfer.element_size();
  int box_size = btm_out_transfer.numel() * btm_out_transfer.element_size();
  int bix_size = btm_in_transfer.numel() * btm_in_transfer.element_size();
  if (!top_zero) {
    TORCH_CHECK(top_out_transfer.is_contiguous());
    TORCH_CHECK(top_in_transfer.is_contiguous());
    TORCH_CHECK(tox_size == tix_size);
  }
  if (!btm_zero) {
    TORCH_CHECK(btm_out_transfer.is_contiguous());
    TORCH_CHECK(btm_in_transfer.is_contiguous());
    TORCH_CHECK(box_size == bix_size);
  }

  // figure out launch parameters
  int device;
  cudaGetDevice(&device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  if (numSM <= 0 || numSM > prop.multiProcessorCount) {
    numSM = prop.multiProcessorCount;
  }
  auto current_stream = at::cuda::getCurrentCUDAStream();
  dim3 block(THREADS_PER_CTA, 1, 1);

  // helper macros to launch templated kernel
#define LAUNCH_PUSH_PULL_HALO_KERNEL_BASE(T, CONTIGUOUS, TOP_ZERO, BTM_ZERO, KERNEL_ARGS, NUM_ELEMENTS)           \
  do {                                                                                                            \
    /* kernel configuration */                                                                                    \
    int numBlocksPerSm;                                                                                           \
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(                                                                \
        &numBlocksPerSm, push_pull_halos_1d_kernel<T, CONTIGUOUS, TOP_ZERO, BTM_ZERO>, THREADS_PER_CTA, 0);       \
    dim3 grid(numSM * numBlocksPerSm, 1, 1);                                                                      \
    if (grid.x % 2 != 0) {                                                                                        \
      /* require even number of blocks (half for top, half for bottom) */                                         \
      grid.x -= 1;                                                                                                \
    }                                                                                                             \
    if ((grid.x / 2) * THREADS_PER_CTA > NUM_ELEMENTS) {                                                          \
      /* only need enough blocks to cover top and bottom halo elements */                                         \
      grid.x = 2 * ((NUM_ELEMENTS + THREADS_PER_CTA - 1) / THREADS_PER_CTA);                                      \
    }                                                                                                             \
    if (!TOP_ZERO) {                                                                                              \
      /* require 2*128b=32B peer memory per thread */                                                             \
      if ((grid.x / 2) * THREADS_PER_CTA * 32 > tox_size) {                                                       \
        grid.x = 2 * (tox_size / (THREADS_PER_CTA * 32));                                                         \
      }                                                                                                           \
    }                                                                                                             \
    if (!BTM_ZERO) {                                                                                              \
      /* require 2*128b=32B peer memory per thread */                                                             \
      if ((grid.x / 2) * THREADS_PER_CTA * 32 > box_size) {                                                       \
        grid.x = 2 * (box_size / (THREADS_PER_CTA * 32));                                                         \
      }                                                                                                           \
    }                                                                                                             \
    TORCH_CHECK(grid.x >= 2);                                                                                     \
                                                                                                                  \
    /* launch kernel */                                                                                           \
    cudaLaunchCooperativeKernel((void*)push_pull_halos_1d_kernel<T, CONTIGUOUS, TOP_ZERO, BTM_ZERO>, grid, block, \
                                KERNEL_ARGS, 0, current_stream);                                                  \
  } while (false)
#define LAUNCH_PUSH_PULL_HALO_KERNEL(T, CONTIGUOUS, KERNEL_ARGS, NUM_ELEMENTS)                   \
  do {                                                                                           \
    if (top_zero) {                                                                              \
      LAUNCH_PUSH_PULL_HALO_KERNEL_BASE(T, CONTIGUOUS, true, false, KERNEL_ARGS, NUM_ELEMENTS);  \
    } else if (btm_zero) {                                                                       \
      LAUNCH_PUSH_PULL_HALO_KERNEL_BASE(T, CONTIGUOUS, false, true, KERNEL_ARGS, NUM_ELEMENTS);  \
    } else {                                                                                     \
      LAUNCH_PUSH_PULL_HALO_KERNEL_BASE(T, CONTIGUOUS, false, false, KERNEL_ARGS, NUM_ELEMENTS); \
    }                                                                                            \
  } while (false)

  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, top_out_halo.scalar_type(), "push_pull_halos_1d_kernel", [&] {
    if (diagnostics) {
      printf("rank %d: size(scalar_t) = %ld\n", rank, sizeof(scalar_t));
    }
    scalar_t* toh_p = top_out_halo.data_ptr<scalar_t>();
    scalar_t* tih_p = top_in_halo.data_ptr<scalar_t>();
    int4* tox_p = reinterpret_cast<int4*>(top_out_transfer.data_ptr<scalar_t>());
    int4* tix_p = reinterpret_cast<int4*>(top_in_transfer.data_ptr<scalar_t>());
    scalar_t* boh_p = btm_out_halo.data_ptr<scalar_t>();
    scalar_t* bih_p = btm_in_halo.data_ptr<scalar_t>();
    int4* box_p = reinterpret_cast<int4*>(btm_out_transfer.data_ptr<scalar_t>());
    int4* bix_p = reinterpret_cast<int4*>(btm_in_transfer.data_ptr<scalar_t>());
    if (diagnostics) printf("rank %d: choosing halo exchange kernel\n", rank);

    // do int2 vector loads if channel count permits
    if (contiguous && (NN * NH * NW * NC * sizeof(scalar_t)) % sizeof(int2) == 0) {
      // can do contiguous int2 transfers
      if (diagnostics) {
      }
      toh_stride_N = toh_stride_H = toh_stride_W = toh_stride_C = 1;
      tih_stride_N = tih_stride_H = tih_stride_W = tih_stride_C = 1;
      boh_stride_N = boh_stride_H = boh_stride_W = boh_stride_C = 1;
      bih_stride_N = bih_stride_H = bih_stride_W = bih_stride_C = 1;
      NC = (NN * NH * NW * NC * sizeof(scalar_t)) / sizeof(int2);
      NN = NH = NW = 1;
      if (diagnostics) {
        printf("rank %d: launching contiguous int2 halo exchange kernel\n", rank);
        printf("rank %d: NC=%d, NH=%d, NW=%d\n", rank, NC, NH, NW);
      }
      void* kernel_args[] = {(int2**)&toh_p,
                             &toh_stride_H,
                             &toh_stride_W,
                             &toh_stride_C,
                             (int2**)&tih_p,
                             &tih_stride_H,
                             &tih_stride_W,
                             &tih_stride_C,
                             &tox_p,
                             &tix_p,
                             (int2**)&boh_p,
                             &boh_stride_H,
                             &boh_stride_W,
                             &boh_stride_C,
                             (int2**)&bih_p,
                             &bih_stride_H,
                             &bih_stride_W,
                             &bih_stride_C,
                             &box_p,
                             &bix_p,
                             &NH,
                             &NW,
                             &NC,
                             &top_first};
      int num_elem = NN * NH * NW * NC;
      LAUNCH_PUSH_PULL_HALO_KERNEL(int2, true, kernel_args, num_elem);
    } else if (is_nhwc && (NC * sizeof(scalar_t)) % sizeof(int2) == 0) {
      // can do strided int2 transfers
      int divisor = sizeof(int2) / sizeof(scalar_t);
      if (diagnostics) {
        printf("rank %d: launching strided int2 halo exchange kernel\n", rank);
      }
      toh_stride_N /= divisor;
      toh_stride_H /= divisor;
      toh_stride_W /= divisor;
      tih_stride_N /= divisor;
      tih_stride_H /= divisor;
      tih_stride_W /= divisor;
      boh_stride_N /= divisor;
      boh_stride_H /= divisor;
      boh_stride_W /= divisor;
      bih_stride_N /= divisor;
      bih_stride_H /= divisor;
      bih_stride_W /= divisor;
      NC /= divisor;
      if (diagnostics) {
        printf("rank %d: divisor=%d\n", rank, divisor);
        printf("rank %d: tih_stride :: N=%d, C=%d, H=%d, W=%d\n", rank, tih_stride_N, tih_stride_C, tih_stride_H,
               tih_stride_W);
        printf("rank %d: toh_stride :: N=%d, C=%d, H=%d, W=%d\n", rank, toh_stride_N, toh_stride_C, toh_stride_H,
               toh_stride_W);
        printf("rank %d: bih_stride :: N=%d, C=%d, H=%d, W=%d\n", rank, bih_stride_N, bih_stride_C, bih_stride_H,
               bih_stride_W);
        printf("rank %d: boh_stride :: N=%d, C=%d, H=%d, W=%d\n", rank, boh_stride_N, boh_stride_C, boh_stride_H,
               boh_stride_W);
        printf("rank %d: NC=%d, NH=%d, NW=%d\n", rank, NC, NH, NW);
      }
      void* kernel_args[] = {(int2**)&toh_p,
                             &toh_stride_H,
                             &toh_stride_W,
                             &toh_stride_C,
                             (int2**)&tih_p,
                             &tih_stride_H,
                             &tih_stride_W,
                             &tih_stride_C,
                             &tox_p,
                             &tix_p,
                             (int2**)&boh_p,
                             &boh_stride_H,
                             &boh_stride_W,
                             &boh_stride_C,
                             (int2**)&bih_p,
                             &bih_stride_H,
                             &bih_stride_W,
                             &bih_stride_C,
                             &box_p,
                             &bix_p,
                             &NH,
                             &NW,
                             &NC,
                             &top_first};
      int num_elem = NH * NW * NC;
      LAUNCH_PUSH_PULL_HALO_KERNEL(int2, false, kernel_args, num_elem);
    } else {
      // cannot do int2 transfers
      if (diagnostics) {
        printf("rank %d: launching non-int2 halo exchange kernel\n", rank);
      }
      int num_elem = NC * NH * NW;
      if (is_nhwc) {
        void* kernel_args[] = {&toh_p,        &toh_stride_H, &toh_stride_W, &toh_stride_C, &tih_p,        &tih_stride_H,
                               &tih_stride_W, &tih_stride_C, &tox_p,        &tix_p,        &boh_p,        &boh_stride_H,
                               &boh_stride_W, &boh_stride_C, &bih_p,        &bih_stride_H, &bih_stride_W, &bih_stride_C,
                               &box_p,        &bix_p,        &NH,           &NW,           &NC,           &top_first};
        LAUNCH_PUSH_PULL_HALO_KERNEL(scalar_t, false, kernel_args, num_elem);
      } else {
        void* kernel_args[] = {&toh_p,        &toh_stride_C, &toh_stride_H, &toh_stride_W, &tih_p,        &tih_stride_C,
                               &tih_stride_H, &tih_stride_W, &tox_p,        &tix_p,        &boh_p,        &boh_stride_C,
                               &boh_stride_H, &boh_stride_W, &bih_p,        &bih_stride_C, &bih_stride_H, &bih_stride_W,
                               &box_p,        &bix_p,        &NC,           &NH,           &NW,           &top_first};
        LAUNCH_PUSH_PULL_HALO_KERNEL(scalar_t, false, kernel_args, num_elem);
      }
    }
  });

#undef LAUNCH_PUSH_PULL_HALO_KERNEL_BASE
#undef LAUNCH_PUSH_PULL_HALO_KERNEL
}

}  // namespace peer_memory
}  // namespace contrib
}  // namespace apex


================================================
FILE: apex/contrib/csrc/peer_memory/peer_memory_cuda.cuh
================================================
/**
 * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include <torch/extension.h>
#ifndef _peer_memory_h_
#define _peer_memory_h_

namespace apex {
namespace contrib {
namespace peer_memory {
int64_t allocate_raw(int64_t size);
void free_raw(int64_t raw);
void zero(int64_t raw, int64_t size);
at::Tensor get_raw_ipc_address(int64_t raw);
std::vector<int64_t> get_raw_peers(at::Tensor ipc_addresses, int peer_rank, int64_t raw);
at::Tensor blob_view_half(int64_t raw, std::vector<int64_t> shape, bool channels_last);
at::Tensor blob_view_float(int64_t raw, std::vector<int64_t> shape, bool channels_last);
at::Tensor blob_view_int(int64_t raw, std::vector<int64_t> shape, bool channels_last);
void push_pull_halos_1d(
    bool diagnostics, bool explicit_nhwc,
    int numSM,                    // number of SMs to use
    int peer_rank,                // rank in spatial parallel group
    bool top_zero,                // if top halo should be zeroed
    at::Tensor top_out_halo,      // top output halo buffer (in local device memory, received from top neighbor)
    at::Tensor top_inp_transfer,  // top input transfer buffer (in local peer memory)
    at::Tensor top_out_transfer,  // top output transfer buffer (in top neighbor peer memory)
    at::Tensor top_inp_halo,      // top input halo buffer (in local device memory, sent to top neighbor)
    bool btm_zero,                // if btm halo should be zeroed
    at::Tensor btm_out_halo,      // btm output halo buffer (in local device memory, received from btm neighbor)
    at::Tensor btm_inp_transfer,  // btm input transfer buffer (in local peer memory)
    at::Tensor btm_out_transfer,  // btm output transfer buffer (in btm neighbor peer memory)
    at::Tensor btm_inp_halo       // btm input halo buffer (in local device memory, sent to btm neighbor)
);
}  // namespace peer_memory
}  // namespace contrib
}  // namespace apex
#endif


================================================
FILE: apex/contrib/csrc/transducer/transducer_joint.cpp
================================================
#include <ATen/Functions.h>
#include <torch/extension.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> transducer_joint_cuda_forward(torch::Tensor f, torch::Tensor g, torch::Tensor fLen,
                                                         torch::Tensor gLen, torch::Tensor batchOffset,
                                                         int64_t packedBatch, int opt, bool packOutput, bool relu,
                                                         bool dropout, float dropoutProb, int tileSize);

std::vector<torch::Tensor> transducer_joint_cuda_backward(std::vector<torch::Tensor> in, torch::Tensor fLen,
                                                          torch::Tensor gLen, torch::Tensor batchOffset, int maxFLen,
                                                          int maxGLen, bool packOutput, float scale);

std::vector<torch::Tensor> transducer_joint_forward(torch::Tensor f, torch::Tensor g, torch::Tensor fLen,
                                                    torch::Tensor gLen, torch::Tensor batchOffset, int64_t packedBatch,
                                                    int opt, bool packOutput, bool relu, bool dropout,
                                                    float dropoutProb, int tileSize) {
  CHECK_INPUT(f);
  CHECK_INPUT(g);
  CHECK_INPUT(fLen);
  CHECK_INPUT(gLen);
  if (packOutput) CHECK_INPUT(batchOffset);
  return transducer_joint_cuda_forward(f, g, fLen, gLen, batchOffset, packedBatch, opt, packOutput, relu, dropout,
                                       dropoutProb, tileSize);
}

std::vector<torch::Tensor> transducer_joint_backward(std::vector<torch::Tensor> in, torch::Tensor fLen,
                                                     torch::Tensor gLen, torch::Tensor batchOffset, int maxFLen,
                                                     int maxGLen, bool packOutput, float scale) {
  for (auto t : in) {
    CHECK_INPUT(t);
  }
  CHECK_INPUT(fLen);
  CHECK_INPUT(gLen);
  if (packOutput) CHECK_INPUT(batchOffset);
  return transducer_joint_cuda_backward(in, fLen, gLen, batchOffset, maxFLen, maxGLen, packOutput, scale);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &transducer_joint_forward, "transducer joint forward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward", &transducer_joint_backward, "transducer joint backward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/transducer/transducer_joint_kernel.cu
================================================
#include <ATen/AccumulateType.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <torch/extension.h>

#ifdef OLD_GENERATOR_PATH
#include <ATen/CUDAGeneratorImpl.h>
#else
#include <ATen/cuda/CUDAGeneratorImpl.h>
#endif

#include <ATen/cuda/CUDAContext.h>
#include <c10/macros/Macros.h>

#include <ATen/cuda/CUDAGraphsUtils.cuh>

#include "philox.cuh"

// Warp reduce kernels to reduce N groups of data into N numbers, where N = warpSize / width.
// width should be a power of 2 and should be less than warpSize.
template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduce(scalar_t x, int width = C10_WARP_SIZE) {
  for (unsigned offset = width / 2; offset > 0; offset /= 2) {
    x += __shfl_down_sync(0xffffffff, x, offset, width);
  }
  return x;
}

inline int largestPowerOfTwo(int x) {
  int y = 1;
  while (y <= x) y <<= 1;
  return y >> 1;
}

/*
Figure out vectorization type for masks.
Similar to how PyTorch figures out acc_t here:
aten/src/ATen/AccumulateType.h
*/
template <int V>
struct MaskVecType {};

template <>
struct MaskVecType<1> {
  using type = uint8_t;
};
template <>
struct MaskVecType<2> {
  using type = uint16_t;
};
template <>
struct MaskVecType<4> {
  using type = uint32_t;
};

template <int V>
using mvec_type = typename MaskVecType<V>::type;

// Helper class to calculate pointer offset that can be shared by different flavors of kernels.
// For fwd, batch offset and stride are different for packing and non-packing mode.
struct OffsetCalFwd {
  __device__ __forceinline__ OffsetCalFwd(int64_t batch, const int64_t* batchOffset, int64_t maxFLen, int64_t maxGLen,
                                          int64_t gLen, int64_t hiddenSize, bool packOutput)
      : batch(batch),
        batchOffset(batchOffset),
        maxFLen(maxFLen),
        maxGLen(maxGLen),
        gLen(gLen),
        hiddenSize(hiddenSize),
        packOutput(packOutput) {}

  int64_t batch;
  const int64_t* batchOffset;
  int64_t maxFLen;
  int64_t maxGLen;
  int64_t gLen;
  int64_t hiddenSize;
  bool packOutput;

  __device__ __forceinline__ int64_t getBatchOffset() {
    return packOutput ? ((batch == 0) ? 0 : batchOffset[batch - 1]) * hiddenSize
                      : batch * maxFLen * maxGLen * hiddenSize;
  }

  __device__ __forceinline__ int64_t getStrideF() { return packOutput ? gLen * hiddenSize : maxGLen * hiddenSize; }
};

// Helper class to calculate pointer offset that can be shared by different flavors of kernels
// For bwd, batch offset and stride are different for packing and non-packing mode.
// The reducion is done for two input tensors. Therefore, generating two sets of offsets
// according to bwdFasterDim can lead to a unified implementation in the actual kernel.
struct OffsetCalBwd {
  __device__ __forceinline__ OffsetCalBwd(int64_t batch, const int64_t* batchOffset, const int* fLen, const int* gLen,
                                          int64_t maxFLen, int64_t maxGLen, int64_t hiddenSize, bool packOutput,
                                          bool bwdFasterDim)
      : batch(batch),
        batchOffset(batchOffset),
        maxFLen(maxFLen),
        maxGLen(maxGLen),
        fLen(fLen),
        gLen(gLen),
        hiddenSize(hiddenSize),
        packOutput(packOutput),
        bwdFasterDim(bwdFasterDim) {}

  int64_t batch;
  const int64_t* batchOffset;
  const int* fLen;
  const int* gLen;
  int64_t maxFLen;
  int64_t maxGLen;
  int64_t hiddenSize;
  bool packOutput;
  bool bwdFasterDim;  // whether doing bwd on the faster moving dimension

  __device__ __forceinline__ int64_t getBatchOffset() {
    return packOutput ? ((batch == 0) ? 0 : batchOffset[batch - 1]) * hiddenSize
                      : batch * maxFLen * maxGLen * hiddenSize;
  }

  __device__ __forceinline__ int64_t getMaxXLen() { return bwdFasterDim ? maxGLen : maxFLen; }

  __device__ __forceinline__ auto getMyXLen() -> decltype(gLen[batch]) {
    return bwdFasterDim ? gLen[batch] : fLen[batch];
  }

  __device__ __forceinline__ auto getMyYLen() -> decltype(gLen[batch]) {
    return bwdFasterDim ? fLen[batch] : gLen[batch];
  }

  __device__ __forceinline__ int64_t getStrideX() {
    return bwdFasterDim ? hiddenSize : ((packOutput ? gLen[batch] : maxGLen) * hiddenSize);
  }

  __device__ __forceinline__ int64_t getStrideY() {
    return bwdFasterDim ? ((packOutput ? gLen[batch] : maxGLen) * hiddenSize) : hiddenSize;
  }
};

// Vanila transducer joint forward kernel
// Detail of this joint function can be found in:
// [1] Sequence Transduction with Recurrent Neural Networks.

// f is a tensor of shape [batch, T, H]
// g is a tensor of shape [batch, U, H]
// the transducer joint does
// sum = f.unsqueeze(dim=2) + g.unsqueeze(dim=1)
// The resultant tensor is of shape [batch, T, U, H]
// Each thread block is working on one "batch" of data in the output tensor, [batch, t, u, :]

// This joint function can optionally pack the output where the output tensor with a shape of
// [B, T, U, H] is packed into [B_packed, H].
// Don't-care region (t > fLen) or (u > gLen) is removed.
// To enable packing, the starting offset for each batch need to be specified with batchOffset.
template <typename scalar_t, class OffsetCal>
__global__ void transducer_joint_forward(const scalar_t* f, const scalar_t* g, const int* fLen, const int* gLen,
                                         const int64_t* batchOffset, int64_t maxFLen, int64_t maxGLen,
                                         int64_t hiddenSize, bool packOutput, scalar_t* sum) {
  const int batch = blockIdx.z;
  const int t = blockIdx.y;
  const int u = blockIdx.x;
  const auto myFLen = fLen[batch];
  const auto myGLen = gLen[batch];

  OffsetCal offsetCal(batch, batchOffset, maxFLen, maxGLen, myGLen, hiddenSize, packOutput);
  const auto myBatchOffset = offsetCal.getBatchOffset();
  const auto strideF = offsetCal.getStrideF();
  scalar_t const* myF = f + batch * maxFLen * hiddenSize + t * hiddenSize;
  scalar_t const* myG = g + batch * maxGLen * hiddenSize + u * hiddenSize;
  scalar_t* mySum = sum + myBatchOffset + t * strideF + u * hiddenSize;

  if (t < myFLen and u < myGLen) {
#pragma unroll
    for (int h = threadIdx.x; h < hiddenSize; h += blockDim.x) {
      if (h < hiddenSize) {
        mySum[h] = myF[h] + myG[h];
      }
    }
  } else if (packOutput == false and t < maxFLen and u < maxGLen) {
// Need to write finite data to don't-care region because we instantiate the result tensor
// with torch::empty for performance reasons. Even though it is don't-care region, the
// contents need to be finite, otherwise could lead to NaN in WGRAD.
// In packing mode, this write is no longer necessary as we remove the don't-care region
// from the output.
// Picking -1 (over 0) here for ease of testing.
#pragma unroll
    for (int h = threadIdx.x; h < hiddenSize; h += blockDim.x) {
      if (h < hiddenSize) {
        mySum[h] = -1;
      }
    }
  }
}

/*
Tiled version of the joint forward kernel
Detail of this joint function can be found in:
[1] Sequence Transduction with Recurrent Neural Networks.

f is a tensor of shape [batch, T, H]
g is a tensor of shape [batch, U, H]
the transducer joint does
sum = f.unsqueeze(dim=2) + g.unsqueeze(dim=1)
The resultant tensor is of shape [batch, T, U, H]
Each thread is working on a tile of the shape of tileF x tileG in the result tensor.
The input for the tile is first loaded in the register and is reused tileG and tileF times.

This joint function can optionally pack the output where the output tensor with a shape of
[B, T, U, H] is packed into [B_packed, H].
Don't-care region (t > fLen) or (u > gLen) is removed.
To enable packing, the starting offset for each batch need to be specified with batchOffset.

Optionally this joint function performs ReLU and/or dropout on the joint output, which is
controlled by arguments relu and dropout, respectively. philoxArgs is argument used for generating
pseudorandom number. When at least one of operations in ReLU and dropout is activated, the joint
function is a masked operation, which is controlled by the template argument masked. In this case,
masks are saved to backward.
*/
template <typename scalar_t, int tileF, int tileG, int U, class OffsetCal, bool masked>
__global__ void transducer_joint_tiled_forward(const scalar_t* f, const scalar_t* g, const int* fLen, const int* gLen,
                                               const int64_t* batchOffset, int64_t maxFLen, int64_t maxGLen,
                                               int64_t hiddenSize, int64_t hiddenPerBlock, bool packOutput, bool relu,
                                               bool dropout, float p, at::PhiloxCudaState philoxArgs, scalar_t* sum,
                                               uint8_t* mask) {
  static_assert(U == 4, "U has to be 4, as random numbers are generated in batch of 4");

  const int batch = blockIdx.z;
  const int t = blockIdx.y * tileF;
  const int hiddenBlock = (hiddenSize + hiddenPerBlock - 1) / hiddenPerBlock;
  const int u = blockIdx.x / hiddenBlock * tileG;
  const int hOffset = (blockIdx.x % hiddenBlock) * hiddenPerBlock;
  const int h = threadIdx.x;
  const auto myFLen = fLen[batch];
  const auto myGLen = gLen[batch];

  OffsetCal offsetCal(batch, batchOffset, maxFLen, maxGLen, myGLen, hiddenSize, packOutput);
  const auto myBatchOffset = offsetCal.getBatchOffset();
  const auto strideF = offsetCal.getStrideF();

  scalar_t const* myF = f + batch * maxFLen * hiddenSize + t * hiddenSize + hOffset;
  scalar_t const* myG = g + batch * maxGLen * hiddenSize + u * hiddenSize + hOffset;
  scalar_t* mySum = sum + myBatchOffset + t * strideF + u * hiddenSize + hOffset;
  uint8_t* myMask = mask + myBatchOffset + t * strideF + u * hiddenSize + hOffset;

  // The following code is only needed for dropout. We try to bypass them as much as possible.
  auto seeds = masked ? at::cuda::philox::unpack(philoxArgs)
                      : std::make_tuple(static_cast<uint64_t>(0), static_cast<uint64_t>(0));
  uint64_t tid =
      masked ? (static_cast<uint64_t>(blockIdx.z) * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x) *
                       blockDim.x +
                   threadIdx.x
             : 0;
  Philox ph(std::get<0>(seeds), tid, std::get<1>(seeds));
  scalar_t scale = masked ? ((p == 0) ? 0 : 1 / p) : 0;
  bool dropoutMask[U];

  if (t < myFLen and u < myGLen and hOffset + h < hiddenSize) {
    // register buffers for tiled input reuse
    scalar_t fBuffer[tileF], gBuffer[tileG];
    for (int i = 0; i < tileF; ++i) {
      if (t + i < myFLen) fBuffer[i] = myF[i * hiddenSize + h];
    }
    for (int j = 0; j < tileG; ++j) {
      if (u + j < myGLen) gBuffer[j] = myG[j * hiddenSize + h];
    }
#pragma unroll
    for (int i = 0; i < tileF; ++i) {
      if (t + i < myFLen) {
#pragma unroll
        for (int j = 0; j < tileG; ++j) {
          int idx = i * tileG + j;
          if (masked and dropout and idx % U == 0) {
            // For performance, generate 4 random numbers in one shot
            // auto rand4 = curand_uniform4(&state);
            auto rand4 = uniform4(ph());
            dropoutMask[0] = rand4.x < p;
            dropoutMask[1] = rand4.y < p;
            dropoutMask[2] = rand4.z < p;
            dropoutMask[3] = rand4.w < p;
          }

          if (u + j < myGLen) {
            scalar_t out = fBuffer[i] + gBuffer[j];
            if (masked) {
              // Apply ReLU here when relu is True
              bool localMask = relu ? (out > 0) : 1;
              localMask = dropout ? localMask & dropoutMask[idx % U] : localMask;
              out = dropout ? out * localMask * scale : out * localMask;
              myMask[i * strideF + j * hiddenSize + h] = static_cast<uint8_t>(localMask);
            }
            mySum[i * strideF + j * hiddenSize + h] = out;
          } else if (packOutput == false and u + j < maxGLen)
            mySum[i * strideF + j * hiddenSize + h] = -1;
        }
      } else if (packOutput == false and t + i < maxFLen) {
// Again need to write finite data to don't-care region
#pragma unroll
        for (int j = 0; j < tileG; ++j) {
          if (u + j < maxGLen) mySum[i * strideF + j * hiddenSize + h] = -1;
        }
      }
    }
  } else if (packOutput == false and t < maxFLen and u < maxGLen and hOffset + h < hiddenSize) {
// Only need to ensure the finity in normal mode
#pragma unroll
    for (int i = 0; i < tileF; ++i) {
      if (t + i < maxFLen) {
#pragma unroll
        for (int j = 0; j < tileG; ++j) {
          if (u + j < maxGLen) mySum[i * strideF + j * hiddenSize + h] = -1;
        }
      }
    }
  }
}

/*
Bwd operation (reduction) on one input tensor. Since the operation performed for the two input
tensors are exactly the same, only one kernel is needed, and the different indexing offsets
and strides are handled by OffsetCalBwd.

When packing is enabled in the fwd op, unpacking is needed to restore the gradients in a
non-packed form.

When ReLU and/or dropout are performed in the fwd pass, this operation becomes a masked operation,
and mask contains the mask information.
*/
template <typename scalar_t, typename acc_t, class OffsetCal, bool masked>
__device__ void transducer_joint_single_backward(const scalar_t* grad, const uint8_t* mask, const int* fLen,
                                                 const int* gLen, const int64_t* batchOffset, int64_t maxFLen,
                                                 int64_t maxGLen, int64_t hiddenSize, bool packOutput,
                                                 bool bwdFasterDim,  // whether bwd on the faster moving dimension (u)
                                                 float scale, scalar_t* inGrad, int yBlockOffset = 0) {
  const int batch = blockIdx.z;
  // For the second input tensor, this offset need to be subtracted because the first yBlockOffset
  // sets of thread blocks are for the first input tensor.
  const int x = blockIdx.y - yBlockOffset;
  const int hOffset = blockIdx.x * C10_WARP_SIZE;
  const int wid = threadIdx.y;
  const int lid = threadIdx.x;
  const int numWarp = blockDim.y;
  extern __shared__ char smem8[];
  auto smem = reinterpret_cast<acc_t*>(smem8);

  OffsetCal offsetCal(batch, batchOffset, fLen, gLen, maxFLen, maxGLen, hiddenSize, packOutput, bwdFasterDim);
  const auto maxXLen = offsetCal.getMaxXLen();
  const auto myXLen = offsetCal.getMyXLen();
  const auto myYLen = offsetCal.getMyYLen();
  scalar_t* myInGrad = inGrad + batch * maxXLen * hiddenSize + x * hiddenSize + hOffset;

  if (x < myXLen) {
    const auto myBatchOffset = offsetCal.getBatchOffset();
    const auto strideX = offsetCal.getStrideX();
    const auto strideY = offsetCal.getStrideY();
    const scalar_t* myGrad = grad + myBatchOffset + x * strideX + hOffset;
    const uint8_t* myMask = masked ? mask + myBatchOffset + x * strideX + hOffset : nullptr;

    // Each warp reduces numYPerWarp "y" first
    acc_t warpSum = 0;
    auto numYPerWarp = (myYLen + numWarp - 1) / numWarp;
#pragma unroll
    for (int warpY = 0; warpY < numYPerWarp; ++warpY) {
      auto y = wid * numYPerWarp + warpY;
      if (y < myYLen and (hOffset + lid) < hiddenSize)
        if (masked)
          warpSum += static_cast<acc_t>(myGrad[y * strideY + lid]) * myMask[y * strideY + lid] * scale;
        else
          warpSum += myGrad[y * strideY + lid];
    }

    // transpose partial sum in SMEM and reduce further using warpReduce
    smem[lid * numWarp + wid] = warpSum;
    __syncthreads();
    auto sum = smem[wid * C10_WARP_SIZE + lid];
    sum = warpReduce(sum, numWarp);

    // a a b b c c d d
    // a a b b c c d d
    // a a b b c c d d
    // a a b b c c d d
    // example of 4 warps (a, b, c, d) with 8 threads per warp
    // Each warp need 8 / 4 = 2 threads to write the results.
    if (hOffset + wid * C10_WARP_SIZE / numWarp + lid / numWarp < hiddenSize) {
      if (lid % numWarp == 0) {
        myInGrad[wid * C10_WARP_SIZE / numWarp + lid / numWarp] = sum;
      }
    }
  } else if (wid == 0 and hOffset + lid < hiddenSize) {
    // Need to ensure the grad is zero for don't care region
    myInGrad[lid] = 0;
  }
}

/*
Actual bwd (reduction) kernel get launched.
Call transducer_joint_single_backward twice on two input tensors.
The two bwd ops are launched together, the first op uses blockIdx.y < maxFLen, and the second op
uses the rest.
When ReLU and/or dropout are performed in the fwd pass, this operation becomes a masked operation,
and mask contains the mask information.
*/
template <typename scalar_t, typename acc_t, class OffsetCal, bool masked>
__global__ void transducer_joint_combined_backward(const scalar_t* grad, const uint8_t* mask, const int* fLen,
                                                   const int* gLen, const int64_t* batchOffset, int64_t maxFLen,
                                                   int64_t maxGLen, int64_t hiddenSize, bool packOutput, float scale,
                                                   scalar_t* fGrad, scalar_t* gGrad) {
  if (blockIdx.y < maxFLen) {
    transducer_joint_single_backward<scalar_t, acc_t, OffsetCal, masked>(
        grad, mask, fLen, gLen, batchOffset, maxFLen, maxGLen, hiddenSize, packOutput, false, scale, fGrad);
  } else {
    transducer_joint_single_backward<scalar_t, acc_t, OffsetCal, masked>(
        grad, mask, fLen, gLen, batchOffset, maxFLen, maxGLen, hiddenSize, packOutput, true, scale, gGrad, maxFLen);
  }
}

/*
Vectorized version of transducer_joint_single_backward
Doing exact same operation as transducer_joint_single_backward except the load and store are
vectorized.
When packing is enabled in the fwd op, unpacking is needed to restore the gradients in a
non-packed form.
When ReLU and/or dropout are performed in the fwd pass, this operation becomes a masked operation,
and mask contains the mask information.
*/
template <typename scalar_t, typename acc_t, typename vec_t, int V, class OffsetCal, bool masked>
__device__ void transducer_joint_single_vec_backward(const scalar_t* grad, const uint8_t* mask, const int* fLen,
                                                     const int* gLen, const int64_t* batchOffset, int64_t maxFLen,
                                                     int64_t maxGLen, int64_t hiddenSize, bool packOutput,
                                                     bool bwdFasterDim, float scale, scalar_t* inGrad,
                                                     int yBlockOffset = 0) {
  const int batch = blockIdx.z;
  const int x = blockIdx.y - yBlockOffset;
  const int hOffset = blockIdx.x * C10_WARP_SIZE * V;
  const int wid = threadIdx.y;
  const int lid = threadIdx.x;
  const int numWarp = blockDim.y;

  // Figure out the vectorization type for mask
  using mvec_t = mvec_type<V>;

  OffsetCal offsetCal(batch, batchOffset, fLen, gLen, maxFLen, maxGLen, hiddenSize, packOutput, bwdFasterDim);
  const auto maxXLen = offsetCal.getMaxXLen();
  const auto myXLen = offsetCal.getMyXLen();
  const auto myYLen = offsetCal.getMyYLen();
  scalar_t* myInGrad = inGrad + batch * maxXLen * hiddenSize + x * hiddenSize + hOffset;
  extern __shared__ char smem8[];
  auto smem = reinterpret_cast<acc_t*>(smem8);

  acc_t warpSum[V];
  scalar_t inBuffer[V];
  uint8_t maskBuffer[V];
  scalar_t outBuffer[V];
  auto myInGradVec = reinterpret_cast<vec_t*>(myInGrad);
  auto outBufferVec = reinterpret_cast<vec_t*>(outBuffer);

  if (x < myXLen) {
    const auto myBatchOffset = offsetCal.getBatchOffset();
    const auto strideX = offsetCal.getStrideX();
    const auto strideY = offsetCal.getStrideY();
    const scalar_t* myGrad = grad + myBatchOffset + x * strideX + hOffset;
    const uint8_t* myMask = masked ? mask + myBatchOffset + x * strideX + hOffset : nullptr;

    for (int i = 0; i < V; ++i) warpSum[i] = 0;

    // Each warp reduces numYPerWarp "y" first
    auto numYPerWarp = (myYLen + numWarp - 1) / numWarp;
    for (int warpY = 0; warpY < numYPerWarp; ++warpY) {
      auto y = wid * numYPerWarp + warpY;
      auto myGradVec = reinterpret_cast<vec_t const*>(myGrad + y * strideY);
      auto myMaskVec = masked ? reinterpret_cast<mvec_t const*>(myMask + y * strideY) : nullptr;
      auto inBufferVec = reinterpret_cast<vec_t*>(inBuffer);
      auto maskBufferVec = reinterpret_cast<mvec_t*>(maskBuffer);
      if (hOffset + lid * V < hiddenSize and y < myYLen) {
        *inBufferVec = myGradVec[lid];  // vectorized load
        if (masked) {
          *maskBufferVec = myMaskVec[lid];
#pragma unroll
          for (int i = 0; i < V; ++i) warpSum[i] += static_cast<acc_t>(inBuffer[i]) * maskBuffer[i] * scale;
        } else {
#pragma unroll
          for (int i = 0; i < V; ++i) warpSum[i] += inBuffer[i];
        }
      }
    }

    // transpose partial sum in SMEM and reduce further using warpReduce
    for (int i = 0; i < V; ++i) {
      smem[lid * numWarp + wid] = warpSum[i];
      __syncthreads();
      auto sum = smem[wid * C10_WARP_SIZE + lid];

      if (hOffset + (wid * C10_WARP_SIZE / numWarp) * V < hiddenSize) {
        sum = warpReduce(sum, numWarp);
        if (lid % numWarp == 0) {
          outBuffer[i] = sum;
        }
      }
      __syncthreads();
    }

    // a a b b c c d d
    // a a b b c c d d
    // a a b b c c d d
    // a a b b c c d d
    // example of 4 warps (a, b, c, d) with 8 threads per warp
    // Each warp need 8 / 4 = 2 threads to write the results.
    if (lid % numWarp == 0 and hOffset + (wid * C10_WARP_SIZE / numWarp + lid / numWarp) * V < hiddenSize)
      myInGradVec[wid * C10_WARP_SIZE / numWarp + lid / numWarp] = *outBufferVec;
  } else if (wid == 0 and hOffset + lid * V < hiddenSize) {
    // Need to ensure the grad is zero for don't care region
    myInGradVec[lid] = 0;
  }
}

/*
Vecotrized version of transducer_joint_combined_backward
Call transducer_joint_single_vec_backward twice on two input tensors.
The two bwd ops are launched together, the first op uses blockIdx.y < maxFLen, and the second op
uses the rest.
When ReLU and/or dropout are performed in the fwd pass, this operation becomes a masked operation,
and mask contains the mask information.
*/
template <typename scalar_t, typename acc_t, typename vec_t, int V, class OffsetCal, bool masked>
__global__ void transducer_joint_combined_vec_backward(const scalar_t* grad, const uint8_t* mask, const int* fLen,
                                                       const int* gLen, const int64_t* batchOffset, int64_t maxFLen,
                                                       int64_t maxGLen, int64_t hiddenSize, bool packOutput,
                                                       float scale, scalar_t* fGrad, scalar_t* gGrad) {
  if (blockIdx.y < maxFLen) {
    transducer_joint_single_vec_backward<scalar_t, acc_t, vec_t, V, OffsetCal, masked>(
        grad, mask, fLen, gLen, batchOffset, maxFLen, maxGLen, hiddenSize, packOutput, false, scale, fGrad);
  } else {
    transducer_joint_single_vec_backward<scalar_t, acc_t, vec_t, V, OffsetCal, masked>(
        grad, mask, fLen, gLen, batchOffset, maxFLen, maxGLen, hiddenSize, packOutput, true, scale, gGrad, maxFLen);
  }
}

std::vector<torch::Tensor> transducer_joint_cuda_forward(torch::Tensor f, torch::Tensor g, torch::Tensor fLen,
                                                         torch::Tensor gLen, torch::Tensor batchOffset,
                                                         int64_t packedBatch, int opt, bool packOutput, bool relu,
                                                         bool dropout, float dropoutProb, int tileSize) {
  auto tensorOpt = f.options();
  auto dtype = f.scalar_type();
  const auto batchSize = f.size(0);
  const auto maxFLen = f.size(1);
  const auto maxGLen = g.size(1);
  const auto hiddenSize = f.size(2);
  bool masked = dropout or relu;

  int64_t* batchOffsetPtr = nullptr;
  torch::Tensor sum, mask;
  auto maskOpt = tensorOpt.dtype(torch::kUInt8);
  if (!packOutput) {
    sum = torch::empty({batchSize, maxFLen, maxGLen, hiddenSize}, tensorOpt);
    batchOffsetPtr = nullptr;
    if (masked) mask = torch::empty({batchSize, maxFLen, maxGLen, hiddenSize}, maskOpt);
  } else {
    sum = torch::empty({packedBatch, hiddenSize}, tensorOpt);
    batchOffsetPtr = batchOffset.data_ptr<int64_t>();
    if (masked) mask = torch::empty({packedBatch, hiddenSize}, maskOpt);
  }
  uint8_t* maskPtr = masked ? mask.data_ptr<uint8_t>() : nullptr;

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  TORCH_CHECK(opt == 0 or opt == 1, "Got an invalid optimization level ", opt);
  // Simple heuristics
  const int numThread =
      std::min(128, (static_cast<int>(hiddenSize) + C10_WARP_SIZE - 1) / C10_WARP_SIZE * C10_WARP_SIZE);

  if (opt == 0) {
    // vanilla kernel
    const int threads = numThread;
    const dim3 blocks(maxGLen, maxFLen, batchSize);

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        dtype, "transducer_joint_forward", ([&] {
          transducer_joint_forward<scalar_t, OffsetCalFwd><<<blocks, threads, 0, stream>>>(
              f.data_ptr<scalar_t>(), g.data_ptr<scalar_t>(), fLen.data_ptr<int>(), gLen.data_ptr<int>(),
              batchOffsetPtr, maxFLen, maxGLen, hiddenSize, packOutput, sum.data_ptr<scalar_t>());
        }));
  }
  if (opt == 1) {
    // tiled version. For simplicity, assume tileF == tileG, even though the kernel can
    // support more general cases.
    const int threads = numThread;
    const int hiddenPerBlock = numThread;
    const int hiddenBlock = (hiddenSize + hiddenPerBlock - 1) / hiddenPerBlock;
    const dim3 blocks((maxGLen + tileSize - 1) / tileSize * hiddenBlock, (maxFLen + tileSize - 1) / tileSize,
                      batchSize);

    TORCH_CHECK(tileSize == 1 or tileSize == 2 or tileSize == 4, "Expected tileSize to be in [1, 2, 4], but got ",
                tileSize);

    at::PhiloxCudaState rng_engine_inputs;
    if (masked) {
      // set up PRG when the input is masked. rng_engine_inputs will be used as a space filler
      // for non-masked calls.
      // Therefore no need to initialize.
      c10::optional<at::Generator> gen_;
      auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(gen_, at::cuda::detail::getDefaultCUDAGenerator());
      // counterOffset records how many cuRAND calls each thread makes. For a tiled kernel,
      // each thread processes tileF * tileG output elements.
      int64_t counterOffset = tileSize * tileSize;
      {
        std::lock_guard<std::mutex> lock(gen->mutex_);
        rng_engine_inputs = gen->philox_cuda_state(counterOffset);
      }
    }

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        dtype, "transducer_joint_forward", ([&] {
          void (*kernel)(const scalar_t*, const scalar_t*, const int*, const int*, const int64_t*, int64_t, int64_t,
                         int64_t, int64_t, bool, bool, bool, float, at::PhiloxCudaState, scalar_t*, uint8_t*);
          if (masked) {
            switch (tileSize) {
              case 2:
                kernel = &transducer_joint_tiled_forward<scalar_t, 2, 2, 4, OffsetCalFwd, true>;
                break;
              case 4:
                kernel = &transducer_joint_tiled_forward<scalar_t, 4, 4, 4, OffsetCalFwd, true>;
                break;
            }
          } else {
            switch (tileSize) {
              case 1:
                kernel = &transducer_joint_tiled_forward<scalar_t, 1, 1, 4, OffsetCalFwd, false>;
                break;
              case 2:
                kernel = &transducer_joint_tiled_forward<scalar_t, 2, 2, 4, OffsetCalFwd, false>;
                break;
              case 4:
                kernel = &transducer_joint_tiled_forward<scalar_t, 4, 4, 4, OffsetCalFwd, false>;
                break;
            }
          }

          kernel<<<blocks, threads, 0, stream>>>(f.data_ptr<scalar_t>(), g.data_ptr<scalar_t>(), fLen.data_ptr<int>(),
                                                 gLen.data_ptr<int>(), batchOffsetPtr, maxFLen, maxGLen, hiddenSize,
                                                 hiddenPerBlock, packOutput, relu, dropout, 1.0f - dropoutProb,
                                                 rng_engine_inputs, sum.data_ptr<scalar_t>(), maskPtr);
        }));
  }

  C10_CUDA_CHECK(cudaGetLastError());
  if (masked)
    return {sum, mask};
  else
    return {sum};
}

std::vector<torch::Tensor> transducer_joint_cuda_backward(std::vector<torch::Tensor> in, torch::Tensor fLen,
                                                          torch::Tensor gLen, torch::Tensor batchOffset, int maxFLen,
                                                          int maxGLen, bool packOutput, float scale) {
  auto grad = in[0];
  bool masked = (in.size() == 2);
  uint8_t* maskPtr = masked ? in[1].data_ptr<uint8_t>() : nullptr;

  auto tensorOpt = grad.options();
  auto dtype = grad.scalar_type();
  const int batchSize = fLen.size(0);
  const int hiddenSize = grad.size(-1);

  const auto deviceProperties = at::cuda::getCurrentDeviceProperties();
  const int maxNumWarp = deviceProperties->maxThreadsPerBlock / C10_WARP_SIZE;

  torch::Tensor fGrad = torch::empty({batchSize, maxFLen, hiddenSize}, tensorOpt);
  torch::Tensor gGrad = torch::empty({batchSize, maxGLen, hiddenSize}, tensorOpt);

  int64_t* batchOffsetPtr = (!packOutput) ? nullptr : batchOffset.data_ptr<int64_t>();

  // The number "y" I would like each thread to work on
  const int workPerThread = 32;
  // Since the bwd for f and g have the same thread block size, we need to use the max of the two.
  int numWarp = largestPowerOfTwo((std::max(maxFLen, maxGLen) + workPerThread - 1) / workPerThread);
  // Would like to have at least 2 warps
  numWarp = std::max(2, numWarp);
  // cap on the maximum number of warps allowed
  numWarp = std::min(maxNumWarp, numWarp);

  // Need smem for transposing the partial sum. The partial sum is in a matrix of the shape
  // numWarp x warpSize
  const int smemSize = numWarp * C10_WARP_SIZE;
  const dim3 threads(C10_WARP_SIZE, numWarp, 1);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      dtype, "transducer_joint_cuda_backward_kernel", ([&] {
        auto gradPtr = grad.data_ptr<scalar_t>();
        auto fLenPtr = fLen.data_ptr<int>();
        auto gLenPtr = gLen.data_ptr<int>();
        auto fGradPtr = fGrad.data_ptr<scalar_t>();
        auto gGradPtr = gGrad.data_ptr<scalar_t>();

        // resolve the acc_t type
        using acc_t = at::acc_type<scalar_t, true>;
        using vec_t = uint64_t;

        constexpr int vectFactor = sizeof(vec_t) / sizeof(scalar_t);
        constexpr int vecAlignment = std::alignment_of<vec_t>::value;

        // if all input and output tensors meet the alignment requirement
        bool memAlign = (reinterpret_cast<uint64_t>(gradPtr) % vecAlignment == 0) and
                        (reinterpret_cast<uint64_t>(fGradPtr) % vecAlignment == 0) and
                        (reinterpret_cast<uint64_t>(gGradPtr) % vecAlignment == 0);

        if (vectFactor > 1 and hiddenSize % vectFactor == 0 and memAlign) {
          // If vectorization helps and the alignment requirement is met, use the vectorized
          // kernel. For simplicity, hiddenSize needs to be a multiple vecFactor.
          const dim3 blocks((hiddenSize + C10_WARP_SIZE * vectFactor - 1) / (C10_WARP_SIZE * vectFactor),
                            maxFLen + maxGLen, batchSize);
          if (masked) {
            transducer_joint_combined_vec_backward<scalar_t, acc_t, vec_t, vectFactor, OffsetCalBwd, true>
                <<<blocks, threads, smemSize * sizeof(acc_t)>>>(gradPtr, maskPtr, fLenPtr, gLenPtr, batchOffsetPtr,
                                                                maxFLen, maxGLen, hiddenSize, packOutput, scale,
                                                                fGradPtr, gGradPtr);
          } else {
            transducer_joint_combined_vec_backward<scalar_t, acc_t, vec_t, vectFactor, OffsetCalBwd, false>
                <<<blocks, threads, smemSize * sizeof(acc_t)>>>(gradPtr, maskPtr, fLenPtr, gLenPtr, batchOffsetPtr,
                                                                maxFLen, maxGLen, hiddenSize, packOutput, scale,
                                                                fGradPtr, gGradPtr);
          }
        } else {
          const dim3 blocks((hiddenSize + C10_WARP_SIZE - 1) / C10_WARP_SIZE, maxFLen + maxGLen, batchSize);
          if (masked) {
            transducer_joint_combined_backward<scalar_t, acc_t, OffsetCalBwd, true>
                <<<blocks, threads, smemSize * sizeof(acc_t)>>>(gradPtr, maskPtr, fLenPtr, gLenPtr, batchOffsetPtr,
                                                                maxFLen, maxGLen, hiddenSize, packOutput, scale,
                                                                fGradPtr, gGradPtr);
          } else {
            transducer_joint_combined_backward<scalar_t, acc_t, OffsetCalBwd, false>
                <<<blocks, threads, smemSize * sizeof(acc_t)>>>(gradPtr, maskPtr, fLenPtr, gLenPtr, batchOffsetPtr,
                                                                maxFLen, maxGLen, hiddenSize, packOutput, scale,
                                                                fGradPtr, gGradPtr);
          }
        }
      }));

  return {fGrad, gGrad};
}


================================================
FILE: apex/contrib/csrc/transducer/transducer_loss.cpp
================================================
#include <torch/extension.h>

#include <vector>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> transducer_loss_cuda_forward(torch::Tensor x, torch::Tensor label, torch::Tensor audLen,
                                                        torch::Tensor txtLen, torch::Tensor batchOffset, int maxFLen,
                                                        int blankIdx, int opt, bool packedInput);

torch::Tensor transducer_loss_cuda_backward(torch::Tensor x, torch::Tensor lossGrad, torch::Tensor alpha,
                                            torch::Tensor beta, torch::Tensor audLen, torch::Tensor txtLen,
                                            torch::Tensor label, torch::Tensor batchOffset, int maxFLen, int blankIdx,
                                            int opt, bool fuseSoftmaxBackward, bool packedInput);

std::vector<torch::Tensor> transducer_loss_forward(torch::Tensor x, torch::Tensor label, torch::Tensor fLen,
                                                   torch::Tensor yLen, torch::Tensor batchOffset, int maxFLen,
                                                   int blankIdx, int opt, bool packedInput) {
  CHECK_INPUT(x);
  CHECK_INPUT(label);
  CHECK_INPUT(fLen);
  CHECK_INPUT(yLen);
  if (packedInput) CHECK_INPUT(batchOffset);
  return transducer_loss_cuda_forward(x, label, fLen, yLen, batchOffset, maxFLen, blankIdx, opt, packedInput);
}

torch::Tensor transducer_loss_backward(torch::Tensor x, torch::Tensor lossGrad, torch::Tensor alpha, torch::Tensor beta,
                                       torch::Tensor fLen, torch::Tensor yLen, torch::Tensor label,
                                       torch::Tensor batchOffset, int maxFLen, int blankIdx, int opt,
                                       bool fuseSoftmaxBackward, bool packedInput) {
  CHECK_INPUT(x);
  CHECK_INPUT(label);
  CHECK_INPUT(lossGrad);
  CHECK_INPUT(alpha);
  CHECK_INPUT(beta);
  CHECK_INPUT(fLen);
  CHECK_INPUT(yLen);
  if (packedInput) CHECK_INPUT(batchOffset);

  return transducer_loss_cuda_backward(x, lossGrad, alpha, beta, fLen, yLen, label, batchOffset, maxFLen, blankIdx, opt,
                                       fuseSoftmaxBackward, packedInput);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &transducer_loss_forward, "transducer loss forward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward", &transducer_loss_backward, "transducer loss backward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/csrc/transducer/transducer_loss_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>

#include <vector>

template <typename scalar_t>
__device__ __forceinline__ scalar_t logSumExp(scalar_t a, scalar_t b) {
  // standard log-sum-exp trick is used here to provide better numerical stability
  return (a >= b) ? a + std::log1p(exp(b - a)) : b + std::log1p(exp(a - b));
}

// Vanilla transducer loss function (i.e. forward-backward algorithm)
// Detail of this loss function can be found in:
// [1] Sequence Transduction with Recurrent Neural Networks.

// Forward (alpha) and backward (beta) path are launched together. Input is assumed to be converted
// into log scale by the preceding log_softmax layer
// Diagonal wavefront advancing usually used in dynamic programming is leveraged here.
// alpha and beta are of acc_t type, as they are essentially accumulators.

// This loss function supports packed input where a tensor of shape [B, T, U, H] is packed into
// [B_packed, H].
// Don't-care region (t > audLen) or (u > txtLen) is removed.
// To support the packed input, the starting offsets for each batch need to be specified with
// batchOffset.
template <typename scalar_t, typename acc_t>
__global__ void transducer_loss_forward(const scalar_t* x, const int* label, const int* audLen, const int* txtLen,
                                        const int64_t* batchOffset,
                                        int64_t dictSize,  // 64-bit indexing for data tensor
                                        int64_t blankIdx, int64_t maxFLen, int64_t maxGLen, bool packedInput,
                                        acc_t* alpha, acc_t* beta, scalar_t* loss) {
  const int batch = blockIdx.y;
  const int tid = threadIdx.x;
  const auto myFLen = audLen[batch];
  // Note that start of the sentence is added as 1 here
  const auto myGLen = txtLen[batch] + 1;
  const auto myLabel = label + batch * (maxGLen - 1);
  const int64_t myBatchOffset = packedInput ? (batch == 0 ? 0 : batchOffset[batch - 1]) : batch * maxFLen * maxGLen;
  const int64_t myStrideT = packedInput ? myGLen : maxGLen;
  const scalar_t* myX = x + myBatchOffset * dictSize;
  int u = tid;

  if (blockIdx.x == 0) {
    // alpha path
    acc_t* myAlpha = alpha + batch * maxFLen * maxGLen;
    if (u == 0) myAlpha[0] = 0;
    __syncthreads();

    for (int64_t step = 1; step < myFLen + myGLen - 1; ++step) {
      // Move along the diagonal wavefront to leverage available parallelism
      for (u = tid; u < myGLen; u += blockDim.x) {
        int64_t t = step - u;
        if (t >= 0 and t < myFLen and u >= 0 and u < myGLen) {
          // Eq(16) in [1]
          if (u == 0) {
            // alpha(t, u) = alpha(t-1, u) * null(t-1, u)
            myAlpha[t * maxGLen + u] = myAlpha[(t - 1) * maxGLen] + myX[((t - 1) * myStrideT) * dictSize + blankIdx];
          } else if (t == 0) {
            // alpha(t, u-1) = alpha(t, u-1) * y(t, u-1)
            myAlpha[u] = myAlpha[u - 1] + myX[(u - 1) * dictSize + myLabel[u - 1]];
          } else {
            // alpha(t, u) = alpha(t-1, u) * null(t-1, u) + alpha(t, u-1) * y(t, u-1)
            acc_t current = myAlpha[(t - 1) * maxGLen + u] + myX[((t - 1) * myStrideT + u) * dictSize + blankIdx];
            acc_t next = myAlpha[t * maxGLen + u - 1] + myX[(t * myStrideT + u - 1) * dictSize + myLabel[u - 1]];
            myAlpha[t * maxGLen + u] = logSumExp(next, current);
          }
        }
      }
      __syncthreads();
    }
  } else if (blockIdx.x == 1) {
    // beta path
    acc_t* myBeta = beta + batch * maxFLen * maxGLen;
    if (u == 0) {
      myBeta[(myFLen - 1) * maxGLen + myGLen - 1] = myX[((myFLen - 1) * myStrideT + myGLen - 1) * dictSize + blankIdx];
    }
    __syncthreads();

    for (int64_t step = myFLen + myGLen - 3; step >= 0; --step) {
      for (u = tid; u < myGLen; u += blockDim.x) {
        int64_t t = step - u;
        if (t >= 0 and t < myFLen and u >= 0 and u < myGLen) {
          // Eq(18) in [1]
          if (u == myGLen - 1) {
            // beta(t, u) = beta(t+1, u) * null(t, u)
            myBeta[t * maxGLen + u] = myBeta[(t + 1) * maxGLen + u] + myX[(t * myStrideT + u) * dictSize + blankIdx];
          } else if (t == myFLen - 1) {
            // beta(t, u) = beta(t, u+1) * y(t, u)
            myBeta[t * maxGLen + u] = myBeta[t * maxGLen + u + 1] + myX[(t * myStrideT + u) * dictSize + myLabel[u]];
          } else {
            // beta(t, u) = beta(t+1, u)*null(t, u) + beta(t, u+1)*y(t, u)
            acc_t current = myBeta[(t + 1) * maxGLen + u] + myX[(t * myStrideT + u) * dictSize + blankIdx];
            acc_t next = myBeta[t * maxGLen + u + 1] + myX[(t * myStrideT + u) * dictSize + myLabel[u]];
            myBeta[t * maxGLen + u] = logSumExp(next, current);
          }
        }
      }
      __syncthreads();
    }
    if (tid == 0) loss[batch] = -myBeta[0];
  }
}

// transudcer loss function (i.e. forward-backward algorithm) with batch loading optimization.
// Compared to the vanilla version, there are two optimizations:
// 1. load x in batch through loop unrolling to reduce the latency.
// 2. Use registers and shared memory to hold alpha and beta values passed from one step the next.
// For simplicity, this kernel currently only supports U <= maxThread, which should be the common
// case. For cases where U > maxThread, the vanilla kernel is used as a fallback option.

// Detail of this loss function can be found in:
// [1] Sequence Transduction with Recurrent Neural Networks.
// Forward (alpha) and backward (beta) path are launched together. Input is assumed to be converted
// into log scale by the preceding log_softmax layer
// Diagonal wavefront advancing usually used in dynamic programming is leveraged here.
// alpha and beta are of acc_t type, as they are essentially accumulators.

// This loss function supports packed input where a tensor of shape [B, T, U, H] is packed into
// [B_packed, H].
// Don't-care region (t > audLen) or (u > txtLen) is removed.
// To support the packed input, the starting offsets for each batch need to be specified with
// batchOffset.
template <typename scalar_t, typename acc_t, int batchLdSize>
__global__ void transducer_loss_batch_load_forward(const scalar_t* x, const int* label, const int* audLen,
                                                   const int* txtLen, const int64_t* batchOffset, int64_t dictSize,
                                                   int64_t blankIdx, int64_t maxFLen, int64_t maxGLen, bool packedInput,
                                                   acc_t* alpha, acc_t* beta, scalar_t* loss) {
  const int batch = blockIdx.y;
  int u = threadIdx.x;
  const auto myFLen = audLen[batch];
  const auto myGLen = txtLen[batch] + 1;
  const int64_t myBatchOffset = packedInput ? (batch == 0 ? 0 : batchOffset[batch - 1]) : batch * maxFLen * maxGLen;
  const int64_t myStrideT = packedInput ? myGLen : maxGLen;
  const scalar_t* myX = x + myBatchOffset * dictSize;
  scalar_t next[batchLdSize], current[batchLdSize];
  extern __shared__ char smem8[];
  auto smem = reinterpret_cast<acc_t*>(smem8);

  if (blockIdx.x == 0) {
    // alpha path
    acc_t* myAlpha = alpha + batch * maxFLen * maxGLen;
    // two SMEM regions for double buffering read and write data to avoid data race
    acc_t* const sharedAlpha[2] = {smem, smem + maxGLen};

    sharedAlpha[0][u] = 0;
    __syncthreads();

    if (u == 0) myAlpha[0] = 0;

    auto myAlphaLabel = (u == 0) ? 0 : label[batch * (maxGLen - 1) + u - 1];
    // register used to pass value to the next step for the same thread
    acc_t prvStepAlpha = 0;
    for (int64_t step = 1; step < myFLen + myGLen - 1 + batchLdSize; step += batchLdSize) {
// Move along the diagonal wavefront to leverage available parallelism
// Batch loading X through loop unrolling
#pragma unroll
      for (int i = 0; i < batchLdSize; ++i) {
        if (step + i < myFLen + myGLen - 1) {
          // index computing
          int64_t t = step + i - u;
          int64_t currentId = ((t - 1) * myStrideT + u) * dictSize + blankIdx;
          int64_t nextId = (t * myStrideT + u - 1) * dictSize + myAlphaLabel;
          // main loading loop
          if (t >= 0 and t < myFLen and u >= 0 and u < myGLen) {
            if (u == 0) {
              current[i] = myX[currentId];
            } else if (t == 0) {
              next[i] = myX[nextId];
            } else {
              current[i] = myX[currentId];
              next[i] = myX[nextId];
            }
          }
        }
      }
      // main computing loop
      for (int i = 0; i < batchLdSize; ++i) {
        // swap the pointer for double buffering
        auto sharedAlphaRd = sharedAlpha[(step + i - 1) % 2];
        auto sharedAlphaWr = sharedAlpha[(step + i) % 2];
        if (step + i < myFLen + myGLen - 1) {
          int64_t t = step + i - u;
          if (t >= 0 and t < myFLen and u >= 0 and u < myGLen) {
            // Eq(16) in [1]
            if (u == 0)
              prvStepAlpha = prvStepAlpha + current[i];
            else if (t == 0)
              prvStepAlpha = sharedAlphaRd[u - 1] + next[i];
            else
              prvStepAlpha = logSumExp(prvStepAlpha + current[i], sharedAlphaRd[u - 1] + next[i]);
            sharedAlphaWr[u] = prvStepAlpha;
            myAlpha[t * maxGLen + u] = prvStepAlpha;
          }
        }
        __syncthreads();
      }
    }
  } else if (blockIdx.x == 1) {
    // beta path
    acc_t* myBeta = beta + batch * maxFLen * maxGLen;
    // two SMEM regions for double buffering read and write data to avoid data race
    acc_t* const sharedBeta[2] = {smem, smem + maxGLen};
    sharedBeta[0][u] = myX[((myFLen - 1) * myStrideT + myGLen - 1) * dictSize + blankIdx];
    __syncthreads();

    auto myBetaLabel = (u == maxGLen - 1) ? 0 : label[batch * (maxGLen - 1) + u];
    // register used to pass value to the next step for the same thread
    acc_t prvStepBeta = myX[((myFLen - 1) * myStrideT + myGLen - 1) * dictSize + blankIdx];
    if (u == 0) myBeta[(myFLen - 1) * maxGLen + myGLen - 1] = prvStepBeta;

    for (int64_t step = 1; step < myFLen + myGLen - 1; step += batchLdSize) {
// Move along the diagonal wavefront to leverage available parallelism
// Batch loading X
#pragma unroll
      for (int i = 0; i < batchLdSize; ++i) {
        if (step + i < myFLen + myGLen - 1) {
          // index computing
          int64_t t = myFLen + myGLen - (step + i) - 2 - u;
          int64_t currentId = (t * myStrideT + u) * dictSize + blankIdx;
          int64_t nextId = (t * myStrideT + u) * dictSize + myBetaLabel;
          // main loading loop
          if (t >= 0 and t < myFLen and u >= 0 and u < myGLen) {
            if (u == myGLen - 1) {
              current[i] = myX[currentId];
            } else if (t == myFLen - 1) {
              next[i] = myX[nextId];
            } else {
              current[i] = myX[currentId];
              next[i] = myX[nextId];
            }
          }
        }
      }
      // main computing loop
      for (int i = 0; i < batchLdSize; ++i) {
        // swap the pointer for double buffering
        auto sharedBetaRd = sharedBeta[(step + i - 1) % 2];
        auto sharedBetaWr = sharedBeta[(step + i) % 2];
        if (step + i < myFLen + myGLen - 1) {
          int64_t t = myFLen + myGLen - (step + i) - 2 - u;
          if (t >= 0 and t < myFLen and u >= 0 and u < myGLen) {
            // Eq(18) in [1]
            if (u == myGLen - 1)
              prvStepBeta = prvStepBeta + current[i];
            else if (t == myFLen - 1)
              prvStepBeta = sharedBetaRd[u + 1] + next[i];
            else
              prvStepBeta = logSumExp(prvStepBeta + current[i], sharedBetaRd[u + 1] + next[i]);
            sharedBetaWr[u] = prvStepBeta;
            myBeta[t * maxGLen + u] = prvStepBeta;
          }
        }
        __syncthreads();
      }
    }
    if (u == 0) loss[batch] = -prvStepBeta;
  }
}

// Vanilla transudcer loss backward operation.
// Detail of this loss function can be found in:
// [1] Sequence Transduction with Recurrent Neural Networks.
// For this backward kernel, bwd op for the preceding softmax is assumed to be handled elsewhere,
// hence only Eq(20) in [1] is implemented in this kernel.

// Each thread block works on [batch, t, :, :] of data. Each thread works on a specific u at a time
// Since only gradients for the correct token and null token need to be updated, gradients at other
// locations are initialized to 0.

// To support the packed input, the starting offsets for each batch need to be specified with
// batchOffset.
template <typename scalar_t, typename acc_t>
__global__ void transducer_loss_backward(const scalar_t* x, const scalar_t* lossGrad, const int* audLen,
                                         const int* txtLen, const int* label, const acc_t* alpha, const acc_t* beta,
                                         const int64_t* batchOffset, int64_t dictSize, int64_t blankIdx,
                                         int64_t maxFLen, int64_t maxGLen, bool packedInput, scalar_t* xGrad) {
  const int tid = threadIdx.x;
  const int t = blockIdx.x;
  const int batch = blockIdx.y;
  const int64_t myFLen = audLen[batch];
  const int64_t myGLen = txtLen[batch] + 1;
  const int64_t myBatchOffset = packedInput ? (batch == 0 ? 0 : batchOffset[batch - 1]) : batch * maxFLen * maxGLen;
  const int64_t myStrideT = packedInput ? myGLen : maxGLen;
  auto myX = x + (myBatchOffset + t * myStrideT) * dictSize;
  auto myAlpha = alpha + batch * maxFLen * maxGLen;
  auto myBeta = beta + batch * maxFLen * maxGLen;
  auto myXGrad = xGrad + (myBatchOffset + t * myStrideT) * dictSize;
  auto myLabel = label + batch * (maxGLen - 1);

  int64_t u = tid;
  while (t < myFLen and u < myGLen) {
    // Do the update
    // loss = -ln(Pr(y*|x))
    acc_t grad = std::log(lossGrad[batch]) + myAlpha[t * maxGLen + u] - myBeta[0];
    if (u != myGLen - 1)
      myXGrad[u * dictSize + myLabel[u]] =
          -std::exp(grad + myBeta[t * maxGLen + u + 1] + myX[u * dictSize + myLabel[u]]);
    if (t == myFLen - 1 and u == myGLen - 1)
      myXGrad[u * dictSize + blankIdx] = -std::exp(grad + myX[u * dictSize + blankIdx]);
    else if (t != myFLen - 1)
      myXGrad[u * dictSize + blankIdx] = -std::exp(grad + myBeta[(t + 1) * maxGLen + u] + myX[u * dictSize + blankIdx]);

    u += blockDim.x;
  }
}

// Fused transudcer loss backward operation.
// Detail of this loss function can be found in:
// [1] Sequence Transduction with Recurrent Neural Networks.
// The bwd op of the preceding softmax layer is fused in this kernel.
// Each thread block works on [batch, t, u, :] of data. Each thread works on a specific h at a time

// To support the packed input, the starting offsets for each batch need to be specified with
// batchOffset.
template <typename scalar_t, typename acc_t>
__global__ void transducer_loss_fused_backward(const scalar_t* x, const scalar_t* lossGrad, const int* audLen,
                                               const int* txtLen, const int* label, const acc_t* alpha,
                                               const acc_t* beta, const int64_t* batchOffset, int64_t dictSize,
                                               int64_t blankIdx, int64_t maxFLen, int64_t maxGLen, bool packedInput,
                                               scalar_t* xGrad) {
  const int tid = threadIdx.x;
  const int u = blockIdx.x;
  const int t = blockIdx.y;
  const int batch = blockIdx.z;
  const int64_t myFLen = audLen[batch];
  const int64_t myGLen = txtLen[batch] + 1;
  const int64_t myBatchOffset = packedInput ? (batch == 0 ? 0 : batchOffset[batch - 1]) : batch * maxFLen * maxGLen;
  const int64_t myStrideT = packedInput ? myGLen : maxGLen;

  __shared__ acc_t commonFactor, myBetaTU, myBetaTUp1, myBetaTp1U, myLabelShared;
  auto myXGrad = xGrad + (myBatchOffset + t * myStrideT + u) * dictSize;

  if (t < myFLen and u < myGLen) {
    auto myX = x + (myBatchOffset + t * myStrideT + u) * dictSize;
    auto myAlpha = alpha + batch * maxFLen * maxGLen;
    auto myBeta = beta + batch * maxFLen * maxGLen;
    auto myLabel = label + batch * (maxGLen - 1);

    // load and store shared variables in SMEM
    if (tid == 0) {
      commonFactor = std::log(lossGrad[batch]) + myAlpha[t * maxGLen + u] - myBeta[0];
      myBetaTU = myBeta[t * maxGLen + u];
      myBetaTUp1 = myBeta[t * maxGLen + u + 1];
      myBetaTp1U = myBeta[(t + 1) * maxGLen + u];
      myLabelShared = myLabel[u];
    }

    __syncthreads();

    for (int64_t h = tid; h < dictSize; h += blockDim.x) {
      // Do the update
      acc_t grad = commonFactor + myX[h];  // loss = -ln(Pr(y*|x))
      acc_t myGrad = std::exp(grad + myBetaTU);
      if (u != myGLen - 1 and h == myLabelShared) {
        myGrad -= std::exp(grad + myBetaTUp1);
      } else if (h == blankIdx) {
        if (t == myFLen - 1 and u == myGLen - 1)
          myGrad -= std::exp(grad);
        else if (t != myFLen - 1)
          myGrad -= std::exp(grad + myBetaTp1U);
      }
      myXGrad[h] = myGrad;
    }
  } else if (!packedInput) {
    // In non-pack mode, need to make sure the gradients for don't-care regions are zero.
    for (int64_t h = tid; h < dictSize; h += blockDim.x) {
      myXGrad[h] = 0;
    }
  }
}

// Vectorized version of fused transudcer loss backward operation.
// Detail of this loss function can be found in:
// [1] Sequence Transduction with Recurrent Neural Networks.
// The bwd op of the preceding softmax layer is fused in this kernel.
// Each thread block works on [batch, t, u, :] of data. Each thread works on a specific h at a time

// To support the packed input, the starting offsets for each batch need to be specified with
// batchOffset.
template <typename scalar_t, typename acc_t, typename vec_t, int V>
__global__ void transducer_loss_fused_vec_backward(const scalar_t* x, const scalar_t* lossGrad, const int* audLen,
                                                   const int* txtLen, const int* label, const acc_t* alpha,
                                                   const acc_t* beta, const int64_t* batchOffset, int64_t dictSize,
                                                   int64_t blankIdx, int64_t maxFLen, int64_t maxGLen, bool packedInput,
                                                   scalar_t* xGrad) {
  const int tid = threadIdx.x;
  const int u = blockIdx.x;
  const int t = blockIdx.y;
  const int batch = blockIdx.z;
  const int64_t myFLen = audLen[batch];
  const int64_t myGLen = txtLen[batch] + 1;
  const int64_t myBatchOffset = packedInput ? (batch == 0 ? 0 : batchOffset[batch - 1]) : batch * maxFLen * maxGLen;
  const int64_t myStrideT = packedInput ? myGLen : maxGLen;

  __shared__ acc_t commonFactor, myBetaTU, myBetaTUp1, myBetaTp1U, myLabelShared;
  auto myXGrad = xGrad + (myBatchOffset + t * myStrideT + u) * dictSize;
  auto myX = x + (myBatchOffset + t * myStrideT + u) * dictSize;
  auto myAlpha = alpha + batch * maxFLen * maxGLen;
  auto myBeta = beta + batch * maxFLen * maxGLen;
  auto myLabel = label + batch * (maxGLen - 1);

  // Variabels for vectorization
  scalar_t myXBuffer[V], myXGradBuffer[V];
  auto myXVec = reinterpret_cast<vec_t const*>(myX);
  auto myXGradVec = reinterpret_cast<vec_t*>(myXGrad);
  auto myXBufferVec = reinterpret_cast<vec_t*>(myXBuffer);
  auto myXGradBufferVec = reinterpret_cast<vec_t*>(myXGradBuffer);
  if (t < myFLen and u < myGLen) {
    // load and store shared variables in SMEM
    if (tid == 0) {
      commonFactor = std::log(lossGrad[batch]) + myAlpha[t * maxGLen + u] - myBeta[0];
      myBetaTU = myBeta[t * maxGLen + u];
      if (t != myFLen - 1) myBetaTp1U = myBeta[(t + 1) * maxGLen + u];
      if (u != myGLen - 1) {
        myBetaTUp1 = myBeta[t * maxGLen + u + 1];
        myLabelShared = myLabel[u];
      }
    }

    __syncthreads();

#pragma unroll
    for (int64_t h0 = tid * V; h0 < dictSize; h0 += blockDim.x * V) {
      // Load myX in a vector form
      *myXBufferVec = myXVec[h0 / V];
// Do the update for a vector of input
#pragma unroll
      for (int i = 0; i < V; ++i) {
        auto h = h0 + i;
        acc_t grad = commonFactor + myXBuffer[i];  // loss = -ln(Pr(y*|x))
        acc_t myGrad = std::exp(grad + myBetaTU);
        if (u != myGLen - 1 and h == myLabelShared) {
          myGrad -= std::exp(grad + myBetaTUp1);
        } else if (h == blankIdx) {
          if (t == myFLen - 1 and u == myGLen - 1)
            myGrad -= std::exp(grad);
          else if (t != myFLen - 1)
            myGrad -= std::exp(grad + myBetaTp1U);
        }
        myXGradBuffer[i] = myGrad;
      }

      // Store myXGrad in a vector form
      myXGradVec[h0 / V] = *myXGradBufferVec;
    }
  } else if (!packedInput) {
    // In non-pack mode, need to make sure the gradients for don't-care regions are zero.
    for (int64_t h0 = tid * V; h0 < dictSize; h0 += blockDim.x * V) {
      myXGradVec[h0 / V] = 0;
    }
  }
}

std::vector<torch::Tensor> transducer_loss_cuda_forward(torch::Tensor x, torch::Tensor label, torch::Tensor audLen,
                                                        torch::Tensor txtLen, torch::Tensor batchOffset, int maxFLen,
                                                        int blankIdx, int opt, bool packedInput) {
  auto scalarType = x.scalar_type();
  auto tensorOpt = x.options();
  const int batchSize = label.size(0);
  const int maxGLen = label.size(1) + 1;
  const int dictSize = x.size(-1);

  TORCH_CHECK(blankIdx >= 0 and blankIdx < dictSize, "Expected blank index to be in the range of 0 to ", dictSize - 1,
              ", but got ", blankIdx);
  TORCH_CHECK(opt == -1 or opt == 0 or opt == 1, "Got an invalid optimization level ", opt);

  // The data type of alpha and beta will be resolved at dispatch time,
  // hence defined here and assigned later
  torch::Tensor alpha;
  torch::Tensor beta;
  torch::Tensor loss = torch::empty({batchSize}, tensorOpt);
  const auto deviceProperties = at::cuda::getCurrentDeviceProperties();
  const auto maxThreadPerBlock = deviceProperties->maxThreadsPerBlock;
  const auto maxSmemPerBlock = deviceProperties->sharedMemPerBlock;
  const auto batchOffsetPtr = packedInput ? batchOffset.data_ptr<int64_t>() : nullptr;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      scalarType, "transducer_loss_cuda_forward", ([&] {
        // resolve accumulation type
        using acc_t = at::acc_type<scalar_t, true>;
        auto accType = c10::CppTypeToScalarType<acc_t>::value;
        auto accTensorOpt = tensorOpt.dtype(accType);
        alpha = torch::empty({batchSize, maxFLen, maxGLen}, accTensorOpt);
        beta = torch::empty({batchSize, maxFLen, maxGLen}, accTensorOpt);

        // decide what kernel to launch based on the problem size
        // if the required SMEM size or number threads exceeds the limit, fall back to the vanilla
        // kernel.
        const auto smemSize = 2 * maxGLen * sizeof(acc_t);
        const auto optFallBack = (maxGLen > maxThreadPerBlock or smemSize > maxSmemPerBlock) ? 0
                                 : (opt == -1)                                               ? 1
                                                                                             : opt;
        const int threads = std::min(maxThreadPerBlock, maxGLen);
        const dim3 blocks(2, batchSize, 1);

        if (optFallBack == 0)
          transducer_loss_forward<<<blocks, threads, 0, stream>>>(
              x.data_ptr<scalar_t>(), label.data_ptr<int>(), audLen.data_ptr<int>(), txtLen.data_ptr<int>(),
              batchOffsetPtr, dictSize, blankIdx, maxFLen, maxGLen, packedInput, alpha.data_ptr<acc_t>(),
              beta.data_ptr<acc_t>(), loss.data_ptr<scalar_t>());
        else if (optFallBack == 1)
          transducer_loss_batch_load_forward<scalar_t, acc_t, 4><<<blocks, threads, smemSize, stream>>>(
              x.data_ptr<scalar_t>(), label.data_ptr<int>(), audLen.data_ptr<int>(), txtLen.data_ptr<int>(),
              batchOffsetPtr, dictSize, blankIdx, maxFLen, maxGLen, packedInput, alpha.data_ptr<acc_t>(),
              beta.data_ptr<acc_t>(), loss.data_ptr<scalar_t>());
      }));
  C10_CUDA_CHECK(cudaGetLastError());

  return {alpha, beta, loss};
}

torch::Tensor transducer_loss_cuda_backward(torch::Tensor x, torch::Tensor lossGrad, torch::Tensor alpha,
                                            torch::Tensor beta, torch::Tensor audLen, torch::Tensor txtLen,
                                            torch::Tensor label, torch::Tensor batchOffset, int maxFLen, int blankIdx,
                                            int opt, bool fuseSoftmaxBackward, bool packedInput) {
  auto dtype = x.scalar_type();
  torch::Tensor xGrad;
  const int batchSize = label.size(0);
  const int maxGLen = label.size(1) + 1;
  const int dictSize = x.size(-1);
  const auto deviceProperties = at::cuda::getCurrentDeviceProperties();
  const int maxThreadPerBlock = deviceProperties->maxThreadsPerBlock;
  const int warpSize = deviceProperties->warpSize;
  const auto batchOffsetPtr = packedInput ? batchOffset.data_ptr<int64_t>() : nullptr;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (fuseSoftmaxBackward) {
    // alloc empty tensors for performance, hence need to ensure zeros are writtern to
    // don't-care region in the kernel.
    xGrad = torch::empty_like(x);

    // Would like each thread to work on 4 hidden units
    const int workPerThread = 4;
    // Don't want to have more than 128 threads per thread block
    const int maxThreadPerElmt = std::min(128, maxThreadPerBlock);
    const int threads = std::min(maxThreadPerElmt, std::max(warpSize, (dictSize + workPerThread - 1) / workPerThread));
    const dim3 blocks(maxGLen, maxFLen, batchSize);

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        dtype, "transducer_loss_cuda_backward", ([&] {
          using vec_t = uint64_t;
          using acc_t = at::acc_type<scalar_t, true>;
          constexpr int vectFactor = sizeof(vec_t) / sizeof(scalar_t);
          constexpr int vecAlignment = std::alignment_of<vec_t>::value;
          // if all input and output tensors meet the alignment requirement
          bool memAlign = reinterpret_cast<uint64_t>(x.data_ptr<scalar_t>()) % vecAlignment == 0 and
                          reinterpret_cast<uint64_t>(xGrad.data_ptr<scalar_t>()) % vecAlignment == 0;

          if (vectFactor > 1 and dictSize % vectFactor == 0 and memAlign) {
            transducer_loss_fused_vec_backward<scalar_t, acc_t, vec_t, vectFactor><<<blocks, threads, 0, stream>>>(
                x.data_ptr<scalar_t>(), lossGrad.data_ptr<scalar_t>(), audLen.data_ptr<int>(), txtLen.data_ptr<int>(),
                label.data_ptr<int>(), alpha.data_ptr<acc_t>(), beta.data_ptr<acc_t>(), batchOffsetPtr, dictSize,
                blankIdx, maxFLen, maxGLen, packedInput, xGrad.data_ptr<scalar_t>());
          } else {
            transducer_loss_fused_backward<<<blocks, threads, 0, stream>>>(
                x.data_ptr<scalar_t>(), lossGrad.data_ptr<scalar_t>(), audLen.data_ptr<int>(), txtLen.data_ptr<int>(),
                label.data_ptr<int>(), alpha.data_ptr<acc_t>(), beta.data_ptr<acc_t>(), batchOffsetPtr, dictSize,
                blankIdx, maxFLen, maxGLen, packedInput, xGrad.data_ptr<scalar_t>());
          }
        }));
  } else {
    // for non-fused kernel, the gradients need to be writtern are very sparse, hence initialize
    // the tensor with all zeros.
    xGrad = torch::zeros_like(x);
    // don't launch more threads than needed.
    const int threads = std::min(maxThreadPerBlock, maxGLen);
    const dim3 blocks(maxFLen, batchSize);
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(dtype, "transducer_loss_cuda_backward", ([&] {
                                          using acc_t = at::acc_type<scalar_t, true>;
                                          transducer_loss_backward<<<blocks, threads, 0, stream>>>(
                                              x.data_ptr<scalar_t>(), lossGrad.data_ptr<scalar_t>(),
                                              audLen.data_ptr<int>(), txtLen.data_ptr<int>(), label.data_ptr<int>(),
                                              alpha.data_ptr<acc_t>(), beta.data_ptr<acc_t>(), batchOffsetPtr, dictSize,
                                              blankIdx, maxFLen, maxGLen, packedInput, xGrad.data_ptr<scalar_t>());
                                        }));
  }
  C10_CUDA_CHECK(cudaGetLastError());

  return xGrad;
}


================================================
FILE: apex/contrib/csrc/xentropy/interface.cpp
================================================
#include <torch/extension.h>

#include <string>

// CUDA forward declarations

std::vector<at::Tensor> softmax_xentropy_cuda(const at::Tensor& input, const at::Tensor& labels, const float smoothing,
                                              const bool half_to_float);

at::Tensor softmax_xentropy_backward_cuda(const at::Tensor& grad_loss, const at::Tensor& logits,
                                          const at::Tensor& max_log_sum_exp, const at::Tensor& labels,
                                          const float smoothing);

// C++ interface

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<at::Tensor> softmax_xentropy_forward(const at::Tensor& input, const at::Tensor& labels,
                                                 const float smoothing, const bool half_to_float) {
  CHECK_CUDA(input);
  CHECK_INPUT(labels);

  return softmax_xentropy_cuda(input, labels, smoothing, half_to_float);
}

at::Tensor softmax_xentropy_backward(const at::Tensor& grad_loss, const at::Tensor& logits,
                                     const at::Tensor& max_log_sum_exp, const at::Tensor& labels,
                                     const float smoothing) {
  CHECK_CUDA(grad_loss);
  CHECK_CUDA(logits);
  CHECK_INPUT(max_log_sum_exp);
  CHECK_INPUT(labels);

  return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  // ref: https://pybind11.readthedocs.io/en/stable/basics.html#exporting-variables
  py::object version = py::cast(
#ifdef XENTROPY_VER
      XENTROPY_VER
#else
      std::string{}
#endif
  );
  m.attr("__version__") = version;
}


================================================
FILE: apex/contrib/csrc/xentropy/xentropy_kernel.cu
================================================
/**
 * From PyTorch:
 *
 * Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
 * Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
 * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
 * Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
 * Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
 * Copyright (c) 2011-2013 NYU                      (Clement Farabet)
 * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
 * Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
 * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
 *
 * From Caffe2:
 *
 * Copyright (c) 2016-present, Facebook Inc. All rights reserved.
 *
 * All contributions by Facebook:
 * Copyright (c) 2016 Facebook Inc.
 *
 * All contributions by Google:
 * Copyright (c) 2015 Google Inc.
 * All rights reserved.
 *
 * All contributions by Yangqing Jia:
 * Copyright (c) 2015 Yangqing Jia
 * All rights reserved.
 *
 * All contributions from Caffe:
 * Copyright(c) 2013, 2014, 2015, the respective contributors
 * All rights reserved.
 *
 * All other contributions:
 * Copyright(c) 2015, 2016 the respective contributors
 * All rights reserved.
 *
 * Caffe2 uses a copyright model similar to Caffe: each contributor holds
 * copyright over their contributions to Caffe2. The project versioning records
 * all such contribution and copyright details. If a contributor wants to further
 * mark their specific copyright on a particular contribution, they should
 * indicate their copyright solely in the commit message of the change when it is
 * committed.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
 *    and IDIAP Research Institute nor the names of its contributors may be
 *    used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>

#include <ATen/cuda/NumericLimits.cuh>

#include "type_shim.h"

#define ALIGN_BYTES 16

using Tensor = at::Tensor;
using TensorList = at::TensorList;
using ScalarType = at::ScalarType;
using at::acc_type;

template <typename T, typename AccumT, typename OutT>
struct LogSoftMaxForwardEpilogue {
  __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
      : logsum(max_input + std::log(sum)) {}

  __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_log_sum_exp) : logsum(max_log_sum_exp) {}

  __device__ __forceinline__ OutT operator()(T input) const { return static_cast<OutT>(input - logsum); }

  const AccumT logsum;
};

template <typename T, typename AccumT, typename OutT>
struct LogSoftMaxBackwardEpilogue {
  __device__ __forceinline__ LogSoftMaxBackwardEpilogue(AccumT sum) : sum(sum) {}

  __device__ __forceinline__ T operator()(OutT gradOutput, OutT output) const {
    return static_cast<T>(gradOutput - std::exp(static_cast<AccumT>(output)) * sum);
  }

  const AccumT sum;
};

const int max_threads = 1024;

inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
  uint64_t block_size = 1;
  uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
  while (block_size < (max_block_size / 2)) block_size *= 2;
  // Launch at least a single warp - the kernel assumes that.
  block_size = std::max(block_size, static_cast<uint64_t>(32));
  return dim3(block_size);
}

template <typename T>
struct Add {
  __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
};

template <typename T>
struct Max {
  __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; }
};

////////////////////////////////////////////////////////////////////////////////
// Regular kernel (fast when dim_size is large; requires inner_size == 1)
////////////////////////////////////////////////////////////////////////////////

template <typename T, typename AccumT>
struct MaxFloat {
  __device__ __forceinline__ AccumT operator()(AccumT max, T v) const { return ::max(max, (AccumT)v); }
};

template <typename T, typename AccumT>
struct AddFloat {
  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { return sum + v; }
};

template <typename T, typename AccumT>
struct SumExpFloat {
  __device__ __forceinline__ SumExpFloat(AccumT v) : max_k(v) {}

  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { return sum + std::exp(v - max_k); }

  const AccumT max_k;
};

template <template <typename> class Reduction, typename AccumT>
__device__ __forceinline__ AccumT blockReduce(AccumT* smem, AccumT val, const Reduction<AccumT>& r, AccumT defaultVal) {
  // To avoid RaW races from chaining blockReduce calls together, we need a sync here
  __syncthreads();

  smem[threadIdx.x] = val;

  __syncthreads();

  AccumT warpVal = defaultVal;

  // First warp will perform per-warp reductions for the remaining warps
  uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1;
  if (threadIdx.x < 32) {
    int lane = threadIdx.x % 32;
    if (lane < blockDim.x / 32) {
#pragma unroll
      for (int i = 0; i < 32; ++i) {
        warpVal = r(warpVal, smem[lane * 32 + i]);
      }
      __syncwarp(mask);
      smem[lane] = warpVal;
    }
  }

  __syncthreads();

  // First thread will perform a reduction of the above per-warp reductions
  AccumT blockVal = defaultVal;

  if (threadIdx.x == 0) {
    for (int i = 0; i < blockDim.x / 32; ++i) {
      blockVal = r(blockVal, smem[i]);
    }
    smem[0] = blockVal;
  }

  // Sync and broadcast
  __syncthreads();
  return smem[0];
}

template <template <typename> class Reduction1, template <typename> class Reduction2, typename AccumT>
__device__ __forceinline__ void blockReduce(AccumT* smem, AccumT* reducVal1, AccumT val1, const Reduction1<AccumT>& r1,
                                            AccumT defaultVal1, AccumT* reducVal2, AccumT val2,
                                            const Reduction2<AccumT>& r2, AccumT defaultVal2) {
  // To avoid RaW races from chaining blockReduce calls together, we need a sync here
  __syncthreads();

  smem[threadIdx.x] = val1;
  smem[blockDim.x + threadIdx.x] = val2;

  __syncthreads();

  AccumT warpVal1 = defaultVal1;
  AccumT warpVal2 = defaultVal2;

  // First warp will perform per-warp reductions for the remaining warps
  uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1;
  if (threadIdx.x < 32) {
    int lane = threadIdx.x % 32;
    if (lane < blockDim.x / 32) {
#pragma unroll
      for (int i = 0; i < 32; ++i) {
        warpVal1 = r1(warpVal1, smem[lane * 32 + i]);
        warpVal2 = r2(warpVal2, smem[lane * 32 + i + blockDim.x]);
      }
      __syncwarp(mask);
      smem[lane] = warpVal1;
      smem[lane + blockDim.x] = warpVal2;
    }
  }

  __syncthreads();

  // First thread will perform a reduction of the above per-warp reductions
  AccumT blockVal1 = defaultVal1;
  AccumT blockVal2 = defaultVal2;

  if (threadIdx.x == 0) {
    for (int i = 0; i < blockDim.x / 32; ++i) {
      blockVal1 = r1(blockVal1, smem[i]);
      blockVal2 = r2(blockVal2, smem[i + blockDim.x]);
    }
    smem[0] = blockVal1;
    smem[blockDim.x] = blockVal2;
  }

  // Sync and broadcast
  __syncthreads();
  *reducVal1 = smem[0];
  *reducVal2 = smem[blockDim.x];
  __syncthreads();
}

template <template <typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT ilpReduce(int shift, T* data, int size, const Reduction<T, AccumT>& r,
                                            AccumT defaultVal) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LoadT;
  AccumT threadVal = defaultVal;
  int offset = threadIdx.x;

  // shift and do 1
  if (shift > 0) {
    data -= shift;
    size += shift;
    if (threadIdx.x >= shift) {
      threadVal = r(threadVal, data[offset]);
    }
    size -= blockDim.x;
    data += blockDim.x;
  }
  int last = size % (ILP * blockDim.x);

  T v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);

  for (; offset * ILP < (size - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(data)[offset];

    for (int j = 0; j < ILP; ++j) {
      threadVal = r(threadVal, v[j]);
    }
  }

  offset = size - last + threadIdx.x;
  // Epilogue
  for (; offset < size; offset += blockDim.x) threadVal = r(threadVal, data[offset]);

  return threadVal;
}

template <template <typename, typename> class Reduction1, template <typename, typename> class Reduction2, int ILP,
          typename T, typename AccumT>
__device__ __forceinline__ void ilpReduce(int shift, T* data, int size, AccumT* reducVal1,
                                          const Reduction1<T, AccumT>& r1, AccumT defaultVal1, AccumT* reducVal2,
                                          const Reduction2<T, AccumT>& r2, AccumT defaultVal2) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LoadT;

  AccumT threadVal1 = defaultVal1;
  AccumT threadVal2 = defaultVal2;
  int offset = threadIdx.x;

  // shift and do 1
  if (shift > 0) {
    data -= shift;
    size += shift;
    if (threadIdx.x >= shift) {
      threadVal1 = r1(threadVal1, data[offset]);
      threadVal2 = r2(threadVal2, data[offset]);
    }
    size -= blockDim.x;
    data += blockDim.x;
  }
  int last = size % (ILP * blockDim.x);

  T v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);

  for (; offset * ILP < (size - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(data)[offset];

    for (int j = 0; j < ILP; ++j) {
      threadVal1 = r1(threadVal1, v[j]);
      threadVal2 = r2(threadVal2, v[j]);
    }
  }

  offset = size - last + threadIdx.x;
  // Epilogue
  for (; offset < size; offset += blockDim.x) {
    threadVal1 = r1(threadVal1, data[offset]);
    threadVal2 = r2(threadVal2, data[offset]);
  }

  *reducVal1 = threadVal1;
  *reducVal2 = threadVal2;
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
          template <typename, typename, typename> class Epilogue>
__global__ void cunn_SoftMaxXEntropyForward(accscalar_t* losses, outscalar_t* max_log_sum_exp, scalar_t* input,
                                            int64_t* labels, int64_t classes, const float smoothing) {
  extern __shared__ unsigned char smem[];
  auto sdata = reinterpret_cast<accscalar_t*>(smem);
  // forward pointers to batch[blockIdx.x]
  // each block handles a sample in the mini-batch
  input += blockIdx.x * classes;
  // output += blockIdx.x * classes;
  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);

  int64_t label = labels[blockIdx.x];

  // find the max and sum
  accscalar_t threadMax, threadSum, max_k, sum_k;
  ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
      shift, input, classes, &threadMax, MaxFloat<scalar_t, accscalar_t>(), -at::numeric_limits<accscalar_t>::max(),
      &threadSum, AddFloat<scalar_t, accscalar_t>(), static_cast<accscalar_t>(0));

  blockReduce<Max, Add, accscalar_t>(sdata, &max_k, threadMax, Max<accscalar_t>(),
                                     -at::numeric_limits<accscalar_t>::max(), &sum_k, threadSum, Add<accscalar_t>(),
                                     static_cast<accscalar_t>(0));

  accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
      shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
  accscalar_t sumAll = blockReduce<Add, accscalar_t>(sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));

  Epilogue<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);

  // calculate per element loss with label smoothing
  // reserve max + log_sum_exp for bprop
  if (threadIdx.x == 0) {
    accscalar_t log_prob = epilogue(static_cast<accscalar_t>(input[label]));
    losses[blockIdx.x] = (max_k + std::log(sumAll) - sum_k / classes) * smoothing - log_prob * (1 - smoothing);
    max_log_sum_exp[blockIdx.x] = max_k + std::log(sumAll);
  }
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
__device__ __forceinline__ void apply(scalar_t* gradInput, scalar_t* logits, outscalar_t* max_log_sum_exp,
                                      outscalar_t* gradOutput, int64_t* labels, const float smoothing, int classes) {
  accscalar_t smooth_positives = 1.0 - smoothing;
  accscalar_t smooth_negatives = smoothing / classes;
  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
  int64_t label = labels[blockIdx.x];
  accscalar_t coeff = max_log_sum_exp[blockIdx.x];

  int offset = threadIdx.x;
  int last = classes % (ILP * blockDim.x);

  for (; offset < classes - last; offset += blockDim.x * ILP) {
    accscalar_t tmpLogits[ILP];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      tmpLogits[j] = static_cast<accscalar_t>(logits[offset + j * blockDim.x]);
    }

#pragma unroll
    for (int j = 0; j < ILP; ++j)
      gradInput[offset + j * blockDim.x] =
          tmpGradOutput *
          (std::exp(tmpLogits[j] - coeff) -
           static_cast<accscalar_t>((offset + j * blockDim.x == label) ? 1 : 0) * smooth_positives - smooth_negatives);
  }

  for (; offset < classes; offset += blockDim.x)
    gradInput[offset] =
        tmpGradOutput * (std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
                         static_cast<accscalar_t>((offset == label) ? 1 : 0) * smooth_positives - smooth_negatives);
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
__device__ __forceinline__ void aligned_apply(int shift, scalar_t* gradInput, scalar_t* logits,
                                              outscalar_t* max_log_sum_exp, outscalar_t* gradOutput, int64_t* labels,
                                              const float smoothing, int classes) {
  accscalar_t smooth_positives = 1.0 - smoothing;
  accscalar_t smooth_negatives = smoothing / classes;
  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
  int64_t label = labels[blockIdx.x];
  accscalar_t coeff = max_log_sum_exp[blockIdx.x];

  int offset = threadIdx.x;

  // shift and do 1
  if (shift > 0) {
    logits -= shift;
    gradInput -= shift;
    classes += shift;
    if (threadIdx.x >= shift) {
      gradInput[offset] =
          tmpGradOutput *
          (std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
           static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) * smooth_positives - smooth_negatives);
    }
    classes -= blockDim.x;
    gradInput += blockDim.x;
    logits += blockDim.x;
    shift -= blockDim.x;
  }

  int last = classes % (ILP * blockDim.x);

  typedef typename std::aligned_storage<ILP * sizeof(scalar_t), ILP * alignof(scalar_t)>::type LoadT;
  // input
  scalar_t v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);
  // output
  scalar_t r[ILP];
  LoadT* result = reinterpret_cast<LoadT*>(&r);

  for (; offset * ILP < (classes - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(logits)[offset];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      r[j] =
          tmpGradOutput * (std::exp(static_cast<accscalar_t>(v[j]) - coeff) -
                           static_cast<accscalar_t>(((ILP * offset + j - shift) == label) ? 1 : 0) * smooth_positives -
                           smooth_negatives);
    }
    reinterpret_cast<LoadT*>(gradInput)[offset] = *result;
  }

  offset = classes - last + threadIdx.x;
  for (; offset < classes; offset += blockDim.x)
    gradInput[offset] =
        tmpGradOutput *
        (std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
         static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) * smooth_positives - smooth_negatives);
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
          template <typename, typename, typename> class Epilogue>
__global__ void cunn_SoftMaxXEntropyBackward(scalar_t* gradInput, scalar_t* logits, outscalar_t* max_log_sum_exp,
                                             outscalar_t* gradOutput, int64_t* labels, const float smoothing,
                                             int classes) {
  gradInput += blockIdx.x * classes;
  logits += blockIdx.x * classes;

  // Do vectorized load/store when input/output have same alignment
  const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t);
  const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t);
  if (shift == shift_) {
    aligned_apply<ILP, scalar_t, accscalar_t, outscalar_t>(shift, gradInput, logits, max_log_sum_exp, gradOutput,
                                                           labels, smoothing, classes);
  } else {
    apply<ILP, scalar_t, accscalar_t, outscalar_t>(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing,
                                                   classes);
  }
}

template <template <typename, typename, typename> class Epilogue>
std::vector<Tensor> host_softmax_xentropy(const Tensor& input_, const Tensor& labels_, const float smoothing,
                                          const bool half_to_float) {
  if (half_to_float)
    TORCH_CHECK(input_.scalar_type() == ScalarType::Half, "conversion is supported for Half type only");
  TORCH_CHECK(labels_.scalar_type() == ScalarType::Long, "Label type should be CUDA Long");

  auto input = input_.contiguous();
  Tensor max_log_sum_exp =
      at::empty_like(labels_, half_to_float ? input.options().dtype(ScalarType::Float) : input.options());
  Tensor losses = at::empty_like(labels_, input_.options().dtype(ScalarType::Float));

  static_assert(
      std::is_same<acc_type<at::Half, true>, float>::value || std::is_same<acc_type<at::Half, true>, double>::value,
      "accscalar_t for half should be float or double");
  TORCH_CHECK(input.dim() == 2, "Currently only 2 dim input supported");
  TORCH_CHECK(labels_.dim() == 1, "Labels should be 1 dimensional");
  TORCH_CHECK(input.size(0) == labels_.size(0), "Input and label should have same number of examples");
  TORCH_CHECK(input.numel() > 0, "Number of classes in input should not be 0");

  const int64_t dim = 1;
  int64_t outer_size = 1;
  int64_t dim_size = input.size(dim);
  int64_t inner_size = 1;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  for (int64_t i = 0; i < dim; ++i) outer_size *= input.size(i);
  for (int64_t i = dim + 1; i < input.dim(); ++i) inner_size *= input.size(i);
  // This kernel spawns a block per each element in the batch.
  // XXX: it assumes that inner_size == 1
  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

  dim3 grid(outer_size);

  using namespace at;
  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      input.scalar_type(), 0, "host_softmax_xentropy", using accscalar_t = at::acc_type<scalar_t_0, true>;
      const int ILP = sizeof(float4) / sizeof(scalar_t_0); dim3 block = SoftMax_getBlockSize(ILP, dim_size);
      if (!half_to_float) {
        cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
            <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
                losses.data_ptr<accscalar_t>(), max_log_sum_exp.data_ptr<scalar_t_0>(), input.data_ptr<scalar_t_0>(),
                labels_.data_ptr<int64_t>(), dim_size, smoothing);
      } else {
        cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
            <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
                losses.data_ptr<accscalar_t>(), max_log_sum_exp.data_ptr<accscalar_t>(), input.data_ptr<scalar_t_0>(),
                labels_.data_ptr<int64_t>(), dim_size, smoothing);
      });

  C10_CUDA_CHECK(cudaGetLastError());

  std::vector<at::Tensor> ret = {losses, max_log_sum_exp};
  return ret;
}

template <template <typename, typename, typename> class Epilogue>
Tensor host_softmax_xentropy_backward(const at::Tensor& grad_loss, const at::Tensor& logits_,
                                      const at::Tensor& max_log_sum_exp, const at::Tensor& labels,
                                      const float smoothing, bool half_to_float) {
  const int64_t dim = 1;
  Tensor gI = at::empty_like(logits_);
  if (grad_loss.numel() == 0) {
    return gI;
  }

  auto grad = grad_loss.contiguous();
  auto logits = logits_.contiguous();

  static_assert(
      std::is_same<acc_type<at::Half, true>, float>::value || std::is_same<acc_type<at::Half, true>, double>::value,
      "accscalar_t for half should be float or double");
  if (grad.dim() == 0) grad = grad.view(1);

  TORCH_CHECK(logits_.dim() == 2, "Currently only 2 dim input supported");
  TORCH_CHECK(labels.dim() == 1, "Labels should be 1 dimensional");
  TORCH_CHECK(logits_.numel() > 0, "Number of classes in input should not be 0");
  TORCH_CHECK(logits_.size(0) == labels.size(0), "Input and label should have same number of examples");
  TORCH_CHECK(labels.size(0) == grad.size(0), "Label and loss should have same number of examples");

  int64_t outer_size = 1;
  int64_t dim_size = logits.size(dim);
  int64_t inner_size = 1;
  for (int64_t i = 0; i < dim; ++i) outer_size *= logits.size(i);
  for (int64_t i = dim + 1; i < logits.dim(); ++i) inner_size *= logits.size(i);
  // See descriptions of kernels above.
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

  dim3 grid(outer_size);

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      gI.scalar_type(), 0, "host_softmax_xentropy_backward", using accscalar_t = acc_type<scalar_t_0, true>;
      const int ILP = sizeof(float4) / sizeof(scalar_t_0); dim3 block = SoftMax_getBlockSize(ILP, dim_size);
      if (!half_to_float) {
        cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
            <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
                gI.data_ptr<scalar_t_0>(), logits.data_ptr<scalar_t_0>(), max_log_sum_exp.data_ptr<scalar_t_0>(),
                grad.data_ptr<scalar_t_0>(), labels.data_ptr<int64_t>(), smoothing, dim_size);
      } else {
        cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
            <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
                gI.data_ptr<scalar_t_0>(), logits.data_ptr<scalar_t_0>(), max_log_sum_exp.data_ptr<accscalar_t>(),
                grad.data_ptr<accscalar_t>(), labels.data_ptr<int64_t>(), smoothing, dim_size);
      });

  C10_CUDA_CHECK(cudaGetLastError());
  return gI;
}

std::vector<Tensor> softmax_xentropy_cuda(const Tensor& input, const Tensor& labels, const float smoothing,
                                          const bool half_to_float) {
  return host_softmax_xentropy<LogSoftMaxForwardEpilogue>(input, labels, smoothing, half_to_float);
}

at::Tensor softmax_xentropy_backward_cuda(const at::Tensor& grad_loss, const at::Tensor& logits,
                                          const at::Tensor& max_log_sum_exp, const at::Tensor& labels,
                                          const float smoothing) {
  bool half_to_float = grad_loss.scalar_type() != logits.scalar_type();
  if (half_to_float) {
    TORCH_CHECK((grad_loss.scalar_type() == ScalarType::Float && logits.scalar_type() == ScalarType::Half),
                "expected input and grad types to match, or input to be at::Half and grad to be at::Float");
  }
  return host_softmax_xentropy_backward<LogSoftMaxBackwardEpilogue>(grad_loss, logits, max_log_sum_exp, labels,
                                                                    smoothing, half_to_float);
}


================================================
FILE: apex/contrib/cudnn_gbn/__init__.py
================================================
from .batch_norm import GroupBatchNorm2d


================================================
FILE: apex/contrib/cudnn_gbn/batch_norm.py
================================================
import torch
from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn import functional as F
from torch import Tensor
import peer_memory_cuda as pm
import cudnn_gbn_lib
from torch.cuda.amp import custom_fwd, custom_bwd


class _GroupBatchNorm2d(torch.autograd.Function):
    @staticmethod
    @custom_fwd
    def forward(
        ctx,
        input,
        weight,
        bias,
        running_mean,
        running_variance,
        minibatch_mean,
        minibatch_inv_var,
        momentum,
        eps,
        group_size,
        group_rank,
        fwd_buffers,
        bwd_buffers,
    ):
        ctx.save_for_backward(input, weight, minibatch_mean, minibatch_inv_var)
        ctx.eps = eps
        ctx.bn_group = group_size
        ctx.rank_id = group_rank
        ctx.peer_buffers = bwd_buffers
        return cudnn_gbn_lib.forward(
            input,
            weight,
            bias,
            running_mean,
            running_variance,
            minibatch_mean,
            minibatch_inv_var,
            momentum,
            eps,
            group_size,
            group_rank,
            fwd_buffers,
        )

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output):
        x, scale, minibatch_mean, minibatch_inv_var = ctx.saved_variables
        eps = ctx.eps
        bn_group = ctx.bn_group
        rank_id = ctx.rank_id
        peer_buffers = ctx.peer_buffers
        dx, dscale, dbias = cudnn_gbn_lib.backward(
            x,
            grad_output,
            scale,
            minibatch_mean,
            minibatch_inv_var,
            eps,
            bn_group,
            rank_id,
            peer_buffers,
        )
        return (
            dx,
            dscale,
            dbias,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )


class GroupBatchNorm2d(_BatchNorm):
    """
    synchronized batch normalization module extented from ``torch.nn.BatchNormNd``
    with the added stats reduction across multiple processes.

    When running in training mode, the layer reduces stats across process groups
    to increase the effective batchsize for normalization layer. This is useful
    in applications where batch size is small on a given process that would
    diminish converged accuracy of the model.

    When running in evaluation mode, the layer falls back to
    ``torch.nn.functional.batch_norm``.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics and always uses batch
            statistics in both training and eval modes. Default: ``True``

    Example::

        >>> sbn = apex.contrib.GroupBatchNorm2d(100).cuda()
        >>> inp = torch.randn(10, 100, 14, 14).cuda()
        >>> out = sbn(inp)
        >>> inp = torch.randn(3, 100, 20).cuda()
        >>> out = sbn(inp)
    """

    def __init__(
        self,
        num_features,
        group_size,
        eps=1e-5,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
    ):
        super(GroupBatchNorm2d, self).__init__(
            num_features,
            eps=eps,
            momentum=momentum,
            affine=affine,
            track_running_stats=track_running_stats,
        )
        self.group_size = group_size
        rank = torch.distributed.get_rank()
        self.group_id = rank // group_size
        self.group_rank = rank % group_size
        self.fwd_peer_buffers = self.get_peer_buffers(num_features)
        self.bwd_peer_buffers = self.get_peer_buffers(num_features)
        self.minibatch_mean = torch.cuda.FloatTensor(num_features)
        self.minibatch_inv_var = torch.cuda.FloatTensor(num_features)

    def get_peer_buffers(self, num_features):
        # group_size * 2 (low-latency algo) * 2 (mean+var) * channels * 4 (float32)
        peer_size = self.group_size * 4 * num_features * 4
        raw = pm.allocate_raw(peer_size)
        # exchange peer pointers with nccl
        world_size = torch.distributed.get_world_size()
        raw_ipc = pm.get_raw_ipc_address(raw).cuda()
        raw_ipcs = [torch.empty_like(raw_ipc) for _ in range(world_size)]
        torch.distributed.all_gather(raw_ipcs, raw_ipc)
        group_ipcs = [
            raw_ipcs[x]
            for x in range(
                self.group_id * self.group_size,
                (self.group_id * self.group_size) + self.group_size,
            )
        ]
        peer_raw_ipcs = torch.stack(group_ipcs).cpu()
        return pm.get_raw_peers(peer_raw_ipcs, self.group_rank, raw)

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError("expected 4D input (got {}D input)".format(input.dim()))

    def _check_input_channels(self, input):
        if input.size(1) % 8 != 0:
            raise ValueError("GroupBatchNorm2d number of input channels should be a multiple of 8")

    def forward(self, input: Tensor) -> Tensor:
        # currently only GPU input is supported
        if not input.is_cuda:
            raise ValueError("GroupBatchNorm2d expected input tensor to be on GPU")
        if not input.is_contiguous(memory_format=torch.channels_last):
            raise ValueError(
                "GroupBatchNorm2d expected input tensor to be in channels last memory format"
            )
        if torch.is_autocast_enabled():
            input = input.to(torch.get_autocast_gpu_dtype())
        if input.dtype != torch.float16:
            raise ValueError("GroupBatchNorm2d expected input tensor in float16")
        self._check_input_dim(input)
        self._check_input_channels(input)

        if not self.training:
            # fall back to pytorch implementation for inference
            return F.batch_norm(
                input,
                self.running_mean,
                self.running_var,
                self.weight,
                self.bias,
                False,
                self.momentum,
                self.eps,
            )

        return _GroupBatchNorm2d.apply(
            input,
            self.weight,
            self.bias,
            self.running_mean,
            self.running_var,
            self.minibatch_mean,
            self.minibatch_inv_var,
            self.momentum,
            self.eps,
            self.group_size,
            self.group_rank,
            self.fwd_peer_buffers,
            self.bwd_peer_buffers,
        )


================================================
FILE: apex/contrib/examples/gpu_direct_storage/benchmark_load.py
================================================
import timeit
import torch
import apex.contrib.gpu_direct_storage as gds

def run_benchmark_torch_load():
    sizes = [2 ** i for i in range(16, 28)]
    for size in sizes:
        torch.cuda.empty_cache()
        s = torch.cuda.Stream()
        x = torch.empty(size, device = "cuda")
        y = torch.linspace(0, 1, size, device = "cuda")
        torch.save(y, f"{size}.data")

        # warmup
        torch.cuda.synchronize()
        for _ in range(10):
            x = torch.load(f"{size}.data")

        torch.cuda.synchronize()
        start_time = timeit.default_timer()
        for _ in range(10):
            x = torch.load(f"{size}.data")
        torch.cuda.synchronize()
        end_time = timeit.default_timer()
        print(f"torch.load: size = {size}, {end_time - start_time}")
        assert(torch.allclose(x, y))

def run_benchmark(func):
    sizes = [2 ** i for i in range(16, 28)]
    for size in sizes:
        torch.cuda.empty_cache()
        s = torch.cuda.Stream()
        x = torch.empty(size, device = "cuda")
        y = torch.linspace(0, 1, size, device = "cuda")

        with gds.GDSFile(f"{size}.data", "w") as f:
            f.save_data(y)

        # warmup
        torch.cuda.synchronize()
        for _ in range(10):
            func(x, f"{size}.data")

        torch.cuda.synchronize()
        start_time = timeit.default_timer()
        for _ in range(10):
            func(x, f"{size}.data")
        torch.cuda.synchronize()
        end_time = timeit.default_timer()
        print(f"{func.__name__}: size = {size}, {end_time - start_time}")
        assert(torch.allclose(x, y))

def load_data_yes_gds(tensor, filename):
    with gds.GDSFile(filename, "r") as f:
        f.load_data(tensor)

def load_data_no_gds(tensor, filename):
    with gds.GDSFile(filename, "rn") as f:
        f.load_data_no_gds(tensor)

if __name__ == '__main__':
    run_benchmark_torch_load()
    run_benchmark(load_data_yes_gds)
    run_benchmark(load_data_no_gds)


================================================
FILE: apex/contrib/examples/gpu_direct_storage/benchmark_save.py
================================================
import os
import timeit
import torch
import apex.contrib.gpu_direct_storage as gds

def run_benchmark(func):
    sizes = [2 ** i for i in range(16, 28)]
    for size in sizes:
        torch.cuda.empty_cache()
        s = torch.cuda.Stream()
        x = torch.linspace(0, 1, size, device = "cuda")

        # warmup
        torch.cuda.synchronize()
        for _ in range(10):
            func(x, f"{size}.data")
            os.remove(f"{size}.data")

        torch.cuda.synchronize()
        start_time = timeit.default_timer()
        for _ in range(10):
            func(x, f"{size}.data")
            os.remove(f"{size}.data")
        torch.cuda.synchronize()
        end_time = timeit.default_timer()
        print(f"{func.__name__}: size = {size}, {end_time - start_time}")

def save_data_yes_gds(tensor, filename):
    with gds.GDSFile(filename, "w") as f:
        f.save_data(tensor)

def save_data_no_gds(tensor, filename):
    with gds.GDSFile(filename, "wn") as f:
        f.save_data_no_gds(tensor)

if __name__ == '__main__':
    run_benchmark(torch.save)
    run_benchmark(save_data_yes_gds)
    run_benchmark(save_data_no_gds)


================================================
FILE: apex/contrib/examples/gpu_direct_storage/example_load.py
================================================
import torch
import apex.contrib.gpu_direct_storage as gds

for size in [128, 1024, 8192]:
    x = torch.empty(size, device = "cuda")
    with gds.GDSFile(f"{size}.data", "r") as f:
        f.load_data(x)
    xx = torch.linspace(0, 1, size, device = "cuda")
    assert(torch.allclose(x, xx))


================================================
FILE: apex/contrib/examples/gpu_direct_storage/example_save.py
================================================
import torch
import apex.contrib.gpu_direct_storage as gds

for size in [128, 1024, 8192]:
    x = torch.linspace(0, 1, size, device = "cuda")
    with gds.GDSFile(f"{size}.data", "w") as f:
        f.save_data(x)


================================================
FILE: apex/contrib/examples/multihead_attn/func_test_multihead_attn.py
================================================
import torch
import argparse

from apex.contrib.multihead_attn import SelfMultiheadAttn
from apex.contrib.multihead_attn import EncdecMultiheadAttn

parser = argparse.ArgumentParser(description='Multihead Attention Standalone Test')
parser.add_argument('--seq-length', default=64, type=int, help='Sequence Length of Input')
parser.add_argument('--num-seqs-start', default=5, type=int, help='Start Range of Number of Sequences')
parser.add_argument('--num-seqs-stop', default=80, type=int, help='Stop Range of Number of Sequences')
parser.add_argument('--num-seqs-inc', default=5, type=int, help='Range Increment of Number of Sequences')
parser.add_argument('--trials', default=20, type=int, help='Number of Trials to Execute')
parser.add_argument('--warmup-trials', default=5, type=int, help='Warmup Trials to discard')
parser.add_argument('--layers', default=18, type=int, help='Attention Layers to Execute to Gain CPU/GPU Time Overlap')
parser.add_argument('--seed-start', default=1, type=int, help='Attention Layers to Execute to Gain CPU/GPU Time Overlap')
parser.add_argument('--seed-end', default=100, type=int, help='Attention Layers to Execute to Gain CPU/GPU Time Overlap')
parser.add_argument('--hidden-dim', default=1024, type=int, help='Multihead Attention hidden dimension')
parser.add_argument('--heads', default=16, type=int, help='Number of Multihead Attention heads')
parser.add_argument('--encdec-attn', action='store_true', help='Use Encoder-Decoder Attention instead of Self Attention.')
parser.add_argument('--norm-add', action='store_true', help='Include Layer Norm and Dropout-Add in Multihead Attention block.')
parser.add_argument('--ref', action='store_true', help='Reference implementation in python pytorch.')
parser.add_argument('--native', action='store_true', help='torch.nn.MultitheadAttention Version.')
parser.add_argument('--fwd', action='store_true', help='Only execute Fwd Pass.')
parser.add_argument('--eval', action='store_true', help='Inference only, no backward pass.')

args = parser.parse_args()
assert args.seq_length % 64 == 0, "Sequence Length should be a multiple of 64!"

if not torch.cuda.is_available():
    raise NotImplementedError('Running on CPU is not supported')
torch.cuda.set_device(0)

dropout_prob = 0.1

for seed in range(args.seed_start, args.seed_end+1) :
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    ref_layer = None
    if args.encdec_attn :
        ref_layer = EncdecMultiheadAttn(args.hidden_dim, args.heads, dropout=dropout_prob, bias=False, include_norm_add=args.norm_add, impl='default')
    else :
        ref_layer = SelfMultiheadAttn(args.hidden_dim, args.heads, dropout=dropout_prob, bias=False, include_norm_add=args.norm_add, impl='default')
    ref_layer.cuda()
    ref_layer.half()
    ref_layer.reset_parameters()

    ref_inputs    = torch.randn(args.seq_length, args.num_seqs_start, args.hidden_dim, dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)
    ref_inputs_kv = None
    if args.encdec_attn :
        ref_inputs_kv    = torch.randn(args.seq_length, args.num_seqs_start, args.hidden_dim, dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)

    ref_grads         = torch.randn_like(ref_inputs)

    ref_outputs,_ = ref_layer.forward(ref_inputs,
                                      ref_inputs_kv,
                                      ref_inputs_kv,
                                      key_padding_mask=None,
                                      need_weights=False,
                                      attn_mask=None,
                                      is_training=(not args.eval))

    ref_outputs.backward(ref_grads)

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    tst_layer = None
    if args.encdec_attn :
        tst_layer = EncdecMultiheadAttn(args.hidden_dim, args.heads, dropout=dropout_prob, bias=False, include_norm_add=args.norm_add, impl='fast')
    else:
        tst_layer = SelfMultiheadAttn(args.hidden_dim, args.heads, dropout=dropout_prob, bias=False, include_norm_add=args.norm_add, impl='fast')
    tst_layer.cuda()
    tst_layer.half()
    tst_layer.reset_parameters()

    tst_inputs    = torch.randn(args.seq_length, args.num_seqs_start, args.hidden_dim, dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)
    tst_inputs_kv = None
    if args.encdec_attn :
        tst_inputs_kv    = torch.randn(args.seq_length, args.num_seqs_start, args.hidden_dim, dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)

    assert torch.equal(ref_inputs,tst_inputs), "ERROR: Inputs are different!"

    tst_grads         = torch.randn_like(tst_inputs)

    tst_outputs,_ = tst_layer.forward(tst_inputs,
                                      tst_inputs_kv,
                                      tst_inputs_kv,
                                      key_padding_mask=None,
                                      need_weights=False,
                                      attn_mask=None,
                                      is_training=(not args.eval))

    tst_outputs.backward(tst_grads)

    fwd_close = torch.equal(ref_outputs, tst_outputs)
    bwd_close = torch.equal(ref_inputs.grad, tst_inputs.grad)

    diff_fwd = ref_outputs - tst_outputs
    diff_cnt_fwd = diff_fwd.ne(0.0).sum()
    diff_accum_fwd = diff_fwd.abs().sum()

    diff_bwd = ref_inputs.grad - tst_inputs.grad
    diff_cnt_bwd = diff_bwd.ne(0.0).sum()
    diff_accum_bwd = diff_bwd.abs().sum()

    print(">>> Seed: ", seed, fwd_close, diff_cnt_fwd.item(), diff_accum_fwd.item(), bwd_close, diff_cnt_bwd.item(), diff_accum_bwd.item())


================================================
FILE: apex/contrib/examples/multihead_attn/perf_test_multihead_attn.py
================================================
import torch
import argparse

from apex.contrib.multihead_attn import SelfMultiheadAttn
from apex.contrib.multihead_attn import EncdecMultiheadAttn

parser = argparse.ArgumentParser(description='Multihead Attention Standalone Test')
parser.add_argument('--seq-length', default=64, type=int, help='Sequence Length of Input')
parser.add_argument('--num-seqs-start', default=10, type=int, help='Start Range of Number of Sequences')
parser.add_argument('--num-seqs-stop', default=120, type=int, help='Stop Range of Number of Sequences')
parser.add_argument('--num-seqs-inc', default=5, type=int, help='Range Increment of Number of Sequences')
parser.add_argument('--trials', default=20, type=int, help='Number of Trials to Execute')
parser.add_argument('--warmup-trials', default=5, type=int, help='Warmup Trials to discard')
parser.add_argument('--layers', default=18, type=int, help='Attention Layers to Execute to Gain CPU/GPU Time Overlap')
parser.add_argument('--hidden-dim', default=1024, type=int, help='Multihead Attention hidden dimension')
parser.add_argument('--heads', default=16, type=int, help='Number of Multihead Attention heads')
parser.add_argument('--encdec-attn', action='store_true', help='Use Encoder-Decoder Attention instead of Self Attention.')
parser.add_argument('--norm-add', action='store_true', help='Include Layer Norm and Dropout-Add in Multihead Attention block.')
parser.add_argument('--ref', action='store_true', help='Reference implementation in python pytorch.')
parser.add_argument('--native', action='store_true', help='torch.nn.MultitheadAttention Version.')
parser.add_argument('--fwd', action='store_true', help='Only execute Fwd Pass.')
parser.add_argument('--biases', action='store_true', help='Execute multihead attention with Linear Biases.')

args = parser.parse_args()

if not torch.cuda.is_available():
    raise NotImplementedError('Running on CPU is not supported')
torch.cuda.set_device(0)

torch.manual_seed(111)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(111)

attn_layers = []
for idx in range(0, args.layers) :
    if args.encdec_attn :
        if args.ref :
            attn_layers.append(EncdecMultiheadAttn(args.hidden_dim, args.heads, dropout=0.1, bias=args.biases, include_norm_add=False, impl='default'))
        else :
            attn_layers.append(EncdecMultiheadAttn(args.hidden_dim, args.heads, dropout=0.1, bias=args.biases, include_norm_add=args.norm_add, impl='fast'))
    else :
        if args.native :
            attn_layers.append(torch.nn.MultiheadAttention(args.hidden_dim, args.heads, dropout=0.1, bias=args.biases))
        elif args.ref :
            attn_layers.append(SelfMultiheadAttn(args.hidden_dim, args.heads, dropout=0.1, bias=args.biases, include_norm_add=args.norm_add, impl='default'))
        else :
            attn_layers.append(SelfMultiheadAttn(args.hidden_dim, args.heads, dropout=0.1, bias=args.biases, include_norm_add=args.norm_add, impl='fast'))
    attn_layers[idx].cuda()
    attn_layers[idx].half()
    if not args.native :
        attn_layers[idx].reset_parameters()

start_evt_fwd = []
start_evt_bwd = []
stop_evt_bwd  = []
for recorded_trial in range(0, args.trials) :
    start_evt_fwd.append(torch.cuda.Event(enable_timing=True))
    start_evt_bwd.append(torch.cuda.Event(enable_timing=True))
    stop_evt_bwd.append(torch.cuda.Event(enable_timing=True))

for sequences in range(args.num_seqs_start, args.num_seqs_stop + args.num_seqs_inc, args.num_seqs_inc) :
    inputs        = torch.randn(args.seq_length, sequences, args.hidden_dim, dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)
    grads         = torch.randn_like(inputs)
   
    for trial in range(0, args.trials + args.warmup_trials) :
        layer_inputs  = inputs
        evt_idx       = trial - args.warmup_trials
    
        if evt_idx >= 0 :
            start_evt_fwd[evt_idx].record()
    
        for lyr_idx in range(0, args.layers) :
            if args.native :
                outputs,_ = attn_layers[lyr_idx].forward(layer_inputs, 
                                                         layer_inputs, 
                                                         layer_inputs, 
                                                         key_padding_mask=None, 
                                                         need_weights=False, 
                                                         attn_mask=None)
            else :
                outputs,_ = attn_layers[lyr_idx].forward(layer_inputs, 
                                                         layer_inputs, 
                                                         layer_inputs,
                                                         key_padding_mask=None, 
                                                         need_weights=False, 
                                                         attn_mask=None,
                                                         is_training=True)
            layer_inputs = outputs
    
        if evt_idx >= 0 :
            start_evt_bwd[evt_idx].record()

        if not args.fwd :
            layer_inputs.backward(grads)
    
        if evt_idx >= 0 :
            stop_evt_bwd[evt_idx].record()
   
    torch.cuda.synchronize()
    elapsed_time_fwd = 0.0
    elapsed_time_bwd = 0.0
    for evt_idx in range(0, args.trials) :
        elapsed_time_fwd += start_evt_fwd[evt_idx].elapsed_time(start_evt_bwd[evt_idx])
        elapsed_time_bwd += start_evt_bwd[evt_idx].elapsed_time(stop_evt_bwd[evt_idx])
   
    print("[ {} Attn {} ]Total Tokens: {:4d} Sequences: {:3d} Sequence Length: {:3d} Fwd Time / Layer: {:.3f} ms Bwd Time / Layer: {:.3f} ms".format(
        'Encdec' if args.encdec_attn else 'Self',              \
        'Norm&Add' if args.norm_add else '',                   \
        sequences*args.seq_length,                             \
        sequences,                                             \
        args.seq_length,                                       \
        elapsed_time_fwd / ( args.trials * args.layers ),      \
        elapsed_time_bwd / ( args.trials * args.layers )))


================================================
FILE: apex/contrib/examples/nccl_allocator/allreduce.py
================================================
import os
import torch
import torch.distributed as dist
import apex.contrib.nccl_allocator as nccl_allocator

assert os.getenv("WORLD_SIZE") is not None, "Please use: torchrun --nproc-per-node=8 allreduce.py"

rank = int(os.getenv("RANK"))
local_rank = int(os.getenv("LOCAL_RANK"))
world_size = int(os.getenv("WORLD_SIZE"))

nccl_allocator.init()

torch.cuda.set_device(local_rank)
dist.init_process_group(backend="nccl")
pool = nccl_allocator.create_nccl_mem_pool()
with nccl_allocator.nccl_mem(pool):
    a = torch.ones(1024 * 1024 * 2, device="cuda")
dist.all_reduce(a)

torch.cuda.synchronize()


================================================
FILE: apex/contrib/examples/nccl_allocator/cache.py
================================================
import torch
import apex.contrib.nccl_allocator as nccl_allocator
from pynvml.smi import nvidia_smi

def set_device(dev):
    import ctypes
    handle = ctypes.CDLL("libcudart.so")
    result = handle.cudaSetDevice(ctypes.c_int(dev))
    assert result == 0

def print_used_mem(string, nvsmi, device_id = 0):
    print(f"{string}:", nvsmi.DeviceQuery('memory.used')['gpu'][device_id])

nccl_allocator.init()
nrep = 6
nccl_mem = []

set_device(0)
nvsmi = nvidia_smi.getInstance()

print_used_mem("", nvsmi)

pool = nccl_allocator.create_nccl_mem_pool()
with nccl_allocator.nccl_mem(pool):
    for i in range(nrep):
      out = torch.randn(1024 * 1024 * 100).cuda() # >= 400 MB
      nccl_mem.append(out)

print_used_mem("after nccl alloc (+>=2400)", nvsmi) # + 2400+ MB

cudart_mem = []
for i in range(nrep):
  out = torch.randn(1024 * 1024 * 50 ).cuda() # == 200 MB
  cudart_mem.append(out)

print_used_mem("after cudart alloc (+1200)", nvsmi)

del cudart_mem
torch.cuda.empty_cache()
torch.cuda.empty_cache()
print_used_mem("release cudart mem (-1200)", nvsmi) # - 1200 MB

del nccl_mem
nccl_mem2 = []
with nccl_allocator.nccl_mem(pool):
    for i in range(nrep):
      out = torch.randn(1024 * 1024 * 100).cuda() # >= 400 MB
      nccl_mem2.append(out)
print_used_mem("reuse nccl cache (same)", nvsmi) # + 0 MB
del nccl_mem2
torch.cuda.empty_cache()
print_used_mem("release nccl_mem (-2400)", nvsmi) # - 2400 MB

torch.cuda.empty_cache()


================================================
FILE: apex/contrib/examples/nccl_allocator/change_cuda_allocator.py
================================================
import torch
import apex.contrib.nccl_allocator as nccl_allocator

nccl_allocator.init()
nrep = 6
pool = nccl_allocator.create_nccl_mem_pool()
with nccl_allocator.nccl_mem(pool):
    for i in range(nrep):
      out = torch.randn(1024).cuda()

for i in range(nrep):
  out = torch.randn(1024).cuda()

torch.cuda.empty_cache()
torch.cuda.empty_cache()


================================================
FILE: apex/contrib/examples/nccl_allocator/toy_ddp.py
================================================
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

import apex.contrib.nccl_allocator as nccl_allocator

assert os.getenv("WORLD_SIZE") is not None, "Please use: torchrun --nproc-per-node=8 toy_ddp.py"

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


rank = int(os.getenv("RANK"))
local_rank = int(os.getenv("LOCAL_RANK"))
world_size = int(os.getenv("WORLD_SIZE"))

nccl_allocator.init()

torch.cuda.set_device(local_rank)
dist.init_process_group(backend="nccl")

device = torch.device("cuda", local_rank)
model = ToyModel().to(device)
ddp_model = DDP(model, device_ids=[rank])
loss_fn = nn.MSELoss()
optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

data_ptrs = []
pool = nccl_allocator.create_nccl_mem_pool()
with nccl_allocator.nccl_mem(pool):
    for param in ddp_model.parameters():
        param.grad = torch.empty_like(param)
        data_ptrs.append(param.grad.data_ptr())

for _ in range(10):
    optimizer.zero_grad(set_to_none=False)
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

for data_ptr, param in zip(data_ptrs, ddp_model.parameters()):
    assert(data_ptr == param.grad.data_ptr())
dist.destroy_process_group()


================================================
FILE: apex/contrib/fmha/__init__.py
================================================
from .fmha import FMHAFun


================================================
FILE: apex/contrib/fmha/fmha.py
================================================
###############################################################################
# Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
###############################################################################


import torch
import fmhalib as mha


class FMHAFun(torch.autograd.Function):
    @staticmethod
    def forward(ctx, qkv, cu_seqlens, p_dropout, max_s, is_training, zero_tensors):
        from apex import deprecated_warning

        deprecated_warning(
            "`apex.contrib.fmha` is deprecated and will be removed in July 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention"
            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )

        batch_size = cu_seqlens.numel() - 1
        if batch_size < 4:
            max_s = 512
            context, S_dmask = mha.fwd_nl(
                qkv, cu_seqlens, p_dropout, max_s, is_training, True, zero_tensors, None
            )
        else:
            context, S_dmask = mha.fwd(
                qkv,
                cu_seqlens,
                p_dropout,
                max_s,
                is_training,
                False,
                zero_tensors,
                None,
            )
        ctx.save_for_backward(qkv, S_dmask)
        ctx.cu_seqlens = cu_seqlens
        ctx.p_dropout = p_dropout
        ctx.max_s = max_s
        ctx.zero_tensors = zero_tensors
        return context

    @staticmethod
    def backward(ctx, dout):
        qkv, S_dmask = ctx.saved_tensors
        batch_size = ctx.cu_seqlens.numel() - 1
        if batch_size < 4:
            dqkv, dp, _ = mha.bwd_nl(
                dout,
                qkv,
                S_dmask,
                ctx.cu_seqlens,
                ctx.p_dropout,
                ctx.max_s,
                ctx.zero_tensors,
            )
        else:
            dqkv, dp = mha.bwd(
                dout,
                qkv,
                S_dmask,
                ctx.cu_seqlens,
                ctx.p_dropout,
                ctx.max_s,
                ctx.zero_tensors,
            )

        return dqkv, None, None, None, None, None


class FMHA(torch.nn.Module):
    def __init__(self, config):
        super(FMHA, self).__init__()

        self.p_dropout = config.attention_probs_dropout_prob
        self.h = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.d = self.hidden_size // self.h
        assert self.d * self.h == self.hidden_size, "Invalid hidden size/num_heads"

    def forward(self, qkv, cu_seqlens, max_s, is_training=True, zero_tensors=False):
        ctx = FMHAFun.apply(
            qkv.view(-1, 3, self.h, self.d),
            cu_seqlens,
            self.p_dropout,
            max_s,
            is_training,
            zero_tensors,
        )

        return ctx.view(-1, self.hidden_size)


================================================
FILE: apex/contrib/focal_loss/__init__.py
================================================
try:
    import torch
    import focal_loss_cuda
    from .focal_loss import focal_loss

    del torch
    del focal_loss_cuda
    del focal_loss
except ImportError:
    print("apex was installed without --focal_loss flag, apex.contrib.focal_loss is not available")


================================================
FILE: apex/contrib/focal_loss/focal_loss.py
================================================
import torch

import focal_loss_cuda


class FocalLoss(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        cls_output,
        cls_targets_at_level,
        num_positives_sum,
        num_real_classes,
        alpha,
        gamma,
        label_smoothing=0.0,
    ):
        loss, partial_grad = focal_loss_cuda.forward(
            cls_output,
            cls_targets_at_level,
            num_positives_sum,
            num_real_classes,
            alpha,
            gamma,
            label_smoothing,
        )

        ctx.save_for_backward(partial_grad, num_positives_sum)
        return loss

    @staticmethod
    def backward(ctx, grad_loss):
        partial_grad, num_positives_sum = ctx.saved_tensors

        # The backward kernel is actually in-place to save memory space,
        # partial_grad and grad_input are the same tensor.
        grad_input = focal_loss_cuda.backward(grad_loss, partial_grad, num_positives_sum)

        return grad_input, None, None, None, None, None, None


def focal_loss(
    cls_output: torch.Tensor,
    cls_targets_at_level: torch.Tensor,
    num_positive_sum: torch.Tensor,
    num_real_classes: int,
    alpha: float,
    gamma: float,
    label_smoothing: float = 0.0,
) -> torch.Tensor:
    """Fused focal loss function."""
    return FocalLoss.apply(
        cls_output,
        cls_targets_at_level,
        num_positive_sum,
        num_real_classes,
        alpha,
        gamma,
        label_smoothing,
    )


================================================
FILE: apex/contrib/gpu_direct_storage/README.md
================================================
# APEX GPUDirect Storage

This module aims to add a PyTorch extension for [GPUDirect Storage](https://developer.nvidia.com/blog/gpudirect-storage/) (GDS) support through utilizing the [cuFile](https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html) library.

# Build command
```
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--gpu_direct_storage" ./
```

Alternatively:
```
python setup.py install --gpu_direct_storage
```

Check installation:
```
python -c "import torch; import apex.contrib.gpu_direct_storage"
```


================================================
FILE: apex/contrib/gpu_direct_storage/__init__.py
================================================
from _apex_gpu_direct_storage import _GDSFile
from contextlib import contextmanager


@contextmanager
def GDSFile(filename, mode):
    assert type(filename) == str
    assert type(mode) == str
    try:
        from apex import deprecated_warning

        deprecated_warning(
            "`gpu_direct_storage.GDSFile` is deprecated and will be removed in September 2025. "
            "We encourage you to use `torch.cuda.gds` module of PyTorch as a replacement. "
            "Its documentation is available at https://docs.pytorch.org/docs/stable/cuda.html#gpudirect-storage-prototype"
        )
        file_handle = _GDSFile(filename, mode)
        yield file_handle
    finally:
        file_handle.close()
        del file_handle


================================================
FILE: apex/contrib/group_norm/__init__.py
================================================
from .group_norm import *


================================================
FILE: apex/contrib/group_norm/group_norm.py
================================================
#!/usr/bin/env python
# coding: utf-8

#
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#

import functools
import os
import torch
import torch.nn.init as init
import group_norm_cuda
import group_norm_v2_cuda

from torch import Tensor
from torch.nn.parameter import Parameter

__all__ = ["GroupNorm"]


def one_time_warning(msg: str):
    if not hasattr(one_time_warning, "has_been_called"):
        one_time_warning.has_been_called = True
        print(f"\033[93m{msg}\033[0m")  # hightlight with yellow color


@functools.cache
def get_cc_and_sm_count(device_index: int):
    props = torch.cuda.get_device_properties(device_index)
    CC = (props.major, props.minor)
    SM_COUNT = props.multi_processor_count
    return CC, SM_COUNT


# pytorch group norm requires same input type
def torch_group_norm(x, g, w, b, eps, act=""):
    xdtype, wdtype = x.dtype, w.dtype
    if xdtype != wdtype:
        x = x.to(dtype=wdtype)
    y = torch.nn.functional.group_norm(x, g, w, b, eps)
    if act in ["silu", "swish"]:
        y = torch.nn.functional.silu(y)
    if xdtype != wdtype and y.dtype != xdtype:
        y = y.to(dtype=xdtype)
    return y


@torch.library.custom_op("apex::group_norm_nhwc_fprop", mutates_args=())
def group_norm_nhwc_fprop(
    x: torch.Tensor,
    G: int,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float,
    act: str | None = None,
    passes: int = 1,
    use_group_norm_v2: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
    # sanity check
    act = act.lower() if act else act
    assert x.is_contiguous(memory_format=torch.channels_last), "Only support NHWC layout."
    assert weight.numel() == x.shape[1], "Unexpected parameter count."
    assert bias.numel() == x.shape[1], "Unexpected parameter count."
    assert x.shape[1] % G == 0, "C % G != 0."
    assert act in [None, "", "silu", "swish"], "Unsupported activation."
    assert passes in [1, 2], "Invalid number of passes for algorithm."

    with_swish = act in ("silu", "swish")
    sm_margin = int(os.environ.get("APEX_GROUP_NORM_FPROP_SM_MARGIN", "0"))

    # enqueue fprop kernel
    if use_group_norm_v2:
        sums = torch.empty(x.shape[0] * G * 2, device=x.device)
        y = group_norm_v2_cuda.gn(
            x, weight, bias, eps, with_swish, G, mean_var_out=sums, sm_margin=sm_margin
        )
    else:
        if sm_margin:
            raise NotImplementedError("sm_margin is not supported for GroupNorm v1")
        y, sums = group_norm_cuda.forward(x, G, weight, bias, eps, passes, with_swish)
    return y, sums


@group_norm_nhwc_fprop.register_fake
def fake_group_norm_nhwc_fprop(
    x, G, weight, bias, eps, act=None, passes=1, use_group_norm_v2=False
):
    # sanity check
    act = act.lower() if act else act
    assert x.is_contiguous(memory_format=torch.channels_last), "Only support NHWC layout."
    assert weight.numel() == x.shape[1], "Unexpected parameter count."
    assert bias.numel() == x.shape[1], "Unexpected parameter count."
    assert x.shape[1] % G == 0, "C % G != 0."
    assert act in [None, "", "silu", "swish"], "Unsupported activation."
    assert passes in [1, 2], "Invalid number of passes for algorithm."

    y = torch.empty_like(x)
    sums = torch.empty(2 * x.shape[0] * G, device="cuda", dtype=torch.float32)
    return y, sums


@torch.library.custom_op("apex::group_norm_nhwc_bprop", mutates_args=())
def group_norm_nhwc_bprop(
    grad_output: torch.Tensor,
    sums: torch.Tensor,
    x: torch.Tensor,
    G: int,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float,
    act: str | None = None,
    passes: int = 1,
    use_group_norm_v2: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    # sanity check
    if not grad_output.is_contiguous(memory_format=torch.channels_last):
        one_time_warning(
            "Warning: GroupNorm NHWC expects NHWC grad_output but it's not, "
            "thus a memory format change is introduced. "
            "This may come from the TorchInductor rule that tangents must be "
            "contiguous. Try to avoid graph break around NHWC tensors "
            "can fix this issue. (Future warning will be suppressed.)"
        )
        grad_output = grad_output.contiguous(memory_format=torch.channels_last)

    act = act.lower() if act else act
    with_swish = act in ["silu", "swish"]
    sm_margin = int(os.environ.get("APEX_GROUP_NORM_BPROP_SM_MARGIN", "0"))

    if use_group_norm_v2:
        dx, dw, db = group_norm_v2_cuda.gn_bwd(
            grad_output, x, weight, bias, sums, eps, with_swish, G, sm_margin=sm_margin
        )
    else:
        if sm_margin:
            raise NotImplementedError("sm_margin is not supported for GroupNorm v1")
        dx, dw, db = group_norm_cuda.backward(
            grad_output, sums, x, G, weight, bias, eps, passes, with_swish
        )
    return dx, dw, db


@group_norm_nhwc_bprop.register_fake
def fake_group_norm_nhwc_bprop(
    grad_output,
    sums,
    x,
    G,
    weight,
    bias,
    eps,
    act=None,
    passes=1,
    use_group_norm_v2=False,
):
    dx = torch.empty_like(x)
    dw = torch.empty_like(weight)
    db = torch.empty_like(bias)
    return dx, dw, db


def backward(ctx, grad_output, grad_sums):
    # retrive saved info
    x, w, b, sums = ctx.saved_tensors
    G = ctx.G
    eps = ctx.eps
    passes = ctx.passes
    act = ctx.act
    use_group_norm_v2 = ctx.use_group_norm_v2

    dx, dw, db = group_norm_nhwc_bprop(
        grad_output, sums, x, G, w, b, eps, act, passes, use_group_norm_v2
    )
    return dx, None, dw, db, None, None, None, None


def setup_context(ctx, inputs, output):
    x, G, weight, bias, eps, act, passes, use_group_norm_v2 = inputs
    y, sums = output
    # save for backward
    ctx.save_for_backward(x, weight, bias, sums)
    ctx.G = G
    ctx.eps = eps
    ctx.passes = passes
    ctx.act = act
    ctx.use_group_norm_v2 = use_group_norm_v2


group_norm_nhwc_fprop.register_autograd(backward, setup_context=setup_context)


def cuda_group_norm_nhwc_one_pass(x, G, weight, bias, eps, act=None):
    y, _ = group_norm_nhwc_fprop(x, G, weight, bias, eps, act, passes=1)
    return y


def cuda_group_norm_nhwc_two_pass(x, G, weight, bias, eps, act=None):
    y, _ = group_norm_nhwc_fprop(x, G, weight, bias, eps, act, passes=2)
    return y


def cuda_group_norm_v2_nhwc(x, G, weight, bias, eps, act=None):
    y, _ = group_norm_nhwc_fprop(x, G, weight, bias, eps, act, use_group_norm_v2=True)
    return y


# We do not direct inherit from torch.nn.GroupNorm since several fusers don't
# support inheritance. Extends:
# https://github.com/pytorch/pytorch/blob/main/torch/nn/modules/normalization.py
class GroupNorm(torch.nn.Module):
    """Optimized GroupNorm for NHWC layout with optional Swish/SiLU fusion.

    There are two version of CUDA kernels under the hood: one pass and two
    passes. This operator contains a simple heuristic to choose algorithm.

    Limitations:

    * Designed for 32 groups, also tested with 16 groups, some other number
      of groups can also work but not guaranteed;
    * Supported number of channels C are:

        128, 256, 320, 384, 448, 512, 640, 768, 896, 960, 1024, 1280, 1344,
        1536, 1792, 1920, 2048, 2240, 2560, 2688, 3072, 3136, 3584, 4096.

      One pass algorithm supports only channels mentioned above. Two pass
      algorithm might automatically support some other channels as well.
    * N/H/W do not have lower (except >0) and upper bound limitations;

    All the unsupported cases will be forwarded to PyTorch implementation.
    """

    __constants__ = [
        "num_groups",
        "num_channels",
        "eps",
        "affine",
        "act",
        "SUPPORTED_CHANNELS",
        "SUPPORTED_GROUPS",
    ]
    num_groups: int
    num_channels: int
    eps: float
    affine: bool
    act: str | None
    SUPPORTED_CHANNELS = frozenset(
        [
            128,
            256,
            320,
            384,
            448,
            512,
            640,
            768,
            896,
            960,
            1024,
            1280,
            1344,
            1536,
            1792,
            1920,
            2048,
            2240,
            2560,
            2688,
            3072,
            3136,
            3584,
            4096,
        ]
    )
    SUPPORTED_GROUPS = frozenset([16, 32])
    SUPPORTED_DTYPES = frozenset(
        [
            # (input dtype, parameter dtype)
            (torch.float32, torch.float32),
            (torch.float32, torch.float16),
            (torch.float32, torch.bfloat16),
            (torch.float16, torch.float16),
            (torch.float16, torch.bfloat16),
            (torch.float16, torch.float32),
            (torch.bfloat16, torch.bfloat16),
            (torch.bfloat16, torch.float16),
            (torch.bfloat16, torch.float32),
        ]
    )
    GN_V2_SUPPORTED_CHANNELS = frozenset(
        [
            # (HW, C)
            (8 * 8, 1280),
            (8 * 8, 2560),
            (16 * 16, 640),
            (16 * 16, 1280),
            (16 * 16, 1920),
            (16 * 16, 2560),
            (32 * 32, 320),
            (32 * 32, 640),
            (32 * 32, 960),
            (32 * 32, 1280),
            (32 * 32, 1920),
            (64 * 64, 320),
            (64 * 64, 640),
            (64 * 64, 960),
        ]
    )
    GN_V2_SUPPORTED_DTYPES = frozenset(
        [
            # (input dtype, parameter dtype)
            (torch.float16, torch.float16),
            (torch.bfloat16, torch.bfloat16),
        ]
    )
    GN_V2_SUPPORTED_GROUPS_SWISH = frozenset(
        [
            # (num_groups, with_swish)
            (16, True),
            (32, False),
        ]
    )
    GN_V2_SUPPORTED_LOWER_BOUND_SM_COUNT = {
        (10, 0): 148,
    }

    def __init__(
        self,
        num_groups: int,
        num_channels: int,
        eps: float = 1e-5,
        affine: bool = True,
        device=None,
        dtype=None,
        act=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        if num_channels % num_groups != 0:
            raise ValueError("num_channels must be divisible by num_groups")

        self.num_groups = num_groups
        self.num_channels = num_channels
        self.eps = eps
        self.affine = affine
        self.act = act.lower() if act else act
        if self.affine:
            self.weight = Parameter(torch.empty(num_channels, **factory_kwargs))
            self.bias = Parameter(torch.empty(num_channels, **factory_kwargs))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

        self.reset_parameters()
        sm = torch.cuda.get_device_capability(device)
        self.sm = sm[0] * 10 + sm[1]

    def reset_parameters(self) -> None:
        if self.affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def _check_legality(self, input: Tensor) -> bool:
        is_nhwc = input.is_contiguous(memory_format=torch.channels_last)
        is_legal_groups = self.num_groups in self.SUPPORTED_GROUPS
        is_legal_channels = self.num_channels in self.SUPPORTED_CHANNELS
        is_input_half_or_float_or_bf16 = input.dtype in [
            torch.float16,
            torch.bfloat16,
            torch.float32,
        ]
        is_supported_dtype_combination = (
            not self.affine or (input.dtype, self.weight.dtype) in self.SUPPORTED_DTYPES
        )
        is_legal_act = self.act in [None, "", "silu", "swish"]

        if (
            is_nhwc
            and is_input_half_or_float_or_bf16
            and is_supported_dtype_combination
            and is_legal_act
            and self.affine
            and is_legal_groups
            and is_legal_channels
        ):
            return True
        else:
            return False

    def _check_v2_legality(self, input: Tensor) -> bool:
        is_legal_channels = (
            input.shape[2] * input.shape[3],
            self.num_channels,
        ) in self.GN_V2_SUPPORTED_CHANNELS
        is_supported_groups_swish_combination = (
            self.num_groups,
            self.act in ["silu", "swish"],
        ) in self.GN_V2_SUPPORTED_GROUPS_SWISH
        is_supported_dtype_combination = (
            self.affine and (input.dtype, self.weight.dtype) in self.GN_V2_SUPPORTED_DTYPES
        )
        cc, sm_count = get_cc_and_sm_count(input.device.index)
        is_supported_sm_count = (
            cc in self.GN_V2_SUPPORTED_LOWER_BOUND_SM_COUNT
            and sm_count >= self.GN_V2_SUPPORTED_LOWER_BOUND_SM_COUNT[cc]
        )

        if (
            is_legal_channels
            and is_supported_groups_swish_combination
            and is_supported_dtype_combination
            and is_supported_sm_count
        ):
            return True
        else:
            return False

    def forward(self, input: Tensor) -> Tensor:
        can_use_nhwc_group_norm = self._check_legality(input)

        if can_use_nhwc_group_norm:
            channels = input.shape[1]
            hw = 1
            for i in range(2, len(input.shape)):
                hw *= input.shape[i]
            max_hw_one_pass = 1024 if self.sm >= 80 else 256
            if (hw >= 512 and channels in (3136, 3584, 4096)) or hw > max_hw_one_pass:
                passes = 2
            else:
                passes = 1
            use_group_norm_v2 = self._check_v2_legality(input)
            y, _ = group_norm_nhwc_fprop(
                input,
                self.num_groups,
                self.weight,
                self.bias,
                self.eps,
                self.act,
                passes,
                use_group_norm_v2,
            )
            return y
        else:
            return torch_group_norm(
                input, self.num_groups, self.weight, self.bias, self.eps, self.act
            )

    def extra_repr(self) -> str:
        if self.act:
            return "{num_groups}, {num_channels}, eps={eps}, affine={affine}, act={act}".format(
                **self.__dict__
            )
        else:
            return "{num_groups}, {num_channels}, eps={eps}, affine={affine}".format(
                **self.__dict__
            )


================================================
FILE: apex/contrib/groupbn/__init__.py
================================================
try:
    import torch
    import bnp
    from .batch_norm import BatchNorm2d_NHWC

    del torch
    del bnp
    del batch_norm
except ImportError:
    print("apex was installed without --bnp flag, contrib.groupbn is not available")


================================================
FILE: apex/contrib/groupbn/batch_norm.py
================================================
import torch
import numpy as np
from torch.nn.modules.batchnorm import _BatchNorm

import bnp


class bn_NHWC_impl(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        x,
        s,
        b,
        rm,
        riv,
        mini_m,
        mini_riv,
        ret_cta,
        mom,
        epsilon,
        fuse_relu,
        is_train,
        bn_group,
        my_data,
        pair_data,
        magic,
        pair_data2,
        pair_data3,
        fwd_occup,
        fwd_grid_x,
        bwd_occup,
        bwd_grid_x,
        multi_stream,
    ):
        if is_train:
            ctx.save_for_backward(x, s, b, rm, riv, mini_m, mini_riv)
            ctx.epsilon = epsilon
            ctx.momentum = mom
            ctx.ret_cta = ret_cta
            ctx.fuse_relu = fuse_relu
            ctx.my_data = my_data
            ctx.pair_data = pair_data
            ctx.magic = magic
            ctx.pair_data2 = pair_data2
            ctx.pair_data3 = pair_data3
            ctx.bn_group = bn_group
            ctx.bwd_occup = bwd_occup
            ctx.bwd_grid_x = bwd_grid_x
            ctx.multi_stream = multi_stream

            res = bnp.bn_fwd_nhwc(
                x,
                s,
                b,
                rm,
                riv,
                mini_m,
                mini_riv,
                ret_cta,
                mom,
                epsilon,
                fuse_relu,
                my_data,
                pair_data,
                pair_data2,
                pair_data3,
                bn_group,
                magic,
                fwd_occup,
                fwd_grid_x,
                multi_stream,
            )
            return res
        else:
            return bnp.bn_fwd_eval_nhwc(
                x, s, b, rm, riv, ret_cta, bn_group, mom, epsilon, fuse_relu
            )

    @staticmethod
    def backward(ctx, grad_y):
        x, s, b, rm, riv, mini_m, mini_riv = ctx.saved_variables
        epsilon = ctx.epsilon
        mom = ctx.momentum
        ret_cta = ctx.ret_cta
        fuse_relu = ctx.fuse_relu
        my_data = ctx.my_data
        pair_data = ctx.pair_data
        magic = ctx.magic
        pair_data2 = ctx.pair_data2
        pair_data3 = ctx.pair_data3
        bn_group = ctx.bn_group
        bwd_occup = ctx.bwd_occup
        bwd_grid_x = ctx.bwd_grid_x
        multi_stream = ctx.multi_stream

        dx, dscale, dbias = bnp.bn_bwd_nhwc(
            x,
            grad_y,
            s,
            b,
            rm,
            riv,
            mini_m,
            mini_riv,
            ret_cta,
            mom,
            epsilon,
            fuse_relu,
            my_data,
            pair_data,
            pair_data2,
            pair_data3,
            bn_group,
            magic,
            bwd_occup,
            bwd_grid_x,
            multi_stream,
        )

        return (
            dx,
            dscale,
            dbias,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )


class bn_addrelu_NHWC_impl(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        x,
        z,
        s,
        b,
        rm,
        riv,
        mini_m,
        mini_riv,
        grid_dim_y,
        ret_cta,
        mom,
        epsilon,
        is_train,
        bn_group,
        my_data,
        pair_data,
        magic,
        pair_data2,
        pair_data3,
        fwd_occup,
        fwd_grid_x,
        bwd_occup,
        bwd_grid_x,
        multi_stream,
    ):
        if is_train:
            bitmask = torch.cuda.IntTensor(((x.numel() + 31) // 32) * 2 * grid_dim_y)
            ctx.save_for_backward(x, s, b, rm, riv, mini_m, mini_riv, bitmask)
            ctx.epsilon = epsilon
            ctx.momentum = mom
            ctx.ret_cta = ret_cta
            ctx.my_data = my_data
            ctx.pair_data = pair_data
            ctx.magic = magic
            ctx.pair_data2 = pair_data2
            ctx.pair_data3 = pair_data3
            ctx.bn_group = bn_group
            ctx.bwd_occup = bwd_occup
            ctx.bwd_grid_x = bwd_grid_x
            ctx.multi_stream = multi_stream

            res = bnp.bn_addrelu_fwd_nhwc(
                x,
                z,
                s,
                b,
                rm,
                riv,
                mini_m,
                mini_riv,
                bitmask,
                ret_cta,
                mom,
                epsilon,
                my_data,
                pair_data,
                pair_data2,
                pair_data3,
                bn_group,
                magic,
                fwd_occup,
                fwd_grid_x,
                multi_stream,
            )
            return res
        else:
            return bnp.bn_addrelu_fwd_eval_nhwc(
                x, z, s, b, rm, riv, ret_cta, bn_group, mom, epsilon
            )

    @staticmethod
    def backward(ctx, grad_y):
        x, s, b, rm, riv, mini_m, mini_riv, bitmask = ctx.saved_variables
        epsilon = ctx.epsilon
        mom = ctx.momentum
        ret_cta = ctx.ret_cta
        my_data = ctx.my_data
        pair_data = ctx.pair_data
        magic = ctx.magic
        pair_data2 = ctx.pair_data2
        pair_data3 = ctx.pair_data3
        bn_group = ctx.bn_group
        bwd_occup = ctx.bwd_occup
        bwd_grid_x = ctx.bwd_grid_x
        multi_stream = ctx.multi_stream

        dx, dz, dscale, dbias = bnp.bn_addrelu_bwd_nhwc(
            x,
            grad_y,
            s,
            b,
            rm,
            riv,
            mini_m,
            mini_riv,
            bitmask,
            ret_cta,
            mom,
            epsilon,
            my_data,
            pair_data,
            pair_data2,
            pair_data3,
            bn_group,
            magic,
            bwd_occup,
            bwd_grid_x,
            multi_stream,
        )

        return (
            dx,
            dz,
            dscale,
            dbias,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )


class BatchNorm2d_NHWC(_BatchNorm):
    # if using BatchNorm2d_NHWC simultaneously with multiple streams set multi_stream to True
    def __init__(
        self,
        num_features,
        fuse_relu=False,
        bn_group=1,
        max_cta_per_sm=2,
        cta_launch_margin=12,
        multi_stream=False,
    ):
        super(BatchNorm2d_NHWC, self).__init__(num_features)

        self.fuse_relu = fuse_relu
        self.multi_stream = multi_stream

        self.minibatch_mean = torch.cuda.FloatTensor(num_features)
        self.minibatch_riv = torch.cuda.FloatTensor(num_features)

        # defaut to distributed bn disabled
        self.bn_group = bn_group
        self.max_cta_per_sm = max_cta_per_sm  # used only in training fwd and bwd
        self.cta_launch_margin = cta_launch_margin  # used only in training fwd and bwd
        self.my_data = None
        self.pair_data = None
        self.pair_data2 = None
        self.pair_data3 = None
        self.local_rank = 0
        self.magic = torch.IntTensor([0])

        # calculate cta per sm occupancies
        assert max_cta_per_sm > 0  # won't be able to do much with 0 CTAs :)
        self.fwd_occupancy = min(bnp.bn_fwd_nhwc_occupancy(), max_cta_per_sm)
        self.bwd_occupancy = min(bnp.bn_bwd_nhwc_occupancy(), max_cta_per_sm)
        self.addrelu_fwd_occupancy = min(bnp.bn_addrelu_fwd_nhwc_occupancy(), max_cta_per_sm)
        self.addrelu_bwd_occupancy = min(bnp.bn_addrelu_bwd_nhwc_occupancy(), max_cta_per_sm)

        # calculate grid dimentions based on occupancy numbers
        mp_count = torch.cuda.get_device_properties(None).multi_processor_count
        self.fwd_grid_dim_x = max(mp_count * self.fwd_occupancy - cta_launch_margin, 1)
        self.bwd_grid_dim_x = max(mp_count * self.bwd_occupancy - cta_launch_margin, 1)
        self.addrelu_fwd_grid_dim_x = max(
            mp_count * self.addrelu_fwd_occupancy - cta_launch_margin, 1
        )
        self.addrelu_bwd_grid_dim_x = max(
            mp_count * self.addrelu_bwd_occupancy - cta_launch_margin, 1
        )
        self.grid_dim_y = (num_features + 63) // 64

        # allocate scratch space used by implementation
        # TODO: scratch space that is not supposed to be exposed at user code. We only need one time initialization, the
        # same buffer could be reused in future iterations. Currently we exposed it here instead of requesting new
        # buffer from cache allocator to avoid unnecessary initialization at future iterations.
        self.ret_cta = torch.cuda.ByteTensor(8192).fill_(0)

        # FIXME: turn pair handles into an array
        if bn_group > 1:
            local_rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()
            assert world_size >= bn_group
            assert world_size % bn_group == 0

            bn_sync_steps = 1
            if bn_group == 4:
                bn_sync_steps = 2
            if bn_group == 8:
                bn_sync_steps = 3

            self.ipc_buffer = torch.cuda.ByteTensor(bnp.get_buffer_size(bn_sync_steps))
            self.my_data = bnp.get_data_ptr(self.ipc_buffer)
            # we are walking on very thin ice here by utilizing internal `_share_cuda_()`
            self.storage = self.ipc_buffer.storage()
            self.share_cuda = self.storage._share_cuda_()
            internal_cuda_mem = self.share_cuda
            # internal_cuda_mem[1]: ipc_mem_handle
            my_handle = torch.cuda.ByteTensor(np.frombuffer(internal_cuda_mem[1], dtype=np.uint8))
            # internal_cuda_mem[3]: offset
            my_offset = torch.cuda.IntTensor([internal_cuda_mem[3]])

            handles_all = torch.empty(
                world_size,
                my_handle.size(0),
                dtype=my_handle.dtype,
                device=my_handle.device,
            )
            handles_l = list(handles_all.unbind(0))
            torch.distributed.all_gather(handles_l, my_handle)

            offsets_all = torch.empty(
                world_size,
                my_offset.size(0),
                dtype=my_offset.dtype,
                device=my_offset.device,
            )
            offsets_l = list(offsets_all.unbind(0))
            torch.distributed.all_gather(offsets_l, my_offset)

            # whom do I actually care about? that would be local_rank XOR 1
            self.pair_handle = handles_l[local_rank ^ 1].cpu().contiguous()
            pair_offset = offsets_l[local_rank ^ 1].cpu()
            self.pair_data = bnp.get_remote_data_ptr(self.pair_handle, pair_offset)

            if bn_group > 2:
                self.pair_handle2 = handles_l[local_rank ^ 2].cpu().contiguous()
                pair_offset2 = offsets_l[local_rank ^ 2].cpu()
                self.pair_data2 = bnp.get_remote_data_ptr(self.pair_handle2, pair_offset2)

            if bn_group > 4:
                self.pair_handle3 = handles_l[local_rank ^ 4].cpu().contiguous()
                pair_offset3 = offsets_l[local_rank ^ 4].cpu()
                self.pair_data3 = bnp.get_remote_data_ptr(self.pair_handle3, pair_offset3)

            # FIXME: get magic value into C code and eliminate from here
            self.magic = torch.IntTensor([2])
            self.local_rank = local_rank

    def forward(self, x, z=None):
        if z is not None:
            assert self.fuse_relu == True
            return bn_addrelu_NHWC_impl.apply(
                x,
                z,
                self.weight,
                self.bias,
                self.running_mean,
                self.running_var,
                self.minibatch_mean,
                self.minibatch_riv,
                self.grid_dim_y,
                self.ret_cta,
                self.momentum,
                self.eps,
                self.training,
                self.bn_group,
                self.my_data,
                self.pair_data,
                (self.magic),
                self.pair_data2,
                self.pair_data3,
                self.addrelu_fwd_occupancy,
                self.addrelu_fwd_grid_dim_x,
                self.addrelu_bwd_occupancy,
                self.addrelu_bwd_grid_dim_x,
                self.multi_stream,
            )
        else:
            return bn_NHWC_impl.apply(
                x,
                self.weight,
                self.bias,
                self.running_mean,
                self.running_var,
                self.minibatch_mean,
                self.minibatch_riv,
                self.ret_cta,
                self.momentum,
                self.eps,
                self.fuse_relu,
                self.training,
                self.bn_group,
                self.my_data,
                self.pair_data,
                (self.magic),
                self.pair_data2,
                self.pair_data3,
                self.fwd_occupancy,
                self.fwd_grid_dim_x,
                self.bwd_occupancy,
                self.bwd_grid_dim_x,
                self.multi_stream,
            )

    def __del__(self):
        if self.bn_group > 1:
            bnp.close_remote_data(self.pair_handle)
            if self.bn_group > 2:
                bnp.close_remote_data(self.pair_handle2)
                if self.bn_group > 4:
                    bnp.close_remote_data(self.pair_handle3)


================================================
FILE: apex/contrib/index_mul_2d/__init__.py
================================================
from .index_mul_2d import index_mul_2d


================================================
FILE: apex/contrib/index_mul_2d/index_mul_2d.py
================================================
import torch

import fused_index_mul_2d


class IndexMul2d_(torch.autograd.Function):
    """
    Currently only support index in dimension 0 with a 2-dimension tensor.
    The shape of indexed in1 must be same with in2. Now this kernel does not support broadcast.
    The datatype must be float32 or float16.
    """

    @staticmethod
    def forward(ctx, in1: torch.Tensor, in2: torch.Tensor, idx1: torch.Tensor) -> torch.Tensor:
        assert in2.size(0) == idx1.size(0)
        if (in1.dtype != torch.float32 and in1.dtype != torch.half) or in2.dtype != in1.dtype:
            raise RuntimeError(
                "input1'dtype and input2's dtype must be fp32 or fp16. And input type must be same"
            )
        if in1.dim() != 2 or in2.dim() != 2:
            raise RuntimeError("in1 and in2 must be 2-dimension tensor.")
        if idx1.dim() != 1:
            raise RuntimeError("idx1 must be 1-dimension tensor.")

        if not in1.is_contiguous():
            in1 = in1.contiguous()
        if not in2.is_contiguous():
            in2 = in2.contiguous()
        if not idx1.is_contiguous():
            idx1 = idx1.contiguous()

        assert in1.is_contiguous()
        assert in2.is_contiguous()
        assert idx1.is_contiguous()

        out = torch.empty_like(in2)

        if in1.dtype == torch.float32:
            fused_index_mul_2d.float_forward(out, in1, in2, idx1)
        elif in1.dtype == torch.half:
            fused_index_mul_2d.half_forward(out, in1, in2, idx1)

        ctx.for_backwards = (in1, in2, idx1)
        return out

    @staticmethod
    def backward(ctx, grad_out):
        in1, in2, idx1 = ctx.for_backwards

        grad_in1, grad_in2 = index_mul_2d_backward(in1, in2, idx1, grad_out)

        return grad_in1, grad_in2, None


class IndexMul2dBackward_(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        in1: torch.Tensor,
        in2: torch.Tensor,
        idx1: torch.Tensor,
        grad_out: torch.Tensor,
    ) -> torch.Tensor:
        if not in1.is_contiguous():
            in1 = in1.contiguous()
        if not in2.is_contiguous():
            in2 = in2.contiguous()
        if not idx1.is_contiguous():
            idx1 = idx1.contiguous()
        if not grad_out.is_contiguous():
            grad_out = grad_out.contiguous()

        assert in1.is_contiguous()
        assert in2.is_contiguous()
        assert idx1.is_contiguous()
        assert grad_out.is_contiguous()

        grad_in1 = torch.zeros_like(in1)
        grad_in2 = torch.empty_like(in2)

        if in1.dtype == torch.float32:
            fused_index_mul_2d.float_backward(grad_in1, grad_in2, grad_out, in1, in2, idx1)
        elif in1.dtype == torch.half:
            fused_index_mul_2d.half_backward(grad_in1, grad_in2, grad_out, in1, in2, idx1)

        ctx.for_backwards = (in1, in2, idx1, grad_out)
        return grad_in1, grad_in2

    @staticmethod
    def backward(ctx, grad_grad_in1, grad_grad_in2):
        if not grad_grad_in1.is_contiguous():
            grad_grad_in1 = grad_grad_in1.contiguous()
        if not grad_grad_in2.is_contiguous():
            grad_grad_in2 = grad_grad_in2.contiguous()

        assert grad_grad_in1.is_contiguous()
        assert grad_grad_in2.is_contiguous()

        in1, in2, idx1, grad_out = ctx.for_backwards

        grad_in1 = torch.zeros_like(in1)
        grad_in2 = torch.empty_like(in2)
        grad_grad_out = torch.empty_like(grad_out)

        if in1.dtype == torch.float32:
            fused_index_mul_2d.float_backward_backward(
                grad_grad_out,
                grad_in1,
                grad_in2,
                grad_out,
                grad_grad_in1,
                grad_grad_in2,
                in1,
                in2,
                idx1,
            )
        elif in1.dtype == torch.half:
            fused_index_mul_2d.half_backward_backward(
                grad_grad_out,
                grad_in1,
                grad_in2,
                grad_out,
                grad_grad_in1,
                grad_grad_in2,
                in1,
                in2,
                idx1,
            )

        return grad_in1, grad_in2, None, grad_grad_out


index_mul_2d = IndexMul2d_.apply
index_mul_2d_backward = IndexMul2dBackward_.apply


================================================
FILE: apex/contrib/layer_norm/__init__.py
================================================
from .layer_norm import FastLayerNorm


================================================
FILE: apex/contrib/layer_norm/layer_norm.py
================================================
import torch
from torch.nn import init

from apex._autocast_utils import _cast_if_autocast_enabled
import fast_layer_norm


class FastLayerNormFN(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, gamma, beta, epsilon, memory_efficient=False):
        ctx.x_shape = x.shape
        ctx.memory_efficient = memory_efficient

        x = x.contiguous()
        gamma = gamma.contiguous()
        beta = beta.contiguous()
        hidden_size = gamma.numel()
        xmat = x.view((-1, hidden_size))
        ymat, mu, rsigma = fast_layer_norm.ln_fwd(xmat, gamma, beta, epsilon)
        if ctx.memory_efficient:
            ctx.save_for_backward(ymat, gamma, None, rsigma, beta)
        else:
            ctx.save_for_backward(xmat, gamma, mu, rsigma, None)
        return ymat.view(x.shape)

    @staticmethod
    def backward(ctx, dy):
        # assert dy.is_contiguous()
        dy = dy.contiguous()  # this happens!
        x_or_y_mat, gamma, mu, rsigma, beta = ctx.saved_tensors
        dymat = dy.view(x_or_y_mat.shape)
        dxmat, dgamma, dbeta, _, _ = fast_layer_norm.ln_bwd(
            dymat, x_or_y_mat, mu, rsigma, gamma, beta, ctx.memory_efficient
        )
        dx = dxmat.view(ctx.x_shape)
        return dx, dgamma, dbeta, None, None


def _fast_layer_norm(x, weight, bias, epsilon, memory_efficient):
    args = _cast_if_autocast_enabled(x, weight, bias, epsilon, memory_efficient)
    with torch.amp.autocast("cuda", enabled=False):
        return FastLayerNormFN.apply(*args)


class FastLayerNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-5, memory_efficient=False):
        super().__init__()
        self.epsilon = eps
        self.memory_efficient = memory_efficient
        self.weight = torch.nn.Parameter(torch.empty(hidden_size))
        self.bias = torch.nn.Parameter(torch.empty(hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        init.ones_(self.weight)
        init.zeros_(self.bias)

    def forward(self, x):
        return _fast_layer_norm(x, self.weight, self.bias, self.epsilon, self.memory_efficient)


================================================
FILE: apex/contrib/multihead_attn/README.md
================================================
# Fast Multihead Attention 

This implementation has two main features :
* A C++ implementation to avoid the CPU overheads of Pytorch found with smaller batch sizes.
* The removal of all copies and transposes found in standard implementations of Multihead Attention.

|                                            | Python Version | C++ Version |
| :----------------------------------------- | :------------: | :---------: |
| Layer Norm and Residual Add Variant        | X              | X           |
| Includes Linear Biases                     | X              |             |
| Reduces CPU Overheads                      |                | X           |
| Fuses masking with Softmax                 |                | X           |
| Removes Transposes and Copies              | X              | X           |
| Includes Self and Encoder/Decoder Variants | X              | X           |

## How to Instantiate

`SelfMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)`
`EncdecMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)`

 `impl` has two options:
 * `fast` uses C++ Version
 * `default` uses Python Version

## Instructions to build on Linux

```
$ git clone https://github.com/NVIDIA/apex
$ cd apex
$ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" ./
```
## Try Performance Tests Yourself!
Perf test script is found here!
```
cd contrib/examples/multihead_attn
```
#### Fast Multihead Attention
```
python perf_test_multihead_attn.py --ref
```
#### Fast Multihead Attention with C++ Implementation
```
python perf_test_multihead_attn.py
```
#### Compare with `torch.nn.MultiheadAttn`
```
python perf_test_multihead_attn.py --native
```
#### Test your own range!
```
python perf_test_multihead_attn.py --seq-length 64 --num-seqs-start 10 --num-seqs-stop 120 --num-seqs-inc 5
```

## Performance Comparisons

* Performance was measured with 64 token sequence lengths on an NVIDIA TitanV card.
* Time is measured across multiple layers to simulate an in model scenario.

![Multihead Attention Forward](MHA_fwd.png)
![Multihead Attention Backward](MHA_bwd.png)


================================================
FILE: apex/contrib/multihead_attn/__init__.py
================================================
from .self_multihead_attn import SelfMultiheadAttn
from .encdec_multihead_attn import EncdecMultiheadAttn
from .mask_softmax_dropout_func import fast_mask_softmax_dropout_func


================================================
FILE: apex/contrib/multihead_attn/encdec_multihead_attn.py
================================================
import math

import torch
from torch import nn
from torch.nn import Parameter
import torch.nn.functional as F

from .encdec_multihead_attn_func import encdec_attn_func
from .fast_encdec_multihead_attn_func import fast_encdec_attn_func
from .fast_encdec_multihead_attn_norm_add_func import fast_encdec_attn_norm_add_func
from apex.normalization.fused_layer_norm import FusedLayerNorm


@torch.jit.script
def jit_dropout_add(x, residual, prob, is_training):
    # type: (Tensor, Tensor, float, bool) -> Tensor
    out = F.dropout(x, p=prob, training=True)
    out = residual + out
    return out


class EncdecMultiheadAttn(nn.Module):
    """Multi-headed attention.

    See "Attention Is All You Need" for more details.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=False,
        include_norm_add=False,
        impl="fast",
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, (
            "embed_dim must be divisible by num_heads"
        )
        self.bias = bias
        self.include_norm_add = include_norm_add
        self.impl = impl
        self.scaling = self.head_dim**-0.5

        self.in_proj_weight_q = Parameter(torch.empty(embed_dim, embed_dim))
        self.in_proj_weight_kv = Parameter(torch.empty(2 * embed_dim, embed_dim))
        self.out_proj_weight = Parameter(torch.empty(embed_dim, embed_dim))
        if self.bias:
            assert impl != "fast", "ERROR! The Fast implementation does not support biases!"
            self.in_proj_bias_q = Parameter(torch.empty(embed_dim))
            self.in_proj_bias_kv = Parameter(torch.empty(2 * embed_dim))
            self.out_proj_bias = Parameter(torch.empty(embed_dim))
        else:
            self.register_parameter("in_proj_bias_q", None)
            self.register_parameter("in_proj_bias_kv", None)
            self.in_proj_bias_q = None
            self.in_proj_bias_kv = None
            self.out_proj_bias = None
        if self.include_norm_add:
            if impl == "fast":
                self.lyr_nrm_gamma_weights = Parameter(torch.empty(embed_dim))
                self.lyr_nrm_beta_weights = Parameter(torch.empty(embed_dim))
                self.lyr_nrm = None
            else:
                self.register_parameter("lyr_norm_gamma_weights", None)
                self.register_parameter("lyr_norm_beta_weights", None)
                self.lyr_nrm_gamma_weights = None
                self.lyr_nrm_beta_weights = None
                self.lyr_nrm = FusedLayerNorm(embed_dim)
        self.reset_parameters()

        if self.include_norm_add:
            if impl == "fast":
                self.attn_func = fast_encdec_attn_norm_add_func
            elif impl == "default":
                self.attn_func = encdec_attn_func
            else:
                assert False, "Unsupported impl: {} !".format(impl)
        else:
            if impl == "fast":
                self.attn_func = fast_encdec_attn_func
            elif impl == "default":
                self.attn_func = encdec_attn_func
            else:
                assert False, "Unsupported impl: {} !".format(impl)

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.in_proj_weight_q)
        # in_proj_weight_kv has shape [2 * hidden, hidden] but it should be
        # initialized like a [hidden, hidden] matrix.
        # sqrt(6 / (hidden + hidden)) / sqrt(6 / (2 * hidden + hidden)) = sqrt(1.5)
        # therefore xavier_uniform gain should be set to sqrt(1.5).
        nn.init.xavier_uniform_(self.in_proj_weight_kv, gain=math.sqrt(1.5))
        nn.init.xavier_uniform_(self.out_proj_weight)
        if self.bias:
            nn.init.constant_(self.in_proj_bias_q, 0.0)
            nn.init.constant_(self.in_proj_bias_kv, 0.0)
            nn.init.constant_(self.out_proj_bias, 0.0)
        if self.include_norm_add:
            if self.impl == "fast":
                nn.init.ones_(self.lyr_nrm_gamma_weights)
                nn.init.zeros_(self.lyr_nrm_beta_weights)
            else:
                self.lyr_nrm.reset_parameters()

    def forward(
        self,
        query,
        key,
        value,
        key_padding_mask=None,
        need_weights=False,
        attn_mask=None,
        is_training=True,
    ):
        """Input shape: Time x Batch x Channel

        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Future timesteps can be masked with the
        `mask_future_timesteps` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """

        if key_padding_mask is not None:
            assert attn_mask is None, (
                "ERROR attn_mask and key_padding_mask should not be both defined!"
            )
            mask = key_padding_mask
        elif attn_mask is not None:
            mask = attn_mask
        else:
            mask = None

        if self.include_norm_add:
            if self.impl == "fast":
                outputs = self.attn_func(
                    attn_mask is not None,
                    is_training,
                    self.num_heads,
                    query,
                    key,
                    self.lyr_nrm_gamma_weights,
                    self.lyr_nrm_beta_weights,
                    self.in_proj_weight_q,
                    self.in_proj_weight_kv,
                    self.out_proj_weight,
                    mask,
                    self.dropout,
                )
            else:
                lyr_nrm_results = self.lyr_nrm(query)
                outputs = self.attn_func(
                    attn_mask is not None,
                    is_training,
                    self.num_heads,
                    self.scaling,
                    lyr_nrm_results,
                    key,
                    self.in_proj_weight_q,
                    self.in_proj_weight_kv,
                    self.out_proj_weight,
                    self.in_proj_bias_q,
                    self.in_proj_bias_kv,
                    self.out_proj_bias,
                    mask,
                    self.dropout,
                )
                if is_training:
                    outputs = jit_dropout_add(outputs, query, self.dropout, is_training)
                else:
                    outputs = outputs + query
        else:
            if self.impl == "fast":
                outputs = self.attn_func(
                    attn_mask is not None,
                    is_training,
                    self.num_heads,
                    query,
                    key,
                    self.in_proj_weight_q,
                    self.in_proj_weight_kv,
                    self.out_proj_weight,
                    mask,
                    self.dropout,
                )
            else:
                outputs = self.attn_func(
                    attn_mask is not None,
                    is_training,
                    self.num_heads,
                    self.scaling,
                    query,
                    key,
                    self.in_proj_weight_q,
                    self.in_proj_weight_kv,
                    self.out_proj_weight,
                    self.in_proj_bias_q,
                    self.in_proj_bias_kv,
                    self.out_proj_bias,
                    mask,
                    self.dropout,
                )

        return outputs, None


================================================
FILE: apex/contrib/multihead_attn/encdec_multihead_attn_func.py
================================================
import torch
import torch.nn.functional as F


class EncdecAttnFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        use_time_mask,
        is_training,
        heads,
        scale,
        inputs_q,
        inputs_kv,
        input_weights_q,
        input_weights_kv,
        output_weights,
        input_biases_q,
        input_biases_kv,
        output_biases,
        mask,
        dropout_prob,
    ):
        from apex import deprecated_warning

        deprecated_warning(
            "`apex.contrib.multihead_attn` is deprecated and will be removed in July 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention"
            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )

        use_biases_t = torch.tensor([input_biases_q is not None])
        heads_t = torch.tensor([heads])
        scale_t = torch.tensor([scale])
        dropout_prob_t = torch.tensor([dropout_prob])
        null_tensor = torch.tensor([])
        head_dim = inputs_q.size(2) // heads

        # Input Linear GEMM Q
        # input1: (activations) [seql_q, seqs, embed_dim(1024)]
        # input2: (weights)     [embed_dim (1024), embed_dim (1024)] (transpose [0,1])
        # output:               [seql_q, seqs, embed_dim]
        # GEMM: ( (seql_q*seqs) x embed_dim ) x ( embed_dim x embed_dim ) = (seql_q*seqs x embed_dim)
        if use_biases_t[0]:
            input_lin_q_results = torch.addmm(
                input_biases_q,
                inputs_q.view(inputs_q.size(0) * inputs_q.size(1), inputs_q.size(2)),
                input_weights_q.transpose(0, 1),
                beta=1.0,
                alpha=1.0,
            )
        else:
            input_lin_q_results = torch.mm(
                inputs_q.view(inputs_q.size(0) * inputs_q.size(1), inputs_q.size(2)),
                input_weights_q.transpose(0, 1),
            )
        input_lin_q_results = input_lin_q_results.view(
            inputs_q.size(0), inputs_q.size(1), input_weights_q.size(0)
        )
        # Input Linear GEMM KV
        # input1: (activations) [seql_k, seqs, embed_dim(1024)]
        # input2: (weights)     [embed_dim*2 (2048), embed_dim (1024)] (transpose [0,1])
        # output:               [seql_k, seqs, embed_dim*2]
        # GEMM: ( (seql_k*seqs) x embed_dim ) x ( embed_dim x embed_dim*2 ) = (seql_k*seqs x embed_dim*2)
        if use_biases_t[0]:
            input_lin_kv_results = torch.addmm(
                input_biases_kv,
                inputs_kv.view(inputs_kv.size(0) * inputs_kv.size(1), inputs_kv.size(2)),
                input_weights_kv.transpose(0, 1),
                beta=1.0,
                alpha=1.0,
            )
        else:
            input_lin_kv_results = torch.mm(
                inputs_kv.view(inputs_kv.size(0) * inputs_kv.size(1), inputs_kv.size(2)),
                input_weights_kv.transpose(0, 1),
            )
        input_lin_kv_results = input_lin_kv_results.view(
            inputs_kv.size(0), inputs_kv.size(1), input_weights_kv.size(0)
        )

        # Slice out k,v from one big Input Linear outuput (should only impact meta data, no copies!)
        # Sequences and heads are combined to make the batch of the Batched GEMM
        # input_lin_kv_results: [seql_k, seqs, heads(16), 2, head_dim(64)]
        # input_lin_kv_results: [seql_k, batches=seqs*heads, 2, head_dim]
        queries = input_lin_q_results.view(inputs_q.size(0), inputs_q.size(1) * heads, head_dim)
        input_lin_kv_results = input_lin_kv_results.view(
            inputs_kv.size(0), inputs_kv.size(1) * heads, 2, head_dim
        )
        keys = input_lin_kv_results[:, :, 0, :]
        values = input_lin_kv_results[:, :, 1, :]

        # Matmul1 Batched GEMMs
        # The output tensor is specified prior to the Batch GEMM because baddbmm requires its specification
        # baddbmm is used to apply the scale parameter via the Batched GEMM's alpha parameter instead of
        # a separate elementwise operation.
        # Input1: (Queries) [seql_q, seqs*heads, head_dim] tranpose(0,1)
        # Input2: (Keys)    [seql_k, seqs*heads, head_dim] transpose(0,1)
        # output:           [seqs*heads, seql_q, seql_k]
        # GEMM: Per batch: ( seql_q x head_dim ) x ( head_dim x seql_k ) = ( seql_q x seql_k )
        matmul1_results = torch.empty(
            (queries.size(1), queries.size(0), keys.size(0)),
            dtype=queries.dtype,
            device=torch.device("cuda"),
        )
        matmul1_results = torch.baddbmm(
            matmul1_results,
            queries.transpose(0, 1),
            keys.transpose(0, 1).transpose(1, 2),
            out=matmul1_results,
            beta=0.0,
            alpha=scale_t[0],
        )

        if mask is not None:
            # Self Attention Time Mask
            if use_time_mask:
                assert len(mask.size()) == 2, "Timing mask is not 2D!"
                assert mask.size(0) == mask.size(1), "Sequence length should match!"
                mask = mask.to(torch.bool)
                matmul1_results = matmul1_results.masked_fill_(mask, float("-inf"))
            # Key Padding Mask
            else:
                batches, seql_q, seql_k = matmul1_results.size()
                seqs = int(batches / heads)
                matmul1_results = matmul1_results.view(seqs, heads, seql_q, seql_k)
                mask = mask.to(torch.bool)
                matmul1_results = matmul1_results.masked_fill_(
                    mask.unsqueeze(1).unsqueeze(2), float("-inf")
                )
                matmul1_results = matmul1_results.view(seqs * heads, seql_q, seql_k)

        softmax_results = F.softmax(matmul1_results, dim=-1)

        # Dropout - is not executed for inference
        if is_training:
            dropout_results, dropout_mask = torch._fused_dropout(
                softmax_results, p=(1.0 - dropout_prob_t[0])
            )
        else:
            dropout_results = softmax_results
            dropout_mask = null_tensor

        # Matmul2 Batched GEMMs
        # The output tensor specification is needed here to specify the non-standard output.
        # Given that pytorch cannot currently perform autograd with an output tensor specified,
        # this requires a backward pass specified.
        # Input1: from_softmax [seqs*heads, seql_q, seql_k]
        # Input2: (values)     [seql_v, seqs*heads, head_dim] transpose(0,1)
        # Output:              [seql_q, seqs*heads, head_dim] transpose(0,1)
        # GEMM: Per batch: ( seql_q x seql_k ) x ( seql_k x head_dim ) = (seql_q x head_dim)
        matmul2_results = torch.empty(
            (dropout_results.size(1), dropout_results.size(0), values.size(2)),
            dtype=dropout_results.dtype,
            device=torch.device("cuda"),
        ).transpose(1, 0)
        matmul2_results = torch.bmm(dropout_results, values.transpose(0, 1), out=matmul2_results)
        matmul2_results = (
            matmul2_results.transpose(0, 1)
            .contiguous()
            .view(inputs_q.size(0), inputs_q.size(1), inputs_q.size(2))
        )

        # Output Linear GEMM
        # Input1: (activations) [seql_q, seqs, embed_dim=heads*head_dim]
        # Input2: (weights)     [ embed_dim, embed_dim ] transpose(0,1)
        # Output:               [ seql_q, seqs, embed_dim ]
        # GEMM: ( seql_q*seqs x embed_dim ) x ( embed_dim x embed_dim ) = ( seql_q*seqs x embed_dim )
        if use_biases_t[0]:
            outputs = torch.addmm(
                output_biases,
                matmul2_results.view(inputs_q.size(0) * inputs_q.size(1), inputs_q.size(2)),
                output_weights.transpose(0, 1),
                beta=1.0,
                alpha=1.0,
            )
        else:
            outputs = torch.mm(
                matmul2_results.view(inputs_q.size(0) * inputs_q.size(1), inputs_q.size(2)),
                output_weights.transpose(0, 1),
            )
        outputs = outputs.view(inputs_q.size(0), inputs_q.size(1), output_weights.size(0))

        ctx.save_for_backward(
            use_biases_t,
            heads_t,
            scale_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_q_results,
            input_lin_kv_results,
            inputs_q,
            inputs_kv,
            input_weights_q,
            input_weights_kv,
            output_weights,
            dropout_mask,
            dropout_prob_t,
        )

        return outputs.detach()

    @staticmethod
    def backward(ctx, output_grads):
        (
            use_biases_t,
            heads_t,
            scale_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_q_results,
            input_lin_kv_results,
            inputs_q,
            inputs_kv,
            input_weights_q,
            input_weights_kv,
            output_weights,
            dropout_mask,
            dropout_prob_t,
        ) = ctx.saved_tensors

        head_dim = inputs_q.size(2) // heads_t[0]

        # Slice out k,v from one big Input Linear outuput (should only impact meta data, no copies!)
        # Sequences and heads are combined to make the batch of the Batched GEMM
        # input_lin_kv_results: [seql_k, seqs, heads(16), 2, head_dim(64)]
        # input_lin_kv_results: [seql_k, batches=seqs*heads, 2, head_dim]
        queries = input_lin_q_results.view(
            inputs_q.size(0), inputs_q.size(1) * heads_t[0], head_dim
        )
        input_lin_kv_results = input_lin_kv_results.view(
            inputs_kv.size(0), inputs_kv.size(1) * heads_t[0], 2, head_dim
        )
        keys = input_lin_kv_results[:, :, 0, :]
        values = input_lin_kv_results[:, :, 1, :]

        # Slice out k,v from one big set of gradients entering the input linear's bprop  (should only impact meta data, no copies!)
        # The gradients are identical in size to the Input Linear outputs.
        # The tensor is declared before hand to properly slice out query, key, and value grads.
        input_lin_kv_results_grads = torch.empty_like(input_lin_kv_results)
        queries_grads = torch.empty_like(queries)
        keys_grads = input_lin_kv_results_grads[:, :, 0, :]
        values_grads = input_lin_kv_results_grads[:, :, 1, :]

        # Output Linear GEMM - DGRAD
        # Input1: (data grads)  [seql_q, seqs, embed_dim=heads*head_dim]
        # Input2: (weights)     [ embed_dim, embed_dim ]
        # Output:               [ seql_q, seqs, embed_dim ]
        # GEMM: ( seql_q*seqs x embed_dim ) x ( embed_dim x embed_dim ) = ( seql_q*seqs x embed_dim )
        output_lin_grads = torch.mm(
            output_grads.view(output_grads.size(0) * output_grads.size(1), output_grads.size(2)),
            output_weights,
        )
        output_lin_grads = output_lin_grads.view(
            output_grads.size(0), output_grads.size(1), output_weights.size(1)
        )
        # Output Linear GEMM - WGRAD
        # Input1: (data grads)  [seql_q*seqs, embed_dim=heads*head_dim] transpose(0,1)
        # Input2: (activations) [seql_q*seqs, embed_dim ]
        # Output:               [ seql_q, seqs, embed_dim ]
        # GEMM: ( embed_dim x seql_q*seqs ) x ( seql_q*seqs x embed_dim ) = ( embed_dim x embed_dim )
        output_weight_grads = torch.mm(
            output_grads.view(
                output_grads.size(0) * output_grads.size(1), output_grads.size(2)
            ).transpose(0, 1),
            matmul2_results.view(
                matmul2_results.size(0) * matmul2_results.size(1),
                matmul2_results.size(2),
            ),
        )
        output_lin_grads = output_lin_grads.view(
            output_grads.size(0), output_grads.size(1) * heads_t[0], head_dim
        ).transpose(0, 1)

        if use_biases_t[0]:
            output_bias_grads = torch.sum(
                output_grads.view(
                    output_grads.size(0) * output_grads.size(1), output_grads.size(2)
                ),
                0,
            )
        else:
            output_bias_grads = None

        # Matmul2 - DGRAD1
        # Input1: (data grads)  [seql_q, seqs*heads, head_dim] transpose(0,1)
        # Input2: (activations) [seql_k, seqs*heads, head_dim] transpose(0,1).transpose(1,2)
        # Output:               [seqs*heads, seql_q, seql_k]
        # GEMM: Per batch: ( seql_q x head_dim ) x ( head_dim x seql_k ) = ( seql_q x seql_k )
        matmul2_dgrad1 = torch.bmm(output_lin_grads, values.transpose(0, 1).transpose(1, 2))
        # Matmul2 - DGRAD2
        # Input1: (data grads)  [seql_q, seqs*heads, head_dim] transpose(0,1)
        # Input2: (activations) [seql_k, seqs*heads, head_dim] transpose(0,1).transpose(1,2)
        # Output:               [seqs*heads, seql_q, seql_k]
        # GEMM: Per batch: ( seql_q x head_dim ) x ( head_dim x seql_k ) = ( seql_q x seql_k )
        values_grads = torch.bmm(
            dropout_results.transpose(1, 2),
            output_lin_grads,
            out=values_grads.transpose(0, 1),
        )

        # Mask and Scaling for Dropout (not a publically documented op)
        dropout_grads = torch._masked_scale(
            matmul2_dgrad1, dropout_mask, 1.0 / (1.0 - dropout_prob_t[0])
        )

        # Softmax Grad (not a publically documented op)
        softmax_grads = torch._softmax_backward_data(
            dropout_grads, softmax_results, -1, softmax_results.dtype
        )

        # Matmul1 - DGRAD1
        # Input1: (data grads)  [seqs*heads, seql_q, seql_k]
        # Input2: (activations) [seql_k, seqs*heads, head_dim] transpose(0,1)
        # Output:               [seqs*heads, seql_q, head_dim] transpose(0,1)
        # GEMM: Per batch: ( seql_q x seql_k ) x ( seql_k x head_dim ) = ( seql_q x head_dim )
        queries_grads = torch.baddbmm(
            queries_grads.transpose(0, 1),
            softmax_grads,
            keys.transpose(0, 1),
            out=queries_grads.transpose(0, 1),
            beta=0.0,
            alpha=scale_t[0],
        )
        # Matmul1 - DGRAD2
        # Input1: (data grads)  [seqs*heads, seql_q, seql_k] transpose(1,2)
        # Input2: (activations) [seql_q, seqs*heads, head_dim] transpose(0,1)
        # Output:               [seqs*heads, seql_k, head_dim] transpose(0,1)
        # GEMM: Per batch: ( seql_k x seql_q ) x ( seql_q x head_dim ) = ( seql_k x head_dim )
        keys_grads = torch.baddbmm(
            keys_grads.transpose(0, 1),
            softmax_grads.transpose(1, 2),
            queries.transpose(0, 1),
            out=keys_grads.transpose(0, 1),
            beta=0.0,
            alpha=scale_t[0],
        )

        # Input Q Linear GEMM - DGRAD
        # input1: (data grads) [seql_q, seqs, embed_dim(1024)]
        # input2: (weights)    [embed_dim (1024), embed_dim (1024)]
        # output:              [seql_q, seqs, embed_dim]
        # GEMM: ( (seql_q*seqs) x embed_dim ) x ( embed_dim x embed_dim ) = (seql_q*seqs x embed_dim)
        queries_grads = queries_grads.transpose(0, 1).view(
            inputs_q.size(0) * inputs_q.size(1), heads_t[0] * head_dim
        )
        input_q_grads = torch.mm(queries_grads, input_weights_q)
        input_q_grads = input_q_grads.view(inputs_q.size(0), inputs_q.size(1), inputs_q.size(2))
        # Input KV Linear GEMM - DGRAD
        # input1: (data grads) [seql_k, seqs, 2*embed_dim(2048)]
        # input2: (weights)    [embed_dim*2 (2048), embed_dim (1024)]
        # output:              [seql_k, seqs, embed_dim]
        # GEMM: ( (seql_k*seqs) x 2*embed_dim ) x ( 2*embed_dim x embed_dim ) = (seql_k*seqs x embed_dim)
        input_lin_kv_results_grads = input_lin_kv_results_grads.view(
            inputs_kv.size(0) * inputs_kv.size(1), heads_t[0] * 2 * head_dim
        )
        input_kv_grads = torch.mm(input_lin_kv_results_grads, input_weights_kv)
        input_kv_grads = input_kv_grads.view(
            inputs_kv.size(0), inputs_kv.size(1), inputs_kv.size(2)
        )
        # Input Q Linear GEMM - WGRAD
        # input1: (data grads)  [seql_q*seqs, embed_dim(1024)]
        # input2: (activations) [seql_q*seqs, embed_dim(1024)]
        # output:               [embed_dim, embed_dim]
        # GEMM: ( embed_dim x seql_q*seqs ) x ( seql_q*seqs x embed_dim ) = (embed_dim x embed_dim)
        input_weight_q_grads = torch.mm(
            queries_grads.transpose(0, 1),
            inputs_q.view(inputs_q.size(0) * inputs_q.size(1), inputs_q.size(2)),
        )
        # Input KV Linear GEMM - WGRAD
        # input1: (data grads)  [seql_k*seqs, 2*embed_dim(2048)]
        # input2: (activations) [seql_k*seqs, embed_dim(1024)]
        # output:               [2*embed_dim, embed_dim]
        # GEMM: ( 2*embed_dim x seql_k*seqs ) x ( seql_k*seqs x embed_dim ) = (2*embed_dim x embed_dim)
        input_weight_kv_grads = torch.mm(
            input_lin_kv_results_grads.transpose(0, 1),
            inputs_kv.view(inputs_kv.size(0) * inputs_kv.size(1), inputs_kv.size(2)),
        )

        if use_biases_t[0]:
            input_bias_grads_q = torch.sum(queries_grads, 0)
            input_bias_grads_kv = torch.sum(input_lin_kv_results_grads, 0)
        else:
            input_bias_grads_q = None
            input_bias_grads_kv = None

        return (
            None,
            None,
            None,
            None,
            input_q_grads,
            input_kv_grads,
            input_weight_q_grads,
            input_weight_kv_grads,
            output_weight_grads,
            input_bias_grads_q,
            input_bias_grads_kv,
            output_bias_grads,
            None,
            None,
        )


encdec_attn_func = EncdecAttnFunc.apply


================================================
FILE: apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
================================================
import torch

import fast_multihead_attn


class FastEncdecAttnFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        use_time_mask,
        is_training,
        heads,
        inputs_q,
        inputs_kv,
        input_weights_q,
        input_weights_kv,
        output_weights,
        pad_mask,
        dropout_prob,
    ):
        from apex import deprecated_warning

        deprecated_warning(
            "`apex.contrib.multihead_attn` is deprecated and will be removed in July 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention"
            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )

        heads_t = torch.tensor([heads])
        dropout_prob_t = torch.tensor([dropout_prob])
        null_tensor = torch.tensor([])
        use_mask = pad_mask is not None

        (
            input_lin_q_results,
            input_lin_kv_results,
            softmax_results,
            dropout_results,
            dropout_mask,
            matmul2_results,
            outputs,
        ) = fast_multihead_attn.encdec_multihead_attn_forward(
            use_mask,
            use_time_mask,
            is_training,
            heads,
            inputs_q,
            inputs_kv,
            input_weights_q,
            input_weights_kv,
            output_weights,
            pad_mask if use_mask else null_tensor,
            dropout_prob,
        )

        ctx.save_for_backward(
            heads_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_q_results,
            input_lin_kv_results,
            inputs_q,
            inputs_kv,
            input_weights_q,
            input_weights_kv,
            output_weights,
            dropout_mask,
            dropout_prob_t,
        )

        return outputs.detach()

    @staticmethod
    def backward(ctx, output_grads):
        (
            heads_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_q_results,
            input_lin_kv_results,
            inputs_q,
            inputs_kv,
            input_weights_q,
            input_weights_kv,
            output_weights,
            dropout_mask,
            dropout_prob_t,
        ) = ctx.saved_tensors

        (
            input_q_grads,
            input_kv_grads,
            input_weight_q_grads,
            input_weight_kv_grads,
            output_weight_grads,
        ) = fast_multihead_attn.encdec_multihead_attn_backward(
            heads_t[0],
            output_grads,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_q_results,
            input_lin_kv_results,
            inputs_q,
            inputs_kv,
            input_weights_q,
            input_weights_kv,
            output_weights,
            dropout_mask,
            dropout_prob_t[0],
        )

        return (
            None,
            None,
            None,
            input_q_grads,
            input_kv_grads,
            input_weight_q_grads,
            input_weight_kv_grads,
            output_weight_grads,
            None,
            None,
        )


fast_encdec_attn_func = FastEncdecAttnFunc.apply


================================================
FILE: apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
================================================
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import torch

import fast_multihead_attn


class FastEncdecAttnNormAddFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        use_time_mask,
        is_training,
        heads,
        inputs_q,
        inputs_kv,
        lyr_nrm_gamma_weights,
        lyr_nrm_beta_weights,
        input_weights_q,
        input_weights_kv,
        output_weights,
        pad_mask,
        dropout_prob,
    ):
        from apex import deprecated_warning

        deprecated_warning(
            "`apex.contrib.multihead_attn` is deprecated and will be removed in July 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention"
            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )

        heads_t = torch.tensor([heads])
        dropout_prob_t = torch.tensor([dropout_prob])
        null_tensor = torch.tensor([])
        use_mask = pad_mask is not None

        (
            lyr_nrm_results,
            lyr_nrm_mean,
            lyr_nrm_invvar,
            input_lin_q_results,
            input_lin_kv_results,
            softmax_results,
            dropout_results,
            dropout_mask,
            matmul2_results,
            dropout_add_mask,
            outputs,
        ) = fast_multihead_attn.encdec_multihead_attn_norm_add_forward(
            use_mask,
            use_time_mask,
            is_training,
            heads,
            inputs_q,
            inputs_kv,
            lyr_nrm_gamma_weights,
            lyr_nrm_beta_weights,
            input_weights_q,
            input_weights_kv,
            output_weights,
            pad_mask if use_mask else null_tensor,
            dropout_prob,
        )

        ctx.save_for_backward(
            heads_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_q_results,
            input_lin_kv_results,
            lyr_nrm_results,
            lyr_nrm_mean,
            lyr_nrm_invvar,
            inputs_q,
            inputs_kv,
            lyr_nrm_gamma_weights,
            lyr_nrm_beta_weights,
            input_weights_q,
            input_weights_kv,
            output_weights,
            dropout_mask,
            dropout_add_mask,
            dropout_prob_t,
        )

        return outputs.detach()

    @staticmethod
    def backward(ctx, output_grads):
        (
            heads_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_q_results,
            input_lin_kv_results,
            lyr_nrm_results,
            lyr_nrm_mean,
            lyr_nrm_invvar,
            inputs_q,
            inputs_kv,
            lyr_nrm_gamma_weights,
            lyr_nrm_beta_weights,
            input_weights_q,
            input_weights_kv,
            output_weights,
            dropout_mask,
            dropout_add_mask,
            dropout_prob_t,
        ) = ctx.saved_tensors

        (
            input_q_grads,
            input_kv_grads,
            lyr_nrm_gamma_grads,
            lyr_nrm_beta_grads,
            input_weight_q_grads,
            input_weight_kv_grads,
            output_weight_grads,
        ) = fast_multihead_attn.encdec_multihead_attn_norm_add_backward(
            heads_t[0],
            output_grads,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_q_results,
            input_lin_kv_results,
            lyr_nrm_results,
            lyr_nrm_mean,
            lyr_nrm_invvar,
            inputs_q,
            inputs_kv,
            lyr_nrm_gamma_weights,
            lyr_nrm_beta_weights,
            input_weights_q,
            input_weights_kv,
            output_weights,
            dropout_mask,
            dropout_add_mask,
            dropout_prob_t[0],
        )

        # import pdb; pdb.set_trace()
        return (
            None,
            None,
            None,
            input_q_grads,
            input_kv_grads,
            lyr_nrm_gamma_grads,
            lyr_nrm_beta_grads,
            input_weight_q_grads,
            input_weight_kv_grads,
            output_weight_grads,
            None,
            None,
        )


fast_encdec_attn_norm_add_func = FastEncdecAttnNormAddFunc.apply


================================================
FILE: apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
================================================
import torch

import fast_multihead_attn


class FastSelfAttnFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        use_time_mask,
        is_training,
        heads,
        inputs,
        input_weights,
        output_weights,
        input_biases,
        output_biases,
        pad_mask,
        mask_additive,
        dropout_prob,
    ):
        from apex import deprecated_warning

        deprecated_warning(
            "`apex.contrib.multihead_attn` is deprecated and will be removed in July 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention"
            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )

        use_biases_t = torch.tensor([input_biases is not None])
        heads_t = torch.tensor([heads])
        dropout_prob_t = torch.tensor([dropout_prob])
        null_tensor = torch.tensor([])
        use_mask = pad_mask is not None
        mask_additive_t = torch.tensor([mask_additive])

        if use_biases_t[0]:
            if not mask_additive:
                (
                    input_lin_results,
                    softmax_results,
                    dropout_results,
                    dropout_mask,
                    matmul2_results,
                    outputs,
                ) = fast_multihead_attn.self_attn_bias_forward(
                    use_mask,
                    use_time_mask,
                    is_training,
                    heads,
                    inputs,
                    input_weights,
                    output_weights,
                    input_biases,
                    output_biases,
                    pad_mask if use_mask else null_tensor,
                    dropout_prob,
                )
                # fast_self_multihead_attn_bias.forward()                           \
                ctx.save_for_backward(
                    use_biases_t,
                    heads_t,
                    matmul2_results,
                    dropout_results,
                    softmax_results,
                    null_tensor,
                    null_tensor,
                    mask_additive_t,
                    input_lin_results,
                    inputs,
                    input_weights,
                    output_weights,
                    dropout_mask,
                    dropout_prob_t,
                )

            else:
                (
                    input_lin_results,
                    bmm1_results,
                    dropout_results,
                    dropout_mask,
                    matmul2_results,
                    outputs,
                ) = fast_multihead_attn.self_attn_bias_additive_mask_forward(
                    use_mask,
                    use_time_mask,
                    is_training,
                    heads,
                    inputs,
                    input_weights,
                    output_weights,
                    input_biases,
                    output_biases,
                    pad_mask if use_mask else null_tensor,
                    dropout_prob,
                )
                # fast_self_multihead_attn_bias_additive_mask.forward(                           \
                ctx.save_for_backward(
                    use_biases_t,
                    heads_t,
                    matmul2_results,
                    dropout_results,
                    null_tensor,
                    bmm1_results,
                    pad_mask,
                    mask_additive_t,
                    input_lin_results,
                    inputs,
                    input_weights,
                    output_weights,
                    dropout_mask,
                    dropout_prob_t,
                )

        else:
            (
                input_lin_results,
                softmax_results,
                dropout_results,
                dropout_mask,
                matmul2_results,
                outputs,
            ) = fast_multihead_attn.self_attn_forward(
                use_mask,
                use_time_mask,
                is_training,
                heads,
                inputs,
                input_weights,
                output_weights,
                pad_mask if use_mask else null_tensor,
                dropout_prob,
            )
            # fast_self_multihead_attn.forward(                           \
            ctx.save_for_backward(
                use_biases_t,
                heads_t,
                matmul2_results,
                dropout_results,
                softmax_results,
                null_tensor,
                null_tensor,
                mask_additive_t,
                input_lin_results,
                inputs,
                input_weights,
                output_weights,
                dropout_mask,
                dropout_prob_t,
            )
        return outputs.detach()

    @staticmethod
    def backward(ctx, output_grads):
        (
            use_biases_t,
            heads_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            bmm1_results,
            pad_mask,
            mask_additive_t,
            input_lin_results,
            inputs,
            input_weights,
            output_weights,
            dropout_mask,
            dropout_prob_t,
        ) = ctx.saved_tensors

        if use_biases_t[0]:
            if not mask_additive_t[0]:
                (
                    input_grads,
                    input_weight_grads,
                    output_weight_grads,
                    input_bias_grads,
                    output_bias_grads,
                ) = fast_multihead_attn.self_attn_bias_backward(
                    heads_t[0],
                    output_grads,
                    matmul2_results,
                    dropout_results,
                    softmax_results,
                    input_lin_results,
                    inputs,
                    input_weights,
                    output_weights,
                    dropout_mask,
                    dropout_prob_t[0],
                )
                # fast_self_multihead_attn_bias.backward(                          \

            else:
                (
                    input_grads,
                    input_weight_grads,
                    output_weight_grads,
                    input_bias_grads,
                    output_bias_grads,
                ) = fast_multihead_attn.self_attn_bias_additive_mask_backward(
                    heads_t[0],
                    output_grads,
                    matmul2_results,
                    dropout_results,
                    bmm1_results,
                    pad_mask,
                    input_lin_results,
                    inputs,
                    input_weights,
                    output_weights,
                    dropout_mask,
                    dropout_prob_t[0],
                )
                # fast_self_multihead_attn_bias_additive_mask.backward(                          \

        else:
            input_bias_grads = None
            output_bias_grads = None
            input_grads, input_weight_grads, output_weight_grads = (
                fast_multihead_attn.self_attn_backward(
                    heads_t[0],
                    output_grads,
                    matmul2_results,
                    dropout_results,
                    softmax_results,
                    input_lin_results,
                    inputs,
                    input_weights,
                    output_weights,
                    dropout_mask,
                    dropout_prob_t[0],
                )
            )
            # fast_self_multihead_attn.backward(                          \
        return (
            None,
            None,
            None,
            input_grads,
            input_weight_grads,
            output_weight_grads,
            input_bias_grads,
            output_bias_grads,
            None,
            None,
            None,
        )


fast_self_attn_func = FastSelfAttnFunc.apply


================================================
FILE: apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
================================================
import torch

import fast_multihead_attn


class FastSelfAttnNormAddFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        use_time_mask,
        is_training,
        heads,
        inputs,
        lyr_nrm_gamma_weights,
        lyr_nrm_beta_weights,
        input_weights,
        output_weights,
        pad_mask,
        dropout_prob,
    ):
        from apex import deprecated_warning

        deprecated_warning(
            "`apex.contrib.multihead_attn` is deprecated and will be removed in July 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention"
            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )

        heads_t = torch.tensor([heads])
        dropout_prob_t = torch.tensor([dropout_prob])
        null_tensor = torch.tensor([])
        use_mask = pad_mask is not None

        (
            lyr_nrm_results,
            lyr_nrm_mean,
            lyr_nrm_invvar,
            input_lin_results,
            softmax_results,
            dropout_results,
            dropout_mask,
            matmul2_results,
            dropout_add_mask,
            outputs,
        ) = fast_multihead_attn.self_attn_norm_add_forward(
            use_mask,
            use_time_mask,
            is_training,
            heads,
            inputs,
            lyr_nrm_gamma_weights,
            lyr_nrm_beta_weights,
            input_weights,
            output_weights,
            pad_mask if use_mask else null_tensor,
            dropout_prob,
        )
        # fast_self_multihead_attn_norm_add.forward(                 \

        ctx.save_for_backward(
            heads_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_results,
            lyr_nrm_results,
            lyr_nrm_mean,
            lyr_nrm_invvar,
            inputs,
            lyr_nrm_gamma_weights,
            lyr_nrm_beta_weights,
            input_weights,
            output_weights,
            dropout_mask,
            dropout_add_mask,
            dropout_prob_t,
        )

        return outputs.detach()

    @staticmethod
    def backward(ctx, output_grads):
        (
            heads_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_results,
            lyr_nrm_results,
            lyr_nrm_mean,
            lyr_nrm_invvar,
            inputs,
            lyr_nrm_gamma_weights,
            lyr_nrm_beta_weights,
            input_weights,
            output_weights,
            dropout_mask,
            dropout_add_mask,
            dropout_prob_t,
        ) = ctx.saved_tensors

        (
            input_grads,
            lyr_nrm_gamma_grads,
            lyr_nrm_beta_grads,
            input_weight_grads,
            output_weight_grads,
        ) = fast_multihead_attn.self_attn_norm_add_backward(
            heads_t[0],
            output_grads,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_results,
            lyr_nrm_results,
            lyr_nrm_mean,
            lyr_nrm_invvar,
            inputs,
            lyr_nrm_gamma_weights,
            lyr_nrm_beta_weights,
            input_weights,
            output_weights,
            dropout_mask,
            dropout_add_mask,
            dropout_prob_t[0],
        )
        # fast_self_multihead_attn_norm_add.backward(                 \

        return (
            None,
            None,
            None,
            input_grads,
            lyr_nrm_gamma_grads,
            lyr_nrm_beta_grads,
            input_weight_grads,
            output_weight_grads,
            None,
            None,
        )


fast_self_attn_norm_add_func = FastSelfAttnNormAddFunc.apply


================================================
FILE: apex/contrib/multihead_attn/mask_softmax_dropout_func.py
================================================
import torch

import fast_multihead_attn


class MaskSoftmaxDropout(torch.autograd.Function):
    @staticmethod
    def forward(ctx, is_training, heads, inputs, pad_mask, mask_additive, dropout_prob):
        from apex import deprecated_warning

        deprecated_warning(
            "`apex.contrib.multihead_attn` is deprecated and will be removed in July 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention"
            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )

        heads_t = torch.tensor([heads])
        dropout_prob_t = torch.tensor([dropout_prob])
        null_tensor = torch.tensor([])
        use_mask = pad_mask is not None
        use_mask_t = torch.tensor([use_mask])
        mask_additive_t = torch.tensor([mask_additive])

        if mask_additive:
            dropout_results, dropout_mask, softmax_results = (
                fast_multihead_attn.additive_mask_softmax_dropout_forward(
                    use_mask,
                    is_training,
                    heads,
                    inputs,
                    pad_mask if use_mask else null_tensor,
                    dropout_prob,
                )
            )
            # fast_additive_mask_softmax_dropout.forward(                           \
        else:
            dropout_results, dropout_mask, softmax_results = (
                fast_multihead_attn.mask_softmax_dropout_forward(
                    use_mask,
                    is_training,
                    heads,
                    inputs,
                    pad_mask if use_mask else null_tensor,
                    dropout_prob,
                )
            )
            # fast_mask_softmax_dropout.forward(                           \

        ctx.save_for_backward(
            use_mask_t,
            heads_t,
            softmax_results,
            dropout_mask,
            pad_mask if use_mask else null_tensor,
            mask_additive_t,
            dropout_prob_t,
        )

        return dropout_results.detach()

    @staticmethod
    def backward(ctx, output_grads):
        (
            use_mask_t,
            heads_t,
            softmax_results,
            dropout_mask,
            pad_mask,
            mask_additive_t,
            dropout_prob_t,
        ) = ctx.saved_tensors

        if mask_additive_t[0]:
            input_grads = fast_multihead_attn.additive_mask_softmax_dropout_backward(
                use_mask_t[0],
                heads_t[0],
                output_grads,
                softmax_results,
                dropout_mask,
                dropout_prob_t[0],
            )
            # fast_additive_mask_softmax_dropout.backward(                          \
        else:
            input_grads = fast_multihead_attn.mask_softmax_dropout_backward(
                use_mask_t[0],
                heads_t[0],
                output_grads,
                softmax_results,
                dropout_mask,
                pad_mask,
                dropout_prob_t[0],
            )
            # fast_mask_softmax_dropout.backward(                          \
        return None, None, input_grads, None, None, None


fast_mask_softmax_dropout_func = MaskSoftmaxDropout.apply


================================================
FILE: apex/contrib/multihead_attn/self_multihead_attn.py
================================================
import math

import torch
from torch import nn
from torch.nn import Parameter
import torch.nn.functional as F

from .self_multihead_attn_func import self_attn_func
from .fast_self_multihead_attn_func import fast_self_attn_func
from .fast_self_multihead_attn_norm_add_func import fast_self_attn_norm_add_func
from apex.normalization.fused_layer_norm import FusedLayerNorm


@torch.jit.script
def jit_dropout_add(x, residual, prob, is_training):
    # type: (Tensor, Tensor, float, bool) -> Tensor
    out = F.dropout(x, p=prob, training=True)
    out = residual + out
    return out


class SelfMultiheadAttn(nn.Module):
    """Multi-headed attention.

    See "Attention Is All You Need" for more details.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=False,
        include_norm_add=False,
        impl="fast",
        separate_qkv_params=False,
        mask_additive=False,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, (
            "embed_dim must be divisible by num_heads"
        )
        self.bias = bias
        self.include_norm_add = include_norm_add
        self.impl = impl
        self.scaling = self.head_dim**-0.5
        self.separate_qkv_params = separate_qkv_params
        self.mask_additive = mask_additive
        if mask_additive:
            assert self.include_norm_add == False, "additive mask not supported with layer norm"
            assert impl == "default" or (impl == "fast" and bias), (
                "additive mask not supported for fast mode without bias"
            )
        if separate_qkv_params:
            self.q_weight = Parameter(torch.empty(embed_dim, embed_dim))
            self.k_weight = Parameter(torch.empty(embed_dim, embed_dim))
            self.v_weight = Parameter(torch.empty(embed_dim, embed_dim))
        else:
            self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
        self.out_proj_weight = Parameter(torch.empty(embed_dim, embed_dim))
        if self.bias:
            if separate_qkv_params:
                self.q_bias = Parameter(torch.empty(embed_dim))
                self.k_bias = Parameter(torch.empty(embed_dim))
                self.v_bias = Parameter(torch.empty(embed_dim))
            else:
                self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
            self.out_proj_bias = Parameter(torch.empty(embed_dim))
        else:
            if separate_qkv_params:
                self.register_parameter("q_bias", None)
                self.register_parameter("k_bias", None)
                self.register_parameter("v_bias", None)
                self.q_bias = None
                self.k_bias = None
                self.v_bias = None
            else:
                self.register_parameter("in_proj_bias", None)
                self.in_proj_bias = None
            self.register_parameter("out_proj_bias", None)
            self.out_proj_bias = None
        if self.include_norm_add:
            if impl == "fast":
                self.lyr_nrm_gamma_weights = Parameter(torch.empty(embed_dim))
                self.lyr_nrm_beta_weights = Parameter(torch.empty(embed_dim))
                self.lyr_nrm = None
            else:
                self.register_parameter("lyr_norm_gamma_weights", None)
                self.register_parameter("lyr_norm_beta_weights", None)
                self.lyr_nrm_gamma_weights = None
                self.lyr_nrm_beta_weights = None
                self.lyr_nrm = FusedLayerNorm(embed_dim)
        self.reset_parameters()

        if self.include_norm_add:
            if impl == "fast":
                self.attn_func = fast_self_attn_norm_add_func
            elif impl == "default":
                self.attn_func = self_attn_func
            else:
                assert False, "Unsupported impl: {} !".format(impl)
        else:
            if impl == "fast":
                self.attn_func = fast_self_attn_func
            elif impl == "default":
                self.attn_func = self_attn_func
            else:
                assert False, "Unsupported impl: {} !".format(impl)

    def reset_parameters(self):
        if self.separate_qkv_params:
            nn.init.xavier_uniform_(self.q_weight)
            nn.init.xavier_uniform_(self.k_weight)
            nn.init.xavier_uniform_(self.v_weight)
        else:
            # in_proj_weight has shape [3 * hidden, hidden] but it should be
            # initialized like a [hidden, hidden] matrix.
            # sqrt(6 / (hidden + hidden)) / sqrt(6 / (3 * hidden + hidden)) = sqrt(2)
            # therefore xavier_uniform gain should be set to sqrt(2).
            nn.init.xavier_uniform_(self.in_proj_weight, gain=math.sqrt(2))
        nn.init.xavier_uniform_(self.out_proj_weight)
        if self.bias:
            if self.separate_qkv_params:
                nn.init.constant_(self.q_bias, 0.0)
                nn.init.constant_(self.k_bias, 0.0)
                nn.init.constant_(self.v_bias, 0.0)
            else:
                nn.init.constant_(self.in_proj_bias, 0.0)
            nn.init.constant_(self.out_proj_bias, 0.0)
        if self.include_norm_add:
            if self.impl == "fast":
                nn.init.ones_(self.lyr_nrm_gamma_weights)
                nn.init.zeros_(self.lyr_nrm_beta_weights)
            else:
                self.lyr_nrm.reset_parameters()

    def forward(
        self,
        query,
        key,
        value,
        key_padding_mask=None,
        need_weights=False,
        attn_mask=None,
        is_training=True,
    ):
        """Input shape: Time x Batch x Channel

        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Future timesteps can be masked with the
        `mask_future_timesteps` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """
        if self.separate_qkv_params:
            input_weights = (
                torch.cat(
                    [
                        self.q_weight.view(self.num_heads, 1, self.head_dim, self.embed_dim),
                        self.k_weight.view(self.num_heads, 1, self.head_dim, self.embed_dim),
                        self.v_weight.view(self.num_heads, 1, self.head_dim, self.embed_dim),
                    ],
                    dim=1,
                )
                .reshape(3 * self.embed_dim, self.embed_dim)
                .contiguous()
            )
        else:
            input_weights = self.in_proj_weight
        if self.bias:
            if self.separate_qkv_params:
                input_bias = (
                    torch.cat(
                        [
                            self.q_bias.view(self.num_heads, 1, self.head_dim),
                            self.k_bias.view(self.num_heads, 1, self.head_dim),
                            self.v_bias.view(self.num_heads, 1, self.head_dim),
                        ],
                        dim=1,
                    )
                    .reshape(3 * self.embed_dim)
                    .contiguous()
                )
            else:
                input_bias = self.in_proj_bias
        else:
            input_bias = None
        if key_padding_mask is not None:
            assert attn_mask is None, (
                "ERROR attn_mask and key_padding_mask should not be both defined!"
            )
            mask = key_padding_mask
        elif attn_mask is not None:
            assert self.mask_additive == False, "additive mask not supported for time mask"
            mask = attn_mask
        else:
            mask = None

        if self.include_norm_add:
            if self.impl == "fast":
                outputs = self.attn_func(
                    attn_mask is not None,
                    is_training,
                    self.num_heads,
                    query,
                    self.lyr_nrm_gamma_weights,
                    self.lyr_nrm_beta_weights,
                    input_weights,
                    self.out_proj_weight,
                    mask,
                    self.dropout,
                )
            else:
                lyr_nrm_results = self.lyr_nrm(query)
                outputs = self.attn_func(
                    attn_mask is not None,
                    is_training,
                    self.num_heads,
                    self.scaling,
                    lyr_nrm_results,
                    input_weights,
                    self.out_proj_weight,
                    input_bias,
                    self.out_proj_bias,
                    mask,
                    self.mask_additive,
                    self.dropout,
                )
                if is_training:
                    outputs = jit_dropout_add(outputs, query, self.dropout, is_training)
                else:
                    outputs = outputs + query
        else:
            if self.impl == "fast":
                outputs = self.attn_func(
                    attn_mask is not None,
                    is_training,
                    self.num_heads,
                    query,
                    input_weights,
                    self.out_proj_weight,
                    input_bias,
                    self.out_proj_bias,
                    mask,
                    self.mask_additive,
                    self.dropout,
                )
            else:
                outputs = self.attn_func(
                    attn_mask is not None,
                    is_training,
                    self.num_heads,
                    self.scaling,
                    query,
                    input_weights,
                    self.out_proj_weight,
                    input_bias,
                    self.out_proj_bias,
                    mask,
                    self.mask_additive,
                    self.dropout,
                )

        return outputs, None


================================================
FILE: apex/contrib/multihead_attn/self_multihead_attn_func.py
================================================
import torch
import torch.nn.functional as F


class SelfAttnFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        use_time_mask,
        is_training,
        heads,
        scale,
        inputs,
        input_weights,
        output_weights,
        input_biases,
        output_biases,
        mask,
        is_additive_mask,
        dropout_prob,
    ):
        from apex import deprecated_warning

        deprecated_warning(
            "`apex.contrib.multihead_attn` is deprecated and will be removed in July 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention"
            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )

        use_biases_t = torch.tensor([input_biases is not None])
        heads_t = torch.tensor([heads])
        scale_t = torch.tensor([scale])
        dropout_prob_t = torch.tensor([dropout_prob])
        null_tensor = torch.tensor([])
        head_dim = inputs.size(2) // heads

        # Input Linear GEMM
        # input1: (activations) [seql_q, seqs, embed_dim(1024)]
        # input2: (weights)     [embed_dim*3 (3072), embed_dim (1024)] (transpose [0,1])
        # output:               [seql_q, seqs, embed_dim*3]
        # GEMM: ( (seql_q*seqs) x embed_dim ) x ( embed_dim x embed_dim*3 ) = (seql_q*seqs x embed_dim*3)
        if use_biases_t[0]:
            input_lin_results = torch.addmm(
                input_biases,
                inputs.view(inputs.size(0) * inputs.size(1), inputs.size(2)),
                input_weights.transpose(0, 1),
                beta=1.0,
                alpha=1.0,
            )
        else:
            input_lin_results = torch.mm(
                inputs.view(inputs.size(0) * inputs.size(1), inputs.size(2)),
                input_weights.transpose(0, 1),
            )
        input_lin_results = input_lin_results.view(
            inputs.size(0), inputs.size(1), input_weights.size(0)
        )

        # Slice out q,k,v from one big Input Linear outuput (should only impact meta data, no copies!)
        # Sequences and heads are combined to make the batch of the Batched GEMM
        # input_lin_results: [seql_q, seqs, heads(16), 3, head_dim(64)]
        # input_lin_results: [seql_q, batches=seqs*heads, 3, head_dim]
        input_lin_results = input_lin_results.view(
            inputs.size(0), inputs.size(1) * heads, 3, head_dim
        )
        queries = input_lin_results[:, :, 0, :]
        keys = input_lin_results[:, :, 1, :]
        values = input_lin_results[:, :, 2, :]

        # Matmul1 Batched GEMMs
        # The output tensor is specified prior to the Batch GEMM because baddbmm requires its specification
        # baddbmm is used to apply the scale parameter via the Batched GEMM's alpha parameter instead of
        # a separate elementwise operation.
        # Input1: (Queries) [seql_q, seqs*heads, head_dim] tranpose(0,1)
        # Input2: (Keys)    [seql_k, seqs*heads, head_dim] transpose(0,1)
        # output:           [seqs*heads, seql_q, seql_k]
        # GEMM: Per batch: ( seql_q x head_dim ) x ( head_dim x seql_k ) = ( seql_q x seql_k )
        matmul1_results = torch.empty(
            (queries.size(1), queries.size(0), keys.size(0)),
            dtype=queries.dtype,
            device=torch.device("cuda"),
        )
        matmul1_results = torch.baddbmm(
            matmul1_results,
            queries.transpose(0, 1),
            keys.transpose(0, 1).transpose(1, 2),
            out=matmul1_results,
            beta=0.0,
            alpha=scale_t[0],
        )

        if mask is not None:
            # Self Attention Time Mask
            if use_time_mask:
                assert len(mask.size()) == 2, "Timing mask is not 2D!"
                assert mask.size(0) == mask.size(1), "Sequence length should match!"
                mask = mask.to(torch.bool)
                matmul1_results = matmul1_results.masked_fill_(mask, float("-inf"))
            # Key Padding Mask
            else:
                batches, seql_q, seql_k = matmul1_results.size()
                seqs = int(batches / heads)
                matmul1_results = matmul1_results.view(seqs, heads, seql_q, seql_k)
                if is_additive_mask:
                    matmul1_results = matmul1_results + mask.unsqueeze(1).unsqueeze(2)
                else:
                    mask = mask.to(torch.bool)
                    matmul1_results = matmul1_results.masked_fill_(
                        mask.unsqueeze(1).unsqueeze(2), float("-inf")
                    )
                matmul1_results = matmul1_results.view(seqs * heads, seql_q, seql_k)

        softmax_results = F.softmax(matmul1_results, dim=-1)

        # Dropout - is not executed for inference
        if is_training:
            dropout_results, dropout_mask = torch._fused_dropout(
                softmax_results, p=(1.0 - dropout_prob_t[0])
            )
        else:
            dropout_results = softmax_results
            dropout_mask = null_tensor

        # Matmul2 Batched GEMMs
        # The output tensor specification is needed here to specify the non-standard output.
        # Given that pytorch cannot currently perform autograd with an output tensor specified,
        # this requires a backward pass specified.
        # Input1: from_softmax [seqs*heads, seql_q, seql_k]
        # Input2: (values)     [seql_v, seqs*heads, head_dim] transpose(0,1)
        # Output:              [seql_q, seqs*heads, head_dim] transpose(0,1)
        # GEMM: Per batch: ( seql_q x seql_k ) x ( seql_k x head_dim ) = (seql_q x head_dim)
        matmul2_results = torch.empty(
            (dropout_results.size(1), dropout_results.size(0), values.size(2)),
            dtype=dropout_results.dtype,
            device=torch.device("cuda"),
        ).transpose(1, 0)
        matmul2_results = torch.bmm(dropout_results, values.transpose(0, 1), out=matmul2_results)
        matmul2_results = (
            matmul2_results.transpose(0, 1)
            .contiguous()
            .view(inputs.size(0), inputs.size(1), inputs.size(2))
        )

        # Output Linear GEMM
        # Input1: (activations) [seql_q, seqs, embed_dim=heads*head_dim]
        # Input2: (weights)     [ embed_dim, embed_dim ] transpose(0,1)
        # Output:               [ seql_q, seqs, embed_dim ]
        # GEMM: ( seql_q*seqs x embed_dim ) x ( embed_dim x embed_dim ) = ( seql_q*seqs x embed_dim )
        if use_biases_t[0]:
            outputs = torch.addmm(
                output_biases,
                matmul2_results.view(inputs.size(0) * inputs.size(1), inputs.size(2)),
                output_weights.transpose(0, 1),
                beta=1.0,
                alpha=1.0,
            )
        else:
            outputs = torch.mm(
                matmul2_results.view(inputs.size(0) * inputs.size(1), inputs.size(2)),
                output_weights.transpose(0, 1),
            )
        outputs = outputs.view(inputs.size(0), inputs.size(1), output_weights.size(0))

        ctx.save_for_backward(
            use_biases_t,
            heads_t,
            scale_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_results,
            inputs,
            input_weights,
            output_weights,
            dropout_mask,
            dropout_prob_t,
        )

        return outputs.detach()

    @staticmethod
    def backward(ctx, output_grads):
        (
            use_biases_t,
            heads_t,
            scale_t,
            matmul2_results,
            dropout_results,
            softmax_results,
            input_lin_results,
            inputs,
            input_weights,
            output_weights,
            dropout_mask,
            dropout_prob_t,
        ) = ctx.saved_tensors

        head_dim = inputs.size(2) // heads_t[0]

        # Slice out q,k,v from one big Input Linear outuput (should only impact meta data, no copies!)
        # Sequences and heads are combined to make the batch of the Batched GEMM
        # input_lin_results: [seql_q, seqs, heads(16), 3, head_dim(64)]
        # input_lin_results: [seql_q, batches=seqs*heads, 3, head_dim]
        input_lin_results = input_lin_results.view(
            inputs.size(0), inputs.size(1) * heads_t[0], 3, head_dim
        )
        queries = input_lin_results[:, :, 0, :]
        keys = input_lin_results[:, :, 1, :]
        values = input_lin_results[:, :, 2, :]

        # Slice out q,k,v from one big set of gradients entering the input linear's bprop  (should only impact meta data, no copies!)
        # The gradients are identical in size to the Input Linear outputs.
        # The tensor is declared before hand to properly slice out query, key, and value grads.
        input_lin_results_grads = torch.empty_like(input_lin_results)
        queries_grads = input_lin_results_grads[:, :, 0, :]
        keys_grads = input_lin_results_grads[:, :, 1, :]
        values_grads = input_lin_results_grads[:, :, 2, :]

        # Output Linear GEMM - DGRAD
        # Input1: (data grads)  [seql_q, seqs, embed_dim=heads*head_dim]
        # Input2: (weights)     [ embed_dim, embed_dim ]
        # Output:               [ seql_q, seqs, embed_dim ]
        # GEMM: ( seql_q*seqs x embed_dim ) x ( embed_dim x embed_dim ) = ( seql_q*seqs x embed_dim )
        output_lin_grads = torch.mm(
            output_grads.view(output_grads.size(0) * output_grads.size(1), output_grads.size(2)),
            output_weights,
        )
        output_lin_grads = output_lin_grads.view(
            output_grads.size(0), output_grads.size(1), output_weights.size(1)
        )
        # Output Linear GEMM - WGRAD
        # Input1: (data grads)  [seql_q*seqs, embed_dim=heads*head_dim] transpose(0,1)
        # Input2: (activations) [seql_q*seqs, embed_dim ]
        # Output:               [ seql_q, seqs, embed_dim ]
        # GEMM: ( embed_dim x seql_q*seqs ) x ( seql_q*seqs x embed_dim ) = ( embed_dim x embed_dim )
        output_weight_grads = torch.mm(
            output_grads.view(
                output_grads.size(0) * output_grads.size(1), output_grads.size(2)
            ).transpose(0, 1),
            matmul2_results.view(
                matmul2_results.size(0) * matmul2_results.size(1),
                matmul2_results.size(2),
            ),
        )
        output_lin_grads = output_lin_grads.view(
            inputs.size(0), inputs.size(1) * heads_t[0], head_dim
        ).transpose(0, 1)

        if use_biases_t[0]:
            output_bias_grads = torch.sum(
                output_grads.view(
                    output_grads.size(0) * output_grads.size(1), output_grads.size(2)
                ),
                0,
            )
        else:
            output_bias_grads = None

        # Matmul2 - DGRAD1
        # Input1: (data grads)  [seql_q, seqs*heads, head_dim] transpose(0,1)
        # Input2: (activations) [seql_k, seqs*heads, head_dim] transpose(0,1).transpose(1,2)
        # Output:               [seqs*heads, seql_q, seql_k]
        # GEMM: Per batch: ( seql_q x head_dim ) x ( head_dim x seql_k ) = ( seql_q x seql_k )
        matmul2_dgrad1 = torch.bmm(output_lin_grads, values.transpose(0, 1).transpose(1, 2))
        # Matmul2 - DGRAD2
        # Input1: (data grads)  [seql_q, seqs*heads, head_dim] transpose(0,1)
        # Input2: (activations) [seql_k, seqs*heads, head_dim] transpose(0,1).transpose(1,2)
        # Output:               [seqs*heads, seql_q, seql_k]
        # GEMM: Per batch: ( seql_q x head_dim ) x ( head_dim x seql_k ) = ( seql_q x seql_k )
        values_grads = torch.bmm(
            dropout_results.transpose(1, 2),
            output_lin_grads,
            out=values_grads.transpose(0, 1),
        )

        # Mask and Scaling for Dropout (not a publically documented op)
        dropout_grads = torch._masked_scale(
            matmul2_dgrad1, dropout_mask, 1.0 / (1.0 - dropout_prob_t[0])
        )

        # Softmax Grad (not a publically documented op)
        softmax_grads = torch._softmax_backward_data(
            dropout_grads, softmax_results, -1, softmax_results.dtype
        )

        # Matmul1 - DGRAD1
        # Input1: (data grads)  [seqs*heads, seql_q, seql_k]
        # Input2: (activations) [seql_k, seqs*heads, head_dim] transpose(0,1)
        # Output:               [seqs*heads, seql_q, head_dim] transpose(0,1)
        # GEMM: Per batch: ( seql_q x seql_k ) x ( seql_k x head_dim ) = ( seql_q x head_dim )
        queries_grads = torch.baddbmm(
            queries_grads.transpose(0, 1),
            softmax_grads,
            keys.transpose(0, 1),
            out=queries_grads.transpose(0, 1),
            beta=0.0,
            alpha=scale_t[0],
        )
        # Matmul1 - DGRAD2
        # Input1: (data grads)  [seqs*heads, seql_q, seql_k] transpose(1,2)
        # Input2: (activations) [seql_q, seqs*heads, head_dim] transpose(0,1)
        # Output:               [seqs*heads, seql_k, head_dim] transpose(0,1)
        # GEMM: Per batch: ( seql_k x seql_q ) x ( seql_q x head_dim ) = ( seql_k x head_dim )
        keys_grads = torch.baddbmm(
            keys_grads.transpose(0, 1),
            softmax_grads.transpose(1, 2),
            queries.transpose(0, 1),
            out=keys_grads.transpose(0, 1),
            beta=0.0,
            alpha=scale_t[0],
        )

        # Input Linear GEMM - DGRAD
        # input1: (data grads) [seql_q, seqs, 3*embed_dim(3072)]
        # input2: (weights)    [embed_dim*3 (3072), embed_dim (1024)]
        # output:              [seql_q, seqs, embed_dim]
        # GEMM: ( (seql_q*seqs) x 3*embed_dim ) x ( 3*embed_dim x embed_dim ) = (seql_q*seqs x embed_dim)
        input_lin_results_grads = input_lin_results_grads.view(
            inputs.size(0) * inputs.size(1), heads_t[0] * 3 * head_dim
        )
        input_grads = torch.mm(input_lin_results_grads, input_weights)
        input_grads = input_grads.view(inputs.size(0), inputs.size(1), inputs.size(2))
        # Input Linear GEMM - WGRAD
        # input1: (data grads)  [seql_q*seqs, 3*embed_dim(3072)]
        # input2: (activations) [seql_q*seqs, embed_dim(1024)]
        # output:               [3*embed_dim, embed_dim]
        # GEMM: ( 3*embed_dim x seql_q*seqs ) x ( seql_q*seqs x embed_dim ) = (3*embed_dim x embed_dim)
        input_weight_grads = torch.mm(
            input_lin_results_grads.transpose(0, 1),
            inputs.view(inputs.size(0) * inputs.size(1), inputs.size(2)),
        )

        if use_biases_t[0]:
            input_bias_grads = torch.sum(input_lin_results_grads, 0)
        else:
            input_bias_grads = None

        return (
            None,
            None,
            None,
            None,
            input_grads,
            input_weight_grads,
            output_weight_grads,
            input_bias_grads,
            output_bias_grads,
            None,
            None,
            None,
        )


self_attn_func = SelfAttnFunc.apply


================================================
FILE: apex/contrib/nccl_allocator/README.md
================================================
## General information

`nccl_allocator` is a module that enables `ncclMemAlloc`[^1] to be used within PyTorch for faster NCCL NVLS collective communications.
It is mainly based on `CUDAPluggableAllocator`.
The context manager `nccl_allocator.nccl_mem(enabled=True)` is used as a switch between `cudaMalloc` and `ncclMemAlloc` (if `enabled=True` it will use `cudaMalloc`).

[^1]: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html

### Example usage:

Here is a minimalistic example:

```
import os
import torch
import torch.distributed as dist
import apex.contrib.nccl_allocator as nccl_allocator

rank = int(os.getenv("RANK"))
local_rank = int(os.getenv("LOCAL_RANK"))
world_size = int(os.getenv("WORLD_SIZE"))

nccl_allocator.init()

torch.cuda.set_device(local_rank)
dist.init_process_group(backend="nccl")

with nccl_allocator.nccl_mem():
	a = torch.ones(1024 * 1024 * 2, device="cuda")
dist.all_reduce(a)

torch.cuda.synchronize()
```

Please visit `apex/contrib/examples/nccl_allocator` for more examples.


### IMPORTANT

There are several strict requirements:
- PyTorch must include PR [#112850](https://github.com/pytorch/pytorch/pull/112850)
- NCCL v2.19.4 and newer
- NCCL NVLS requires CUDA Driver 530 and newer (tested on 535)


================================================
FILE: apex/contrib/nccl_allocator/__init__.py
================================================
from .nccl_allocator import *


================================================
FILE: apex/contrib/nccl_allocator/nccl_allocator.py
================================================
import os
import torch
import _apex_nccl_allocator

from contextlib import nullcontext


__all__ = ["init", "nccl_mem", "create_nccl_mem_pool"]


def get_func_args(func):
    import inspect

    sig = inspect.signature(func)
    return [arg.name for arg in sig.parameters.values()]


def create_nccl_mem_pool(symmetric: bool | None = None) -> torch.cuda.MemPool:
    _allocator = _apex_nccl_allocator.get_nccl_allocator()
    if symmetric is None:
        _pool = torch.cuda.MemPool(_allocator)
    else:
        if "symmetric" in get_func_args(torch.cuda.MemPool):
            _pool = torch.cuda.MemPool(_allocator, symmetric=symmetric)
        elif "symm_mem" in get_func_args(torch.cuda.MemPool):
            # This path handles argument name divergence between
            # nvidia pytorch and the official pytorch.
            _pool = torch.cuda.MemPool(_allocator, symm_mem=symmetric)
        else:
            raise ValueError(
                "symmetric setting with torch.cuda.MemPool requires higher PyTorch version"
            )
    return _pool


def init() -> None:
    os.environ["NCCL_NVLS_ENABLE"] = "1"
    os.environ["TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK"] = "0"


class nccl_mem:
    def __init__(self, pool, enabled=True, device=None, group=None):
        self.device = None
        self.group = None
        self.mem_context = None
        self.pool = pool

        if enabled:
            if device is None:
                self.device = torch.device("cuda", torch.cuda.current_device())
            elif isinstance(device, int):
                self.device = torch.device("cuda", device)
            elif isinstance(device, str):
                assert "cuda" in device, "only cuda devices are supported"
                self.device = torch.device(device)

            if group is None:
                self.group = torch.distributed.distributed_c10d._get_default_group()
            else:
                self.group = group

            self.mem_context = torch.cuda.use_mem_pool(self.pool)
        else:
            self.mem_context = nullcontext()

    def __enter__(self):
        self.mem_context.__enter__()
        if self.group is not None:
            backend = self.group._get_backend(self.device)
            try:
                backend.deregister_mem_pool(self.pool)
            except RuntimeError:
                pass

    def __exit__(self, *args):
        if self.group is not None:
            backend = self.group._get_backend(self.device)
            try:
                backend.register_mem_pool(self.pool)
            except RuntimeError:
                pass
        self.mem_context.__exit__(*args)


================================================
FILE: apex/contrib/openfold_triton/README.md
================================================
# OpenFold triton kernels

This subpackage is a collection of Triton kernels written specifically for the OpenFold model architecture initial training mode.

To use this subpackage, you must install additional dependencies:

```bash
pip install einops
```

The following sections list all main features and show how to use them.

## Multi-Head Attention

```python
import apex.contrib.openfold_triton.mha as mha
from apex.contrib.openfold_triton import AttnBiasJIT, AttnNoBiasJIT, AttnTri, CanSchTriMHA

# Integration with Attention module:
class SelfAttentionWithGate(nn.Module):
    # ...

    def _attention_forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor,
        bias: Optional[torch.Tensor],
    ) -> torch.Tensor:
        if self.chunk_size is None:
            if mha.is_enabled() and CanSchTriMHA(
                list(query.shape),
                bias is not None,
                inf=self.inf,
                training=self.training,
            ):
                if mask is not None:
                    mask = mask.contiguous()
                if bias is not None:
                    bias = bias.contiguous()
                return AttnTri(
                    query, key, value, mask, bias, self.inf, torch.is_grad_enabled()
                )
            elif mha.is_enabled() and bias is not None and self.training:
                return AttnBiasJIT(query, key, value, mask, bias, self.inf)
            elif mha.is_enabled() and bias is None and self.training:
                return AttnNoBiasJIT(query, key, value, mask, self.inf)

# Switch on/off MHA dynamically at runtime via:
mha.enable()
mha.disable()

```

## LayerNorm

```python
from apex.contrib.openfold_triton import LayerNormSmallShapeOptImpl

# Integration with LayerNorm module:
class LayerNorm(nn.Module):
    # ...

    def _should_use_triton_kernels(self, x: torch.Tensor) -> bool:
        ln_triton_shapes = (
            (256, 128),
            (256, 256),
        )
        ln_triton_dim = 4
        return (
            self.training
            and x.dim() == ln_triton_dim
            and x.shape[-2:] in ln_triton_shapes
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self._should_use_triton_kernels(x):
            return LayerNormSmallShapeOptImpl.apply(
                x, self.normalized_shape, self.weight, self.bias, self.eps
            )
        else:
            return F.layer_norm(
                x, self.normalized_shape, self.weight, self.bias, self.eps
            )

# To load auto tuned cache:
from apex.contrib.openfold_triton._layer_norm_config_ampere import _auto_tuned_config_ampere
from apex.contrib.openfold_triton._layer_norm_config_hopper import _auto_tuned_config_hopper
from apex.contrib.openfold_triton import _tuneable_triton_kernels

def load_triton_auto_tuned_cache(dap_size: int, arch_type: str) -> None:
    auto_tuned_config = {
        "hopper": _auto_tuned_config_hopper,
        "ampere": _auto_tuned_config_ampere,
    }[arch_type]
    config_for_current_dap = auto_tuned_config[dap_size]
    for func_name, cache in config_for_current_dap.items():
        _tuneable_triton_kernels[func_name].cache = cache

load_triton_auto_tuned_cache(
    dap_size=4,  # supported values: 0, 1, 2, 4, 8
    arch_type="hopper",
)

```

## FusedAdamSWA

```python
from apex.contrib.openfold_triton.fused_adam_swa import FusedAdamSWA

fused_optimizer = FusedAdamSWA.from_optim(
    adam_optimizer=adam_optimizer,  # standard pytorch optimizer
    fp32_params=fp32_params,  # FP32 used in weight update
    bf16_params=bf16_params,  # BF16 used in forward, backward, reduction
    swa_params=swa_params,  # SWA used for evaluation
    swa_decay_rate=swa_decay_rate,  # for example: 0.9, 0.99, 0.999
)

fused_optimizer.step()  # fused optimizer step: casting BF16/FP32 + param updates + SWA

```


================================================
FILE: apex/contrib/openfold_triton/__init__.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

import json
import warnings
from collections import OrderedDict
from copy import deepcopy
from io import BytesIO
from typing import BinaryIO, Union

import torch
from triton.runtime.autotuner import Autotuner, Config, Heuristics
from triton.runtime.jit import JITFunction

from apex.contrib.openfold_triton._layer_norm_backward_kernels import (
    _layer_norm_backward_dw_db_partial,
    _layer_norm_backward_dw_db_partial_strided,
    _layer_norm_backward_dx,
    _layer_norm_backward_dx_strided,
)
from apex.contrib.openfold_triton._layer_norm_forward_kernels import (
    _layer_norm_forward,
    _layer_norm_forward_strided,
)
from apex.contrib.openfold_triton.layer_norm import LayerNormSmallShapeOptImpl
from apex.contrib.openfold_triton.mha import (
    AttnBiasJIT,
    AttnNoBiasJIT,
    AttnTri,
    CanSchTriMHA,
)

__all__ = (
    "LayerNormSmallShapeOptImpl",
    "sync_triton_auto_tune_cache_across_gpus",
    "CanSchTriMHA",
    "AttnTri",
    "AttnBiasJIT",
    "AttnNoBiasJIT",
)


def _get_tuneable_triton_func_name(f: Union[Autotuner, Heuristics, JITFunction]) -> str:
    if isinstance(f, JITFunction):
        return f.__name__
    else:
        return _get_tuneable_triton_func_name(f.fn)


_tuneable_triton_kernels = OrderedDict(
    (_get_tuneable_triton_func_name(func), func)
    for func in (
        _layer_norm_backward_dw_db_partial,
        _layer_norm_backward_dw_db_partial_strided,
        _layer_norm_backward_dx,
        _layer_norm_backward_dx_strided,
        _layer_norm_forward,
        _layer_norm_forward_strided,
    )
)


def _save_triton_auto_tune_cache(strict: bool = True, verbose: bool = False) -> BytesIO:
    caches = OrderedDict()
    for func_name, func in _tuneable_triton_kernels.items():
        if len(func.cache) < 1:
            msg = f"Triton JIT kernel {func_name} didn't have tuning cache"
            if strict:
                raise ValueError(msg)
            else:
                warnings.warn(msg)
        else:
            caches[func_name] = [
                (keys, vals.all_kwargs())
                for keys, vals in zip(func.cache.keys(), func.cache.values())
            ]
    f = BytesIO(json.dumps(caches).encode("utf-8"))
    if verbose:
        print(f"Triton kernel auto-tuning caches written to {f}")
    return f


def _load_triton_auto_tune_cache(f: BinaryIO, strict: bool = True, verbose: bool = False) -> None:
    caches = json.load(f)
    if strict:
        loaded_func_name = set(caches.keys())
        tuneable_func_name = set(_tuneable_triton_kernels.keys())
        if loaded_func_name != tuneable_func_name:
            raise ValueError(
                f"Tuneable Triton kernels don't match with provided auto-tuning cache file {f}\n"
                f"Missing kernel caches: {tuneable_func_name - loaded_func_name}\n"
                f"Unexpected kernel caches: {loaded_func_name - tuneable_func_name}"
            )
    for func_name, func_cache in caches.items():
        if func_name not in _tuneable_triton_kernels:
            raise ValueError(f"{func_name} from {f} doesn't match any tuneable Triton kernels")
        for key, val in func_cache:
            _tuneable_triton_kernels[func_name].cache[tuple(key)] = Config(val)
    if verbose:
        print(f"Triton kernel auto-tuning caches loaded from {f}")


def sync_triton_auto_tune_cache_across_gpus(strict: bool = True, verbose: bool = False) -> None:
    if not torch.distributed.is_initialized():
        return
    if torch.distributed.get_rank() == 0:
        print("Broadcasting Triton auto-tuning cache from rank 0 to other ranks...")
        cache = _save_triton_auto_tune_cache(strict=strict, verbose=verbose)
        cache.seek(0)
        cache_list = [
            cache,
        ]
    else:
        print(
            f"Rank {torch.distributed.get_rank()} is waiting for Triton auto-tuning cache from rank 0..."
        )
        cache_list = [
            None,
        ]
    torch.distributed.broadcast_object_list(cache_list)
    _load_triton_auto_tune_cache(cache_list[0], strict=strict, verbose=verbose)
    print("Succeed!")


================================================
FILE: apex/contrib/openfold_triton/_layer_norm_backward_kernels.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

import torch
import triton
import triton.language as tl
from triton import Config

# %% Constants for efficient memory access.
CACHE_SECTOR_SIZE = 32 * 8
BF16_LOAD_SIZE = int(CACHE_SECTOR_SIZE / torch.finfo(torch.bfloat16).bits)
PARTIAL_REDUCE_MIN = 32


# %% Separated backward kernels for contiguous inputs. We choose to not fusing them because dX and
# d{W, b} reduce along different directions.
@triton.autotune(
    configs=[
        Config({"M_BLOCK": 1}, num_warps=1),
        Config({"M_BLOCK": 2}, num_warps=1),
        Config({"M_BLOCK": 4}, num_warps=2),
        Config({"M_BLOCK": 8}, num_warps=4),
        Config({"M_BLOCK": 16}, num_warps=8),
        Config({"M_BLOCK": 32}, num_warps=8),
        Config({"M_BLOCK": 64}, num_warps=8),
    ],
    key=["M", "N"],
)
@triton.heuristics(
    values={
        "N_BLOCK": lambda kwargs: triton.next_power_of_2(kwargs["N"]),
    },
)
@triton.jit
def _layer_norm_backward_dx(
    dy_ptr,
    x_ptr,
    w_ptr,
    x_invstd_ptr,
    x_mean_ptr,
    dx_ptr,
    M: tl.constexpr,
    N: tl.constexpr,
    M_BLOCK: tl.constexpr,
    N_BLOCK: tl.constexpr,
):
    m_idx = (tl.program_id(0) * M_BLOCK + tl.arange(0, M_BLOCK))[:, None]
    m_mask = m_idx < M
    n_idx = tl.arange(0, N_BLOCK)[None, :]
    n_mask = n_idx < N
    mask = m_mask & n_mask
    x = tl.load(x_ptr + N * m_idx + n_idx, mask, other=0).to(tl.float32)
    x_mean = tl.load(x_mean_ptr + m_idx, m_mask, other=0).to(tl.float32)
    x_invstd = tl.load(x_invstd_ptr + m_idx, m_mask, other=0).to(tl.float32)
    x_hat = (x - x_mean) * x_invstd
    dy = tl.load(dy_ptr + N * m_idx + n_idx, mask, other=0).to(tl.float32)
    w = tl.load(w_ptr + n_idx, n_mask, other=0).to(tl.float32)
    c1 = tl.sum(x_hat * dy * w, axis=1) / N
    c2 = tl.sum(dy * w, axis=1) / N
    dx = x_invstd * (dy * w - c1[:, None] * x_hat - c2[:, None])
    tl.store(dx_ptr + N * m_idx + n_idx, dx, mask)


@triton.autotune(
    configs=[
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN},
            num_warps=2,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 2},
            num_warps=4,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 4},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 8},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 16},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE * 2, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN},
            num_warps=4,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE * 2, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 2},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE * 2, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 4},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE * 2, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 8},
            num_warps=8,
        ),
        Config(
            {
                "N_BLOCK": BF16_LOAD_SIZE * 2,
                "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 16,
            },
            num_warps=8,
        ),
    ],
    key=["M", "N"],
)
@triton.jit
def _layer_norm_backward_dw_db_partial(
    dy_ptr,
    x_ptr,
    x_invstd_ptr,
    x_mean_ptr,
    dw_partial_buf_ptr,
    db_partial_buf_ptr,
    M: tl.constexpr,
    N: tl.constexpr,
    BUF_N_STRIDE: tl.constexpr,
    N_BLOCK: tl.constexpr,
    M_PARTIAL_REDUCE: tl.constexpr,
):
    m_idx = (tl.program_id(0) * M_PARTIAL_REDUCE + tl.arange(0, M_PARTIAL_REDUCE))[:, None]
    m_mask = m_idx < M
    n_idx = tl.program_id(1) * N_BLOCK + tl.arange(0, N_BLOCK)
    n_mask = n_idx < N
    idx = N * m_idx + n_idx[None, :]
    mask = m_mask & n_mask[None, :]
    x = tl.load(x_ptr + idx, mask, other=0).to(tl.float32)
    x_mean = tl.load(x_mean_ptr + m_idx, m_mask, other=0).to(tl.float32)
    x_invstd = tl.load(x_invstd_ptr + m_idx, m_mask, other=0).to(tl.float32)
    x_hat = (x - x_mean) * x_invstd
    dy = tl.load(dy_ptr + idx, mask, other=0).to(tl.float32)
    dw_partial = tl.sum(dy * x_hat, axis=0)
    db_partial = tl.sum(dy, axis=0)
    tl.store(dw_partial_buf_ptr + BUF_N_STRIDE * n_idx + tl.program_id(0), dw_partial, n_mask)
    tl.store(db_partial_buf_ptr + BUF_N_STRIDE * n_idx + tl.program_id(0), db_partial, n_mask)


# %% Backward kernels for noncontiguous inputs. Using similar strided access logic as in forward.
@triton.autotune(
    configs=[
        Config({"M_BLOCK": 1}, num_warps=1),
        Config({"M_BLOCK": 2}, num_warps=1),
        Config({"M_BLOCK": 4}, num_warps=2),
        Config({"M_BLOCK": 8}, num_warps=4),
        Config({"M_BLOCK": 16}, num_warps=8),
        Config({"M_BLOCK": 32}, num_warps=8),
        Config({"M_BLOCK": 64}, num_warps=8),
    ],
    key=["M", "N"],
)
@triton.heuristics(
    values={
        "N_BLOCK": lambda kwargs: triton.next_power_of_2(kwargs["N"]),
    },
)
@triton.jit
def _layer_norm_backward_dx_strided(
    dy_ptr,
    x_ptr,
    w_ptr,
    x_invstd_ptr,
    x_mean_ptr,
    dx_ptr,
    M: tl.constexpr,
    N: tl.constexpr,
    M_BLOCK: tl.constexpr,
    N_BLOCK: tl.constexpr,
    D0: tl.constexpr,
    D1: tl.constexpr,
    D2: tl.constexpr,
    D3: tl.constexpr,
    S0: tl.constexpr,
    S1: tl.constexpr,
    S2: tl.constexpr,
    S3: tl.constexpr,
):
    m_logic_idx = tl.program_id(0) * M_BLOCK + tl.arange(0, M_BLOCK)
    m_mask = m_logic_idx < M
    m_logic_idx_0 = m_logic_idx // (D1 * D2) % D0
    m_logic_idx_1 = m_logic_idx // D2 % D1
    m_logic_idx_2 = m_logic_idx % D2
    m_idx = m_logic_idx_0 * S0 + m_logic_idx_1 * S1 + m_logic_idx_2 * S2
    n_logic_idx = tl.arange(0, N_BLOCK)
    n_mask = n_logic_idx < N
    n_idx = n_logic_idx * S3
    mask = m_mask[:, None] & n_mask[None, :]
    x_idx = m_idx[:, None] + n_idx[None, :]
    x = tl.load(x_ptr + x_idx, mask, other=0).to(tl.float32)
    x_mean = tl.load(x_mean_ptr + m_logic_idx, m_mask, other=0).to(tl.float32)[:, None]
    x_invstd = tl.load(x_invstd_ptr + m_logic_idx, m_mask, other=0).to(tl.float32)[:, None]
    x_hat = (x - x_mean) * x_invstd
    dy_idx = N * m_logic_idx[:, None] + n_logic_idx[None, :]
    dy = tl.load(dy_ptr + dy_idx, mask, other=0).to(tl.float32)
    w = tl.load(w_ptr + n_logic_idx, n_mask, other=0).to(tl.float32)[None, :]
    c1 = tl.sum(x_hat * dy * w, axis=1) / N
    c2 = tl.sum(dy * w, axis=1) / N
    dx = x_invstd * (dy * w - c1[:, None] * x_hat - c2[:, None])
    tl.store(dx_ptr + x_idx, dx, mask)


@triton.autotune(
    configs=[
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN},
            num_warps=2,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 2},
            num_warps=4,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 4},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 8},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 16},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE * 2, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN},
            num_warps=4,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE * 2, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 2},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE * 2, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 4},
            num_warps=8,
        ),
        Config(
            {"N_BLOCK": BF16_LOAD_SIZE * 2, "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 8},
            num_warps=8,
        ),
        Config(
            {
                "N_BLOCK": BF16_LOAD_SIZE * 2,
                "M_PARTIAL_REDUCE": PARTIAL_REDUCE_MIN * 16,
            },
            num_warps=8,
        ),
    ],
    key=["M", "N"],
)
@triton.jit
def _layer_norm_backward_dw_db_partial_strided(
    dy_ptr,
    x_ptr,
    x_invstd_ptr,
    x_mean_ptr,
    dw_partial_buf_ptr,
    db_partial_buf_ptr,
    M: tl.constexpr,
    N: tl.constexpr,
    BUF_N_STRIDE: tl.constexpr,
    N_BLOCK: tl.constexpr,
    M_PARTIAL_REDUCE: tl.constexpr,
    D0: tl.constexpr,
    D1: tl.constexpr,
    D2: tl.constexpr,
    D3: tl.constexpr,
    S0: tl.constexpr,
    S1: tl.constexpr,
    S2: tl.constexpr,
    S3: tl.constexpr,
):
    m_logic_idx = tl.program_id(0) * M_PARTIAL_REDUCE + tl.arange(0, M_PARTIAL_REDUCE)
    m_mask = m_logic_idx < M
    m_logic_idx_0 = m_logic_idx // (D1 * D2) % D0
    m_logic_idx_1 = m_logic_idx // D2 % D1
    m_logic_idx_2 = m_logic_idx % D2
    m_idx = m_logic_idx_0 * S0 + m_logic_idx_1 * S1 + m_logic_idx_2 * S2
    n_logic_idx = tl.program_id(1) * N_BLOCK + tl.arange(0, N_BLOCK)
    n_mask = n_logic_idx < N
    n_idx = n_logic_idx * S3
    mask = m_mask[:, None] & n_mask[None, :]
    x_idx = m_idx[:, None] + n_idx[None, :]
    x = tl.load(x_ptr + x_idx, mask, other=0).to(tl.float32)
    x_mean = tl.load(x_mean_ptr + m_logic_idx, m_mask, other=0).to(tl.float32)[:, None]
    x_invstd = tl.load(x_invstd_ptr + m_logic_idx, m_mask, other=0).to(tl.float32)[:, None]
    x_hat = (x - x_mean) * x_invstd
    dy_idx = N * m_logic_idx[:, None] + n_logic_idx[None, :]
    dy = tl.load(dy_ptr + dy_idx, mask, other=0).to(tl.float32)
    dw_partial = tl.sum(dy * x_hat, axis=0)
    db_partial = tl.sum(dy, axis=0)
    tl.store(
        dw_partial_buf_ptr + BUF_N_STRIDE * n_logic_idx + tl.program_id(0),
        dw_partial,
        n_mask,
    )
    tl.store(
        db_partial_buf_ptr + BUF_N_STRIDE * n_logic_idx + tl.program_id(0),
        db_partial,
        n_mask,
    )


# %% Reduce partial accumulator buffers along the row dimension. Straightforward.
@triton.jit
def _layer_norm_backward_buf_reduce(
    partial_buf_ptr,
    output_ptr,
    N: tl.constexpr,
    M: tl.constexpr,
    N_STRIDE: tl.constexpr,
    M_STRIDE: tl.constexpr,
):
    idx = N_STRIDE * tl.program_id(0) + M_STRIDE * tl.arange(0, M)
    mask = tl.program_id(0) < N
    x = tl.sum(tl.load(partial_buf_ptr + idx, mask, other=0).to(tl.float32), axis=0)
    tl.store(output_ptr + tl.program_id(0), x, mask)


================================================
FILE: apex/contrib/openfold_triton/_layer_norm_config_ampere.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

from triton import Config

# Mapping schema: Dict[
#   dap_size: int, Dict[
#     function_name: str, Dict[
#       input_shape: Tuple[int, int], config: triton.Config
#     ]
#   ]
# ]
_auto_tuned_config_ampere = {
    0: {
        "_layer_norm_backward_dw_db_partial": {
            (65536, 128): Config(
                {"N_BLOCK": 16, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            ),
            (32768, 256): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
        },
        "_layer_norm_backward_dw_db_partial_strided": {
            (65536, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            )
        },
        "_layer_norm_backward_dx": {
            (65536, 128): Config({"M_BLOCK": 4}, num_warps=2, num_stages=2),
            (32768, 256): Config({"M_BLOCK": 4}, num_warps=2, num_stages=2),
        },
        "_layer_norm_backward_dx_strided": {
            (65536, 128): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2)
        },
        "_layer_norm_forward": {
            (65536, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (32768, 256): Config({"M_BLOCK": 16}, num_warps=8, num_stages=2),
        },
        "_layer_norm_forward_strided": {
            (65536, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2)
        },
    },
    2: {
        "_layer_norm_backward_dw_db_partial": {
            (65536, 128): Config(
                {"N_BLOCK": 16, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            ),
            (32768, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (16384, 256): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
        },
        "_layer_norm_backward_dw_db_partial_strided": {
            (32768, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            )
        },
        "_layer_norm_backward_dx": {
            (65536, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
            (32768, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
            (16384, 256): Config({"M_BLOCK": 4}, num_warps=2, num_stages=2),
        },
        "_layer_norm_backward_dx_strided": {
            (32768, 128): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2)
        },
        "_layer_norm_forward": {
            (65536, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (32768, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (16384, 256): Config({"M_BLOCK": 16}, num_warps=8, num_stages=2),
        },
        "_layer_norm_forward_strided": {
            (32768, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2)
        },
    },
    4: {
        "_layer_norm_backward_dw_db_partial": {
            (65536, 128): Config(
                {"N_BLOCK": 16, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            ),
            (16384, 128): Config(
                {"N_BLOCK": 16, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (8192, 256): Config(
                {"N_BLOCK": 16, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
        },
        "_layer_norm_backward_dw_db_partial_strided": {
            (16384, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            )
        },
        "_layer_norm_backward_dx": {
            (65536, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
            (16384, 128): Config({"M_BLOCK": 4}, num_warps=2, num_stages=2),
            (8192, 256): Config({"M_BLOCK": 1}, num_warps=1, num_stages=2),
        },
        "_layer_norm_backward_dx_strided": {
            (16384, 128): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2)
        },
        "_layer_norm_forward": {
            (65536, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (16384, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (8192, 256): Config({"M_BLOCK": 16}, num_warps=8, num_stages=2),
        },
        "_layer_norm_forward_strided": {
            (16384, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2)
        },
    },
    8: {
        "_layer_norm_backward_dw_db_partial": {
            (65536, 128): Config(
                {"N_BLOCK": 16, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            ),
            (8192, 128): Config(
                {"N_BLOCK": 16, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (4096, 256): Config(
                {"N_BLOCK": 16, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
        },
        "_layer_norm_backward_dw_db_partial_strided": {
            (8192, 128): Config({"N_BLOCK": 32, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2)
        },
        "_layer_norm_backward_dx": {
            (65536, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
            (8192, 128): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2),
            (4096, 256): Config({"M_BLOCK": 1}, num_warps=1, num_stages=2),
        },
        "_layer_norm_backward_dx_strided": {
            (8192, 128): Config({"M_BLOCK": 1}, num_warps=1, num_stages=2)
        },
        "_layer_norm_forward": {
            (65536, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (8192, 128): Config({"M_BLOCK": 16}, num_warps=8, num_stages=2),
            (4096, 256): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
        },
        "_layer_norm_forward_strided": {
            (8192, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2)
        },
    },
}

_auto_tuned_config_ampere[1] = _auto_tuned_config_ampere[0]


================================================
FILE: apex/contrib/openfold_triton/_layer_norm_config_hopper.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

from triton import Config

# Mapping schema: Dict[
#   dap_size: int, Dict[
#     function_name: str, Dict[
#       input_shape: Tuple[int, int], config: triton.Config
#     ]
#   ]
# ]
_auto_tuned_config_hopper = {
    0: {
        "_layer_norm_backward_dw_db_partial": {
            (65536, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (32768, 256): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
        },
        "_layer_norm_backward_dw_db_partial_strided": {
            (65536, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            )
        },
        "_layer_norm_backward_dx": {
            (65536, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
            (32768, 256): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
        },
        "_layer_norm_backward_dx_strided": {
            (65536, 128): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2)
        },
        "_layer_norm_forward": {
            (65536, 128): Config({"M_BLOCK": 64}, num_warps=8, num_stages=2),
            (32768, 256): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
        },
        "_layer_norm_forward_strided": {
            (65536, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2)
        },
    },
    2: {
        "_layer_norm_backward_dw_db_partial": {
            (65536, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (32768, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (16384, 256): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
        },
        "_layer_norm_backward_dw_db_partial_strided": {
            (32768, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            )
        },
        "_layer_norm_backward_dx": {
            (65536, 128): Config({"M_BLOCK": 4}, num_warps=2, num_stages=2),
            (32768, 128): Config({"M_BLOCK": 16}, num_warps=8, num_stages=2),
            (16384, 256): Config({"M_BLOCK": 4}, num_warps=2, num_stages=2),
        },
        "_layer_norm_backward_dx_strided": {
            (32768, 128): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2)
        },
        "_layer_norm_forward": {
            (65536, 128): Config({"M_BLOCK": 64}, num_warps=8, num_stages=2),
            (32768, 128): Config({"M_BLOCK": 64}, num_warps=8, num_stages=2),
            (16384, 256): Config({"M_BLOCK": 64}, num_warps=8, num_stages=2),
        },
        "_layer_norm_forward_strided": {
            (32768, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2)
        },
    },
    4: {
        "_layer_norm_backward_dw_db_partial": {
            (65536, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (16384, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (8192, 256): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
        },
        "_layer_norm_backward_dw_db_partial_strided": {
            (16384, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 512}, num_warps=8, num_stages=2
            )
        },
        "_layer_norm_backward_dx": {
            (65536, 128): Config({"M_BLOCK": 4}, num_warps=2, num_stages=2),
            (16384, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (8192, 256): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
        },
        "_layer_norm_backward_dx_strided": {
            (16384, 128): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2)
        },
        "_layer_norm_forward": {
            (65536, 128): Config({"M_BLOCK": 64}, num_warps=8, num_stages=2),
            (16384, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (8192, 256): Config({"M_BLOCK": 16}, num_warps=8, num_stages=2),
        },
        "_layer_norm_forward_strided": {
            (16384, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2)
        },
    },
    8: {
        "_layer_norm_backward_dw_db_partial": {
            (65536, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (8192, 128): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
            (4096, 256): Config(
                {"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2
            ),
        },
        "_layer_norm_backward_dw_db_partial_strided": {
            (8192, 128): Config({"N_BLOCK": 32, "M_PARTIAL_REDUCE": 256}, num_warps=8, num_stages=2)
        },
        "_layer_norm_backward_dx": {
            (65536, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
            (8192, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (4096, 256): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2),
        },
        "_layer_norm_backward_dx_strided": {
            (8192, 128): Config({"M_BLOCK": 2}, num_warps=1, num_stages=2)
        },
        "_layer_norm_forward": {
            (65536, 128): Config({"M_BLOCK": 64}, num_warps=8, num_stages=2),
            (8192, 128): Config({"M_BLOCK": 32}, num_warps=8, num_stages=2),
            (4096, 256): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2),
        },
        "_layer_norm_forward_strided": {
            (8192, 128): Config({"M_BLOCK": 8}, num_warps=4, num_stages=2)
        },
    },
}

_auto_tuned_config_hopper[1] = _auto_tuned_config_hopper[0]


================================================
FILE: apex/contrib/openfold_triton/_layer_norm_forward_kernels.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

from packaging.version import Version

import triton
import triton.language as tl
from triton import Config

if Version("2.0.0") < Version(triton.__version__):
    rsqrt = tl.math.rsqrt
else:
    rsqrt = tl.libdevice.rsqrt


# %% Forward kernel for contiguous inputs.
@triton.autotune(
    configs=[
        Config({"M_BLOCK": 1}, num_warps=1),
        Config({"M_BLOCK": 2}, num_warps=1),
        Config({"M_BLOCK": 4}, num_warps=2),
        Config({"M_BLOCK": 8}, num_warps=4),
        Config({"M_BLOCK": 16}, num_warps=8),
        Config({"M_BLOCK": 32}, num_warps=8),
        Config({"M_BLOCK": 64}, num_warps=8),
    ],
    key=["M", "N"],
)
@triton.heuristics(
    values={
        "N_BLOCK": lambda kwargs: triton.next_power_of_2(kwargs["N"]),
    },
)
@triton.jit
def _layer_norm_forward(
    x_ptr,
    w_ptr,
    b_ptr,
    eps,
    x_invstd_ptr,
    x_mean_ptr,
    y_ptr,
    M: tl.constexpr,
    N: tl.constexpr,
    M_BLOCK: tl.constexpr,
    N_BLOCK: tl.constexpr,
):
    m_idx = tl.program_id(0) * M_BLOCK + tl.arange(0, M_BLOCK)
    m_mask = m_idx < M
    n_idx = tl.arange(0, N_BLOCK)
    n_mask = n_idx < N
    mask = m_mask[:, None] & n_mask[None, :]
    x = tl.load(x_ptr + N * m_idx[:, None] + n_idx[None, :], mask, other=0).to(tl.float32)
    x_mean = tl.sum(x, 1) / N
    tl.store(x_mean_ptr + m_idx, x_mean, m_mask)
    x_bar = x - x_mean[:, None]
    x_var = tl.sum(x_bar * x_bar, 1) / N
    x_invstd = rsqrt(x_var + eps)
    tl.store(x_invstd_ptr + m_idx, x_invstd, m_mask)
    x_hat = x_bar * x_invstd[:, None]
    w = tl.load(w_ptr + n_idx, n_mask, other=0).to(tl.float32)[None, :]
    b = tl.load(b_ptr + n_idx, n_mask, other=0).to(tl.float32)[None, :]
    y = w * x_hat + b
    tl.store(y_ptr + N * m_idx[:, None] + n_idx[None, :], y, mask)


# %% Forward kernel for noncontiguous inputs. Using strided access to avoid extra memory overhead.
@triton.autotune(
    configs=[
        Config({"M_BLOCK": 1}, num_warps=1),
        Config({"M_BLOCK": 2}, num_warps=1),
        Config({"M_BLOCK": 4}, num_warps=2),
        Config({"M_BLOCK": 8}, num_warps=4),
        Config({"M_BLOCK": 16}, num_warps=8),
        Config({"M_BLOCK": 32}, num_warps=8),
        Config({"M_BLOCK": 64}, num_warps=8),
    ],
    key=["M", "N"],
)
@triton.heuristics(
    values={
        "N_BLOCK": lambda kwargs: triton.next_power_of_2(kwargs["N"]),
    },
)
@triton.jit
def _layer_norm_forward_strided(
    x_ptr,
    w_ptr,
    b_ptr,
    eps,
    x_invstd_ptr,
    x_mean_ptr,
    y_ptr,
    M: tl.constexpr,
    N: tl.constexpr,
    M_BLOCK: tl.constexpr,
    N_BLOCK: tl.constexpr,
    D0: tl.constexpr,
    D1: tl.constexpr,
    D2: tl.constexpr,
    D3: tl.constexpr,
    S0: tl.constexpr,
    S1: tl.constexpr,
    S2: tl.constexpr,
    S3: tl.constexpr,
):
    m_logic_idx = tl.program_id(0) * M_BLOCK + tl.arange(0, M_BLOCK)
    m_mask = m_logic_idx < M
    m_logic_idx_0 = m_logic_idx // (D1 * D2) % D0
    m_logic_idx_1 = m_logic_idx // D2 % D1
    m_logic_idx_2 = m_logic_idx % D2
    m_idx = m_logic_idx_0 * S0 + m_logic_idx_1 * S1 + m_logic_idx_2 * S2
    n_logic_idx = tl.arange(0, N_BLOCK)
    n_mask = n_logic_idx < N
    n_idx = n_logic_idx * S3
    mask = m_mask[:, None] & n_mask[None, :]
    x_idx = m_idx[:, None] + n_idx[None, :]
    x = tl.load(x_ptr + x_idx, mask, other=0).to(tl.float32)
    x_mean = tl.sum(x, 1) / N
    tl.store(x_mean_ptr + m_logic_idx, x_mean, m_mask)
    x_bar = x - x_mean[:, None]
    x_var = tl.sum(x_bar * x_bar, 1) / N
    x_invstd = rsqrt(x_var + eps)
    tl.store(x_invstd_ptr + m_logic_idx, x_invstd, m_mask)
    x_hat = x_bar * x_invstd[:, None]
    w = tl.load(w_ptr + n_logic_idx, n_mask, other=0).to(tl.float32)[None, :]
    b = tl.load(b_ptr + n_logic_idx, n_mask, other=0).to(tl.float32)[None, :]
    y = w * x_hat + b
    tl.store(y_ptr + N * m_logic_idx[:, None] + n_logic_idx[None, :], y, mask)


================================================
FILE: apex/contrib/openfold_triton/_mha_kernel.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

import triton
import triton.language as tl


def init_to_zero(name):
    return lambda nargs: nargs[name].zero_()


def get_configs_fwd():
    configs = []
    for num_stages in [0, 1, 2, 3, 4]:
        for block_m in [32, 64, 128]:
            for block_n in [16, 32, 64, 128]:
                if block_n > block_m:
                    continue
                for num_warps in [1, 2, 4, 8]:
                    if 32 * num_warps * 32 > block_m * block_n:
                        continue
                    configs.append(
                        triton.Config(
                            {"BLOCK_M": block_m, "BLOCK_N": block_n},
                            num_stages=num_stages,
                            num_warps=num_warps,
                        )
                    )
    return configs


"""
@triton.autotune(
    configs=get_configs_fwd(), 
    key=['Z', 'H', 'N_CTX', 'H_DIM', 'IS_TRAINING'],
)
"""


@triton.heuristics(
    {
        "EVEN_M": lambda args: args["N_CTX"] % args["BLOCK_M"] == 0,
        "EVEN_N": lambda args: args["N_CTX"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["H_DIM"] == args["BLOCK_DMODEL"],
    }
)
@triton.jit
def _attention_core(
    Q,
    K,
    V,
    Mask,
    Bias,
    sm_scale,
    L,
    M,
    Out,
    stride_qz,
    stride_qh,
    stride_qm,
    stride_qk,
    stride_kz,
    stride_kh,
    stride_kn,
    stride_kk,
    stride_vz,
    stride_vh,
    stride_vk,
    stride_vn,
    stride_oz,
    stride_oh,
    stride_om,
    stride_on,
    stride_bz,
    stride_bh,
    stride_bm,
    stride_bn,
    stride_mz,
    stride_mh,
    stride_mm,
    stride_mn,
    Z,
    H,
    N_CTX,
    H_DIM,
    BATCH,  # 256 8 128 32 1
    inf: tl.constexpr,
    IS_TRAINING: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    use_mask: tl.constexpr,
    use_bias: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
):
    start_m = tl.program_id(0)
    off_hz = tl.program_id(1)
    off_b = off_hz // H
    off_h = off_hz % H
    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_DMODEL)
    off_q = (
        off_b * stride_qz
        + off_h * stride_qh
        + offs_m[:, None] * stride_qm
        + offs_d[None, :] * stride_qk
    )
    off_k = (
        off_b * stride_kz
        + off_h * stride_kh
        + offs_n[None, :] * stride_kn
        + offs_d[:, None] * stride_kk
    )
    off_v = (
        off_b * stride_vz
        + off_h * stride_vh
        + offs_n[:, None] * stride_vk
        + offs_d[None, :] * stride_vn
    )
    # Initialize pointers to Q, K, V
    q_ptrs = Q + off_q
    k_ptrs = K + off_k
    v_ptrs = V + off_v

    # Initialize pointers to bias, mask
    if use_bias:
        batch_2 = Z // BATCH
        off_hz_bias = (off_hz // (batch_2 * H) * H) + (off_hz % H)
        offs_base_bias = off_hz_bias * (N_CTX * N_CTX) + offs_m[:, None] * N_CTX + offs_n[None, :]
        """
        off_b = off_hz // H
        off_h = off_hz % H
        bias_ptrs = Bias + off_b * stride_bz + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :] * stride_bn)
        """

    if use_mask:
        # off_hz_mask = (off_hz // H)
        # offs_base_mask = off_hz_mask * N_CTX
        off_b = off_hz // H
        off_h = off_hz % H
        mask_ptrs = (
            Mask
            + off_b * stride_mz
            + off_h * stride_mh
            + (offs_m[:, None] * stride_mm + offs_n[None, :] * stride_mn)
        )

    # initialize pointer to m and l
    m_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_prev = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
    # load q: it will stay in SRAM throughout
    if EVEN_M & EVEN_N:
        if EVEN_HEADDIM:
            q = tl.load(q_ptrs)
        else:
            q = tl.load(q_ptrs, mask=offs_d[None, :] < H_DIM, other=0.0)
    else:
        if EVEN_HEADDIM:
            q = tl.load(q_ptrs, mask=offs_m[:, None] < N_CTX, other=0.0)
        else:
            q = tl.load(
                q_ptrs,
                mask=(offs_m[:, None] < N_CTX) & (offs_d[None, :] < H_DIM),
                other=0.0,
            )

    # loop over k, v and update accumulator
    #  (start_m + 1) * BLOCK_M
    for start_n in range(0, N_CTX, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        if EVEN_N & EVEN_M:  # If we just do "if EVEN_N", there seems to be some race condition
            if EVEN_HEADDIM:
                k = tl.load(k_ptrs)
            else:
                k = tl.load(k_ptrs, mask=offs_d[:, None] < H_DIM, other=0.0)
        else:
            if EVEN_HEADDIM:
                k = tl.load(k_ptrs, mask=(start_n + offs_n)[None, :] < N_CTX, other=0.0)
            else:
                k = tl.load(
                    k_ptrs,
                    mask=((start_n + offs_n)[None, :] < N_CTX) & (offs_d[:, None] < H_DIM),
                    other=0.0,
                )

        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)

        # qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf"))
        if use_bias:
            qk += tl.dot(q * sm_scale.to(tl.bfloat16), k).to(tl.bfloat16)
            qk += tl.where((start_n + offs_n)[None, :] < N_CTX, 0, -inf).to(tl.bfloat16)
            if EVEN_M & EVEN_N:
                bias_data = tl.load(Bias + offs_base_bias + start_n)
            else:
                bias_load_mask = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
                bias_load_mask = tl.where(offs_m[:, None] >= N_CTX, 1.0, bias_load_mask)
                bias_load_mask = tl.where((start_n + offs_n)[None, :] >= N_CTX, 1.0, bias_load_mask)
                bias_data = tl.load(
                    Bias + offs_base_bias + start_n,
                    mask=(bias_load_mask == 0.0),
                    other=0.0,
                )
            qk = qk + bias_data
        else:
            qk += tl.dot(q, k)
            qk += tl.where((start_n + offs_n)[None, :] < N_CTX, 0, -inf)

        qk = qk.to(tl.bfloat16)

        if use_mask:
            if EVEN_M & EVEN_N:
                mask_data = tl.load(mask_ptrs + start_n).to(tl.int32)
            else:
                mask_data = tl.load(
                    mask_ptrs + start_n,
                    mask=(offs_m[:, None] < N_CTX) & ((start_n + offs_n)[None, :] < N_CTX),
                    other=0,
                ).to(tl.int32)
            qk += tl.where(mask_data == 0, -inf, 0.0)

        if use_bias:
            # compute new m
            m_curr = tl.maximum(tl.max(qk, 1), m_prev)
            # correct old l
            l_prev *= tl.exp(m_prev - m_curr)
            # attention weights
            p = tl.exp(qk - m_curr[:, None])
        else:
            m_curr = tl.maximum(tl.max(qk, 1) * sm_scale, m_prev)
            l_prev *= tl.exp(m_prev - m_curr)
            p = tl.exp(qk * sm_scale - m_curr[:, None])

        l_curr = tl.sum(p, 1) + l_prev
        # rescale operands of matmuls
        l_rcp = 1.0 / l_curr
        p *= l_rcp[:, None]
        acc *= (l_prev * l_rcp)[:, None]
        # update acc
        p = p.to(Q.dtype.element_ty)

        if EVEN_N & EVEN_M:  # If we just do "if EVEN_N", there seems to be some race condition
            if EVEN_HEADDIM:
                v = tl.load(v_ptrs)
            else:
                v = tl.load(v_ptrs, mask=offs_d[None, :] < H_DIM, other=0.0)
        else:
            if EVEN_HEADDIM:
                v = tl.load(v_ptrs, mask=(start_n + offs_n)[:, None] < N_CTX, other=0.0)
            else:
                v = tl.load(
                    v_ptrs,
                    mask=((start_n + offs_n)[:, None] < N_CTX) & (offs_d[None, :] < H_DIM),
                    other=0.0,
                )
        acc += tl.dot(p, v)
        # update m_i and l_i
        l_prev = l_curr
        m_prev = m_curr
        # update pointers
        k_ptrs += BLOCK_N * stride_kn
        v_ptrs += BLOCK_N * stride_vk
    # rematerialize offsets to save registers
    start_m = tl.program_id(0)
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # write back l and m
    if IS_TRAINING:
        l_ptrs = L + off_hz * N_CTX + offs_m
        m_ptrs = M + off_hz * N_CTX + offs_m
        tl.store(l_ptrs, l_prev)
        tl.store(m_ptrs, m_prev)
    # initialize pointers to output
    offs_n = tl.arange(0, BLOCK_DMODEL)
    off_o = (
        off_b * stride_oz
        + off_h * stride_oh
        + offs_m[:, None] * stride_om
        + offs_n[None, :] * stride_on
    )
    out_ptrs = Out + off_o
    if EVEN_M:
        if EVEN_HEADDIM:
            tl.store(out_ptrs, acc.to(Q.dtype.element_ty))
        else:
            tl.store(out_ptrs, acc.to(Q.dtype.element_ty), mask=offs_n[None, :] < H_DIM)
    else:
        if EVEN_HEADDIM:
            tl.store(out_ptrs, acc.to(Q.dtype.element_ty), mask=offs_m[:, None] < N_CTX)
        else:
            tl.store(
                out_ptrs,
                acc.to(Q.dtype.element_ty),
                mask=(offs_m[:, None] < N_CTX) & (offs_n[None, :] < H_DIM),
            )
    # tl.store(out_ptrs, acc.to(Q.dtype.element_ty), mask=out_store_mask)


@triton.jit
def _bwd_preprocess(
    Out,
    DO,
    L,
    NewDO,
    Delta,
    stride_ob,
    stride_oh,
    stride_om,
    stride_ok,
    stride_dob,
    stride_doh,
    stride_dom,
    stride_dok,
    BLOCK_M: tl.constexpr,
    D_HEAD: tl.constexpr,
):
    off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    off_n = tl.arange(0, D_HEAD)
    # load
    o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
    do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
    denom = tl.load(L + off_m).to(tl.float32)
    # compute
    do = do / denom[:, None]
    delta = tl.sum(o * do, axis=1)
    # write-back
    tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do)
    tl.store(Delta + off_m, delta)


def get_configs_bwd():
    configs = []
    for num_stages in [0, 1, 2, 3, 4]:
        for block_m in [32, 64, 128]:
            for block_n in [16, 32, 64, 128]:
                if block_n > block_m:
                    continue
                for num_warps in [1, 2, 4, 8]:
                    if 32 * num_warps * 32 > block_m * block_n:
                        continue
                    configs.append(
                        triton.Config(
                            {"BLOCK_M": block_m, "BLOCK_N": block_n},
                            num_stages=num_stages,
                            num_warps=num_warps,
                            pre_hook=init_to_zero("DQ"),
                        )
                    )
    return configs


"""
@triton.autotune(
    configs=get_configs_bwd(),
    key=['Z', 'H', 'N_CTX', 'H_DIM'],
)
"""


@triton.heuristics(
    {
        "EVEN_M": lambda args: args["N_CTX"] % args["BLOCK_M"] == 0,
        "EVEN_N": lambda args: args["N_CTX"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["H_DIM"] == args["BLOCK_DMODEL"],
    }
)
@triton.jit
def _bwd_kernel(
    Q,
    K,
    V,
    Mask,
    Bias,
    sm_scale,
    Out,
    DO,
    DQ,
    DK,
    DV,
    DP,
    L,
    M,
    D,
    stride_qz,
    stride_qh,
    stride_qm,
    stride_qk,
    stride_kz,
    stride_kh,
    stride_kn,
    stride_kk,
    stride_vz,
    stride_vh,
    stride_vk,
    stride_vn,
    stride_mz,
    stride_mh,
    stride_mm,
    stride_mn,
    stride_bz,
    stride_bh,
    stride_bm,
    stride_bn,
    stride_dpz,
    stride_dph,
    stride_dpm,
    stride_dpn,
    stride_dob,
    stride_doh,
    stride_dom,
    stride_dok,
    stride_dqb,
    stride_dqh,
    stride_dqm,
    stride_dqk,
    stride_dkb,
    stride_dkh,
    stride_dkn,
    stride_dkk,
    stride_dvb,
    stride_dvh,
    stride_dvn,
    stride_dvk,
    Z,
    H,
    N_CTX,
    H_DIM,
    # num_block,
    inf: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    use_mask: tl.constexpr,
    use_bias: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    SEQUENCE_PARALLEL: tl.constexpr,
):
    off_hz = tl.program_id(0)
    off_b = off_hz // H
    off_h = off_hz % H

    # offset pointers for batch/head
    Q += off_b * stride_qz + off_h * stride_qh
    K += off_b * stride_kz + off_h * stride_kh
    V += off_b * stride_vz + off_h * stride_vh
    DO += off_b * stride_dob + off_h * stride_doh
    DQ += off_b * stride_dqb + off_h * stride_dqh
    DK += off_b * stride_dkb + off_h * stride_dkh
    DV += off_b * stride_dvb + off_h * stride_dvh
    DP += off_b * stride_dpz + off_h * stride_dph

    if use_bias:
        Bias += off_b * stride_bz + off_h * stride_bh
    if use_mask:
        # offs_base_mask = off_b * N_CTX
        Mask += off_b * stride_mz + off_h * stride_mh

    num_block_n = tl.cdiv(N_CTX, BLOCK_N)
    for start_n in range(0, num_block_n):
        # lo = start_n * BLOCK_M
        lo = 0
        # initialize row/col offsets
        offs_qm = lo + tl.arange(0, BLOCK_M)
        offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)  # BLOCK_M
        offs_m = tl.arange(0, BLOCK_M)  # BLOCK_N
        offs_k = tl.arange(0, BLOCK_DMODEL)
        # initialize pointers to value-like data
        q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
        k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
        v_ptrs = V + (offs_n[:, None] * stride_vk + offs_k[None, :] * stride_vn)
        do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_k[None, :] * stride_dok)
        dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_k[None, :] * stride_dqk)
        dp_ptrs = DP + (offs_qm[:, None] * stride_dpm + offs_n[None, :] * stride_dpn)
        if use_bias:
            b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :] * stride_bn)
        if use_mask:
            mask_ptrs = Mask + (offs_qm[:, None] * stride_mm + offs_n[None, :] * stride_mn)
        # pointer to row-wise quantities in value-like data
        D_ptrs = D + off_hz * N_CTX
        m_ptrs = M + off_hz * N_CTX
        # initialize dv amd dk
        dv = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)  # BLOCK_M
        dk = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)  # BLOCK_M
        # k and v stay in SRAM throughout
        if EVEN_N & EVEN_M:
            if EVEN_HEADDIM:
                k = tl.load(k_ptrs)
                v = tl.load(v_ptrs)
            else:
                k = tl.load(k_ptrs, mask=offs_k[None, :] < H_DIM, other=0.0)
                v = tl.load(v_ptrs, mask=offs_k[None, :] < H_DIM, other=0.0)
        else:
            if EVEN_HEADDIM:
                k = tl.load(k_ptrs, mask=offs_n[:, None] < N_CTX, other=0.0)
                v = tl.load(v_ptrs, mask=offs_n[:, None] < N_CTX, other=0.0)
            else:
                k = tl.load(
                    k_ptrs,
                    mask=(offs_n[:, None] < N_CTX) & (offs_k[None, :] < H_DIM),
                    other=0.0,
                )
                v = tl.load(
                    v_ptrs,
                    mask=(offs_n[:, None] < N_CTX) & (offs_k[None, :] < H_DIM),
                    other=0.0,
                )
        # loop over rows
        num_block_m = tl.cdiv(N_CTX, BLOCK_M)
        for start_m in range(lo, num_block_m * BLOCK_M, BLOCK_M):
            start_m = tl.multiple_of(start_m, BLOCK_M)
            offs_m_curr = start_m + offs_m
            # load q, k, v, do on-chip
            if EVEN_M & EVEN_HEADDIM:
                q = tl.load(q_ptrs)
            else:
                if EVEN_HEADDIM:
                    q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < N_CTX, other=0.0)
                else:
                    q = tl.load(
                        q_ptrs,
                        mask=(offs_m_curr[:, None] < N_CTX) & (offs_k[None, :] < H_DIM),
                        other=0.0,
                    )
            # recompute p = softmax(qk, dim=-1).T
            # NOTE: `do` is pre-divided by `l`; no normalization here
            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
            qk += tl.dot(q, tl.trans(k))

            if use_bias:
                tl.debug_barrier()  # Race condition otherwise
                if EVEN_M & EVEN_N:
                    bias = tl.load(b_ptrs).to(tl.float32)
                else:
                    bias = tl.load(
                        b_ptrs,
                        mask=(offs_m_curr[:, None] < N_CTX) & (offs_n[None, :] < N_CTX),
                        other=0.0,
                    ).to(tl.float32)
                qk = qk * sm_scale + bias

            if use_mask:
                # tl.debug_barrier()  # Race condition otherwise
                # qk = tl.where(offs_m_curr[:, None] >= N_CTX, float("-1e20"), qk)
                # qk = tl.where(offs_n[None, :] >= N_CTX, float("-1e20"), qk)
                # mask_data = tl.load(Mask + offs_base_mask + offs_n)
                # qk = tl.where(mask_data[None, :] == 0., float("-1e20"), qk)
                if EVEN_M & EVEN_N:
                    mask_data = tl.load(mask_ptrs).to(tl.float32)
                else:
                    mask_data = tl.load(
                        mask_ptrs,
                        mask=(offs_m_curr[:, None] < N_CTX) & (offs_n[None, :] < N_CTX),
                        other=0.0,
                    ).to(tl.float32)

                qk += tl.where(mask_data == 0.0, -inf, 0.0)
                # qk = tl.where(mask_data == 0., -inf, qk)

            m = tl.load(m_ptrs + offs_m_curr)
            if use_bias:
                p = tl.exp(qk - m[:, None])
            else:
                p = tl.exp(qk * sm_scale - m[:, None])
            # compute dv
            if EVEN_M & EVEN_HEADDIM:
                do = tl.load(do_ptrs)  # .to(tl.float32)
            else:
                do = tl.load(
                    do_ptrs,
                    mask=(offs_m_curr[:, None] < N_CTX) & (offs_k[None, :] < H_DIM),
                    other=0.0,
                )

            dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do)
            # compute dp = dot(v, do)
            Di = tl.load(D_ptrs + offs_m_curr)

            dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
            dp += tl.dot(do, tl.trans(v))

            # compute ds = p * (dp - delta[:, None])
            ds = p * dp
            if use_bias:
                tl.store(dp_ptrs, ds)
            ds = ds * sm_scale

            # compute dk = dot(ds.T, q)
            dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q)

            # compute dq
            # can we remove .to(tl.float32)
            if EVEN_M & EVEN_HEADDIM:  # Race condition if we just do EVEN_M
                dq = tl.load(dq_ptrs).to(tl.float32)
                dq += tl.dot(ds.to(Q.dtype.element_ty), k)
                tl.store(dq_ptrs, dq)
            else:
                if EVEN_HEADDIM:
                    dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < N_CTX, other=0.0).to(
                        tl.float32
                    )
                    dq += tl.dot(ds.to(Q.dtype.element_ty), k)
                    tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < N_CTX)
                else:
                    dq = tl.load(
                        dq_ptrs,
                        mask=(offs_m_curr[:, None] < N_CTX) & (offs_k[None, :] < H_DIM),
                        other=0.0,
                    ).to(tl.float32)
                    dq += tl.dot(ds.to(Q.dtype.element_ty), k)
                    tl.store(
                        dq_ptrs,
                        dq,
                        mask=(offs_m_curr[:, None] < N_CTX) & (offs_k[None, :] < H_DIM),
                    )
            # increment pointers
            dq_ptrs += BLOCK_M * stride_dqm
            q_ptrs += BLOCK_M * stride_qm
            do_ptrs += BLOCK_M * stride_dom

            dp_ptrs += BLOCK_M * stride_dpm
            if use_bias:
                b_ptrs += BLOCK_M * stride_bm
            if use_mask:
                mask_ptrs += BLOCK_M * stride_mm
        # write-back
        dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_k[None, :] * stride_dvk)
        dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_k[None, :] * stride_dkk)

        if EVEN_N & EVEN_M:
            if EVEN_HEADDIM:
                tl.store(dv_ptrs, dv)
                tl.store(dk_ptrs, dk)
            else:
                tl.store(dv_ptrs, dv, mask=offs_k[None, :] < H_DIM)
                tl.store(dk_ptrs, dk, mask=offs_k[None, :] < H_DIM)
        else:
            if EVEN_HEADDIM:
                tl.store(dv_ptrs, dv, mask=offs_n[:, None] < N_CTX)
                tl.store(dk_ptrs, dk, mask=offs_n[:, None] < N_CTX)
            else:
                tl.store(
                    dv_ptrs,
                    dv,
                    mask=(offs_n[:, None] < N_CTX) & (offs_k[None, :] < H_DIM),
                )
                tl.store(
                    dk_ptrs,
                    dk,
                    mask=(offs_n[:, None] < N_CTX) & (offs_k[None, :] < H_DIM),
                )


================================================
FILE: apex/contrib/openfold_triton/fused_adam_swa.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

from __future__ import annotations

from collections import defaultdict
from enum import Enum, unique
from itertools import chain
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import triton
import triton.language as tl
from torch.optim import Adam, Optimizer

# The most common parameter size in open-fold.
CHUNK_SIZE = torch.tensor(128, dtype=torch.int64)


# Data type enumerates. tl.constexpr arg doesn't accept Triton data types.
@unique
class _DTypeEnum(Enum):
    FP16 = 0
    BF16 = 1
    FP32 = 2
    FP64 = 3


_TORCH2DTYPE = {
    torch.float16: _DTypeEnum.FP16,
    torch.bfloat16: _DTypeEnum.BF16,
    torch.float32: _DTypeEnum.FP32,
    torch.float64: _DTypeEnum.FP64,
}


_DTYPE2TRITON = {
    _DTypeEnum.FP16: tl.float16,
    _DTypeEnum.BF16: tl.bfloat16,
    _DTypeEnum.FP32: tl.float32,
    _DTypeEnum.FP64: tl.float64,
}


# Adam math impl enumerates. There're minor impl differences between Apex and official PyTorch.
@unique
class AdamMathType(Enum):
    ApexAdam = 0
    ApexAdamW = 1
    PyTorchAdam = 2


@triton.jit
def _adam_math(
    param,
    grad,
    moment,
    velocity,
    beta1,
    beta2,
    beta1_correction,
    beta2_correction,
    eps,
    lr,
    weight_decay,
    adam_math_mode: tl.constexpr,
):
    if adam_math_mode == tl.constexpr(AdamMathType.ApexAdam.value):
        grad += weight_decay * param
        moment *= beta1
        moment += (1.0 - beta1) * grad
        velocity *= beta2
        velocity += (1.0 - beta2) * grad * grad
        update = (moment / beta1_correction) / (tl.math.sqrt(velocity / beta2_correction) + eps)
        param -= lr * update
    elif adam_math_mode == tl.constexpr(AdamMathType.ApexAdamW.value):
        moment *= beta1
        moment += (1.0 - beta1) * grad
        velocity *= beta2
        velocity += (1.0 - beta2) * grad * grad
        update = (moment / beta1_correction) / (tl.math.sqrt(velocity / beta2_correction) + eps)
        update += weight_decay * param
        param -= lr * update
    elif adam_math_mode == tl.constexpr(AdamMathType.PyTorchAdam.value):
        grad += weight_decay * param
        moment *= beta1
        moment += (1.0 - beta1) * grad
        velocity *= beta2
        velocity += (1.0 - beta2) * grad * grad
        # PyTorch computes step_size and denominator separately so it can use addcdiv later.
        step_size = -lr / beta1_correction
        beta2_correction_sqrt = tl.math.sqrt(beta2_correction)
        denom = tl.math.sqrt(velocity) / beta2_correction_sqrt + eps
        param += step_size * (moment / denom)
    else:
        raise ValueError(f"Unknown Adam math mode: {adam_math_mode}")
    return param, moment, velocity


# OpenFold model doesn't use buffers, so only update parameters.
@triton.jit
def _swa_math(
    param,
    swa_param,
    decay_rate,
    n_averaged,
):
    if n_averaged == 0:
        swa_param = param
    else:
        swa_param += (1.0 - decay_rate) * (param - swa_param)
    return swa_param


@triton.jit
def _multi_tensor_adam_swa(
    state_param_ptr_per_chunk,
    compute_param_ptr_per_chunk,
    swa_param_ptr_per_chunk,
    grad_ptr_per_chunk,
    moment_ptr_per_chunk,
    velocity_ptr_per_chunk,
    chunk_local_idx_ptr,
    chunk_numel_ptr,
    grad_clip_scale_ptr,
    lr,
    beta1,
    beta2,
    eps,
    weight_decay,
    beta1_correction,
    beta2_correction,
    swa_decay_rate,
    swa_n_averaged,
    adam_math_mode: tl.constexpr,
    MODEL_COMPUTE_DTYPE: tl.constexpr,
    MODEL_STATE_DTYPE: tl.constexpr,
    CHUNK_SIZE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    chunk_idx = tl.program_id(0)
    chunk_local_idx = tl.load(chunk_local_idx_ptr + chunk_idx)
    chunk_numel = tl.load(chunk_numel_ptr + chunk_idx)

    compute_dtype = _DTYPE2TRITON[MODEL_COMPUTE_DTYPE.value]
    compute_pointer_type = tl.pointer_type(compute_dtype)
    state_dtype = _DTYPE2TRITON[MODEL_STATE_DTYPE.value]
    state_pointer_type = tl.pointer_type(state_dtype)

    state_param_ptr = tl.load(state_param_ptr_per_chunk + chunk_idx).to(state_pointer_type)
    swa_param_ptr = tl.load(swa_param_ptr_per_chunk + chunk_idx).to(state_pointer_type)
    moment_ptr = tl.load(moment_ptr_per_chunk + chunk_idx).to(state_pointer_type)
    velocity_ptr = tl.load(velocity_ptr_per_chunk + chunk_idx).to(state_pointer_type)
    compute_param_ptr = tl.load(compute_param_ptr_per_chunk + chunk_idx).to(compute_pointer_type)
    grad_ptr = tl.load(grad_ptr_per_chunk + chunk_idx).to(compute_pointer_type)
    grad_clip_scale = tl.load(grad_clip_scale_ptr)

    ptr_base_offset = chunk_local_idx * CHUNK_SIZE
    state_param_ptr += ptr_base_offset
    compute_param_ptr += ptr_base_offset
    swa_param_ptr += ptr_base_offset
    grad_ptr += ptr_base_offset
    moment_ptr += ptr_base_offset
    velocity_ptr += ptr_base_offset

    for i in range(0, CHUNK_SIZE, BLOCK_SIZE):
        idx = i + tl.arange(0, BLOCK_SIZE)
        mask = idx < chunk_numel
        # Gradient clip step.
        grad = tl.load(grad_ptr + idx, mask).to(state_dtype)
        grad *= grad_clip_scale
        # Adam step.
        param = tl.load(state_param_ptr + idx, mask)
        moment = tl.load(moment_ptr + idx, mask)
        velocity = tl.load(velocity_ptr + idx, mask)
        param, moment, velocity = _adam_math(
            param=param,
            grad=grad,
            moment=moment,
            velocity=velocity,
            beta1=beta1,
            beta2=beta2,
            beta1_correction=beta1_correction,
            beta2_correction=beta2_correction,
            eps=eps,
            lr=lr,
            weight_decay=weight_decay,
            adam_math_mode=adam_math_mode,
        )
        # SWA step.
        swa_param = tl.load(swa_param_ptr + idx, mask)
        swa_param = _swa_math(
            param=param,
            swa_param=swa_param,
            decay_rate=swa_decay_rate,
            n_averaged=swa_n_averaged,
        )
        # Write results. BF16 and SWA parameters are updated as well.
        tl.store(state_param_ptr + idx, param, mask)
        tl.store(moment_ptr + idx, moment, mask)
        tl.store(velocity_ptr + idx, velocity, mask)
        tl.store(compute_param_ptr + idx, param, mask)
        tl.store(swa_param_ptr + idx, swa_param, mask)


# Note:
# - Gradients are attached to BF16 tensors
# - Assume all parameters are all updated at each step, i.e., they share the same step number
class FusedAdamSWA(Optimizer):
    def __init__(
        self,
        params: List[nn.Parameter],
        compute_params: List[nn.Parameter],
        swa_params: List[nn.Parameter],
        swa_decay_rate: float,
        lr: float = 1e-3,
        bias_correction: bool = True,
        betas: Tuple[float, float] = (0.9, 0.999),
        eps: float = 1e-8,
        adam_math_mode: AdamMathType = AdamMathType.PyTorchAdam,
        weight_decay: float = 0.0,
        amsgrad: bool = False,
        set_grad_none: bool = True,
        capturable: bool = False,
        master_weights: bool = False,
    ):
        if not isinstance(params, list):
            params = list(params)
        if not isinstance(compute_params, list):
            compute_params = list(compute_params)
        if not isinstance(swa_params, list):
            swa_params = list(swa_params)
        if not compute_params or not swa_params:
            raise ValueError("FusedAdamSWA requires both BF16 and SWA parameters.")
        if not len(params) == len(compute_params) == len(swa_params):
            raise ValueError(
                "FusedAdamSWA expects params, bf16_params, and swa_params to have same length"
            )
        if not all(
            p.shape == b.shape == s.shape for p, b, s in zip(params, compute_params, swa_params)
        ):
            raise ValueError(
                "FusedAdamSWA expects each state in params, bf16_params, abd swa_params to have same shape"
            )
        if not all(p.dtype == s.dtype for p, s in zip(params, swa_params)):
            raise ValueError("FusedAdamSWA expects all params and swa_params to have same dtype")
        if not all(p.is_contiguous() for p in chain(params, compute_params, swa_params)):
            raise ValueError("FusedAdamSWA expects all input params to be contiguous")
        if amsgrad:
            raise NotImplementedError("amsgrad is not supported by FusedAdamSWA")
        if capturable:
            raise NotImplementedError("capturable is not supported by FusedAdamSWA")
        if master_weights:
            raise NotImplementedError("master_weights is not supported by FusedAdamSWA")
        if not isinstance(adam_math_mode, AdamMathType):
            raise ValueError(
                f"Unknown Adam math mode {adam_math_mode}, expect to be any of:\n"
                f"\t- {AdamMathType.ApexAdam}: NVIDIA Apex Adam math;\n"
                f"\t- {AdamMathType.ApexAdamW}: NVIDIA Apex Adam math with adam_w set to True;\n"
                f"\t- {AdamMathType.PyTorchAdam}: The official PyTorch Adam math.\n"
            )

        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
        )
        super().__init__(params, defaults)
        self.adam_math_mode = adam_math_mode
        self.set_grad_none = set_grad_none
        self.compute_param_groups = [{"params": compute_params}]
        self.swa_param_groups = [{"params": swa_params, "n_averaged": 0}]
        self.swa_decay_rate = swa_decay_rate

        # We assume that parameter and buffer pointers won't change throughout the training, only
        # gradients could be re-allocated due to set_grad_none.
        self._pointer_buffers_initialized = False

    def _build_pointer_buffers(self):
        # Loading checkpoint to optimizer re-allocates param and states, so pointer logic should be
        # at the first step of training, where we assume all states are ready.
        if not all(
            len(pg) == 1
            for pg in (
                self.param_groups,
                self.compute_param_groups,
                self.swa_param_groups,
            )
        ):
            raise RuntimeError("FusedAdamSWA does not support multiple param groups")

        # `bf16_params` contains both BF16 and FP32 data types, thus we have to group parameters
        # and other states into different buffers and launch respective kernels.
        params, compute_params, swa_params = (
            self.param_groups[0]["params"],
            self.compute_param_groups[0]["params"],
            self.swa_param_groups[0]["params"],
        )
        self.pointer_buffer_groups = defaultdict(dict)
        for i, p in enumerate(compute_params):
            compute_dtype = p.dtype
            state_dtype = params[i].dtype
            self.pointer_buffer_groups[(compute_dtype, state_dtype)].setdefault("tensor_idx", [])
            self.pointer_buffer_groups[(compute_dtype, state_dtype)]["tensor_idx"].append(i)

        for (_, state_dtype), buffer_group in self.pointer_buffer_groups.items():
            # Select tensors by dtype.
            t_idx = buffer_group["tensor_idx"]
            params_this_group = [params[i] for i in t_idx]
            compute_params_this_group = [compute_params[i] for i in t_idx]
            swa_params_this_group = [swa_params[i] for i in t_idx]

            # Build parameter pointer buffers.
            param_ptrs = torch.tensor([p.data_ptr() for p in params_this_group], dtype=torch.int64)
            compute_param_ptrs = torch.tensor(
                [b.data_ptr() for b in compute_params_this_group], dtype=torch.int64
            )
            swa_param_ptrs = torch.tensor(
                [s.data_ptr() for s in swa_params_this_group], dtype=torch.int64
            )

            param_numels = torch.tensor([p.numel() for p in params_this_group], dtype=torch.int64)
            chunks_per_param = param_numels.float().div_(CHUNK_SIZE).ceil_().long()
            chunk_local_idx = torch.cat(
                [torch.arange(chunks, dtype=torch.int64) for chunks in chunks_per_param]
            )
            chunk_numel = torch.minimum(
                param_numels.repeat_interleave(chunks_per_param) - chunk_local_idx * CHUNK_SIZE,
                CHUNK_SIZE,
            )
            param_ptr_per_chunk = torch.repeat_interleave(param_ptrs, chunks_per_param)
            compute_param_ptr_per_chunk = torch.repeat_interleave(
                compute_param_ptrs, chunks_per_param
            )
            swa_param_ptr_per_chunk = torch.repeat_interleave(swa_param_ptrs, chunks_per_param)

            device = params_this_group[0].device
            buffer_group["device"] = device
            buffer_group["chunks_per_param"] = chunks_per_param
            buffer_group["chunk_local_idx"] = chunk_local_idx.to(device)
            buffer_group["chunk_numel"] = chunk_numel.to(device)
            buffer_group["param_ptr_per_chunk"] = param_ptr_per_chunk.to(device)
            buffer_group["compute_param_ptr_per_chunk"] = compute_param_ptr_per_chunk.to(device)
            buffer_group["swa_param_ptr_per_chunk"] = swa_param_ptr_per_chunk.to(device)
            buffer_group["total_chunks"] = chunks_per_param.sum().item()
            buffer_group["default_grad_clip_scale"] = torch.tensor(1.0, dtype=state_dtype).to(
                device
            )

            # Build moment pointer buffers.
            moment, velocity = [], []
            for p in params_this_group:
                state = self.state[p]
                if "exp_avg" not in state or "exp_avg_sq" not in state:
                    state["exp_avg"] = torch.zeros_like(p.detach(), dtype=state_dtype)
                    state["exp_avg_sq"] = torch.zeros_like(p.detach(), dtype=state_dtype)
                moment.append(state["exp_avg"].data_ptr())
                velocity.append(state["exp_avg_sq"].data_ptr())
            moment = torch.tensor(moment, dtype=torch.int64)
            velocity = torch.tensor(velocity, dtype=torch.int64)
            buffer_group["exp_avg_ptr_per_chunk"] = torch.repeat_interleave(
                moment, chunks_per_param
            ).to(device)
            buffer_group["exp_avg_sq_ptr_per_chunk"] = torch.repeat_interleave(
                velocity, chunks_per_param
            ).to(device)

        self._pointer_buffers_initialized = True

    def step(
        self,
        closure: Optional[Callable[[], torch.Tensor]] = None,
        grad_clip_scale: Optional[Union[torch.Tensor, float]] = None,
    ):
        if not self._pointer_buffers_initialized:
            self._build_pointer_buffers()

        loss = closure() if closure is not None else None

        group = self.param_groups[0]
        compute_group = self.compute_param_groups[0]
        swa_group = self.swa_param_groups[0]
        if "step" in group:
            group["step"] += 1
        else:
            group["step"] = 1
        (beta1, beta2), step = group["betas"], group["step"]
        if group["bias_correction"]:
            beta1_correction = 1.0 - beta1**step
            beta2_correction = 1.0 - beta2**step
        else:
            beta1_correction = beta2_correction = 1.0

        grad_ptr = []
        for p in compute_group["params"]:
            if p.grad is None:
                continue
            if p.grad.detach().is_sparse:
                raise RuntimeError(
                    "FusedAdamSWA does not support sparse gradients, please consider SparseAdam instead"
                )
            grad_ptr.append(p.grad.data_ptr())

        for (
            compute_dtype,
            state_dtype,
        ), buffer_group in self.pointer_buffer_groups.items():
            device = buffer_group["device"]
            t_idx = buffer_group["tensor_idx"]
            grad_ptr_this_group = [grad_ptr[i] for i in t_idx]
            grad_ptr_this_group = torch.tensor(grad_ptr_this_group, dtype=torch.int64)
            grad_ptr_per_chunk = torch.repeat_interleave(
                grad_ptr_this_group, buffer_group["chunks_per_param"]
            ).to(device, non_blocking=True)
            if grad_clip_scale is None:
                grad_clip_scale_this_group = buffer_group["default_grad_clip_scale"]
            elif not torch.is_tensor(grad_clip_scale):
                grad_clip_scale_this_group = torch.tensor(grad_clip_scale).to(
                    device, non_blocking=True
                )
            else:
                grad_clip_scale_this_group = grad_clip_scale

            grid = (buffer_group["total_chunks"],)
            _multi_tensor_adam_swa[grid](
                state_param_ptr_per_chunk=buffer_group["param_ptr_per_chunk"],
                compute_param_ptr_per_chunk=buffer_group["compute_param_ptr_per_chunk"],
                swa_param_ptr_per_chunk=buffer_group["swa_param_ptr_per_chunk"],
                grad_ptr_per_chunk=grad_ptr_per_chunk,
                moment_ptr_per_chunk=buffer_group["exp_avg_ptr_per_chunk"],
                velocity_ptr_per_chunk=buffer_group["exp_avg_sq_ptr_per_chunk"],
                chunk_local_idx_ptr=buffer_group["chunk_local_idx"],
                chunk_numel_ptr=buffer_group["chunk_numel"],
                grad_clip_scale_ptr=grad_clip_scale_this_group,
                lr=group["lr"],
                beta1=beta1,
                beta2=beta2,
                eps=group["eps"],
                weight_decay=group["weight_decay"],
                beta1_correction=beta1_correction,
                beta2_correction=beta2_correction,
                swa_decay_rate=self.swa_decay_rate,
                swa_n_averaged=swa_group["n_averaged"],
                adam_math_mode=self.adam_math_mode.value,
                MODEL_COMPUTE_DTYPE=_TORCH2DTYPE[compute_dtype],
                MODEL_STATE_DTYPE=_TORCH2DTYPE[state_dtype],
                # TODO: Find optimal hyper-parameters.
                CHUNK_SIZE=CHUNK_SIZE.item(),
                BLOCK_SIZE=128,
                num_warps=1,
            )

        swa_group["n_averaged"] += 1

        return loss

    @classmethod
    def from_optim(
        cls,
        adam_optimizer: Adam,
        fp32_params: List[nn.Parameter],
        bf16_params: List[nn.Parameter],
        swa_params: List[nn.Parameter],
        swa_decay_rate: float,
    ) -> FusedAdamSWA:
        assert len(adam_optimizer.param_groups) == 1
        param_group = adam_optimizer.param_groups[0]
        lr = param_group["lr"]
        betas = param_group["betas"]
        eps = param_group["eps"]
        weight_decay = param_group["weight_decay"]
        amsgrad = param_group["amsgrad"]
        fused_adam_swa_optimizer = cls(
            params=fp32_params,
            compute_params=bf16_params,
            swa_params=swa_params,
            swa_decay_rate=swa_decay_rate,
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
            adam_math_mode=AdamMathType.PyTorchAdam,
        )
        adam_state_dict = adam_optimizer.state_dict()
        adam_state_dict["param_groups"][0].setdefault("bias_correction", True)
        steps = [v["step"] for v in adam_state_dict["state"].values()]
        if len(steps) == 0:  # Did not load optimizer checkpoint.
            steps = [torch.tensor(1)]
        elif not all(s == steps[0] for s in steps):
            raise ValueError("FusedAdamSWA requires all parameters were updated by same steps!")
        step = int(steps[0].item())
        adam_state_dict["param_groups"][0].setdefault("step", step)
        fused_adam_swa_optimizer.load_state_dict(adam_state_dict)
        return fused_adam_swa_optimizer


================================================
FILE: apex/contrib/openfold_triton/layer_norm.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

from math import prod

import torch
import triton
from torch.autograd import Function

from apex.contrib.openfold_triton._layer_norm_backward_kernels import (
    PARTIAL_REDUCE_MIN,
    _layer_norm_backward_buf_reduce,
    _layer_norm_backward_dw_db_partial,
    _layer_norm_backward_dw_db_partial_strided,
    _layer_norm_backward_dx,
    _layer_norm_backward_dx_strided,
)
from apex.contrib.openfold_triton._layer_norm_forward_kernels import (
    _layer_norm_forward,
    _layer_norm_forward_strided,
)

# TODO: Find a more elegant approach to cache tuned results.
_M_BUFSIZE_CACHE = dict()


class LayerNormSmallShapeOptImpl(Function):
    @staticmethod
    def forward(ctx, inputs, normalized_shape, weight, bias, eps=1e-05):
        if not inputs.is_contiguous() and normalized_shape != inputs.shape[-1:]:
            raise ValueError(
                f"This implementation only support normalizing along the last dimension for "
                f"noncontiguous inputs. I.e., we expect "
                f"normalized_shape={tuple(inputs.shape[-1:])}, but got {normalized_shape} instead"
            )
        if not inputs.is_contiguous() and inputs.dim() != 4:
            raise ValueError(
                f"This implementation only supports 4-dim noncontiguous inputs, but got "
                f"{inputs.dim()} instead"
            )

        normalized_degree = len(normalized_shape)
        layer_shape = inputs.shape[:-normalized_degree]
        M, N = prod(layer_shape), prod(normalized_shape)

        x_invstd = torch.empty(M, dtype=torch.float32, device=inputs.device)
        x_mean = torch.empty(M, dtype=torch.float32, device=inputs.device)
        y = torch.empty(inputs.shape, dtype=inputs.dtype, device=inputs.device)

        grid = lambda kwargs: (triton.cdiv(kwargs["M"], kwargs["M_BLOCK"]),)
        if inputs.is_contiguous():
            _layer_norm_forward[grid](
                x_ptr=inputs,
                w_ptr=weight,
                b_ptr=bias,
                eps=eps,
                x_invstd_ptr=x_invstd,
                x_mean_ptr=x_mean,
                y_ptr=y,
                M=M,
                N=N,
            )
        else:
            D0, D1, D2, D3 = inputs.shape
            S0, S1, S2, S3 = inputs.stride()
            _layer_norm_forward_strided[grid](
                x_ptr=inputs,
                w_ptr=weight,
                b_ptr=bias,
                eps=eps,
                x_invstd_ptr=x_invstd,
                x_mean_ptr=x_mean,
                y_ptr=y,
                M=M,
                N=N,
                D0=D0,
                D1=D1,
                D2=D2,
                D3=D3,
                S0=S0,
                S1=S1,
                S2=S2,
                S3=S3,
            )

        ctx.save_for_backward(inputs, weight, x_invstd, x_mean)
        ctx.flatten_shape = M, N
        return y

    @staticmethod
    def backward(ctx, d_y):
        inputs, weight, x_invstd, x_mean = ctx.saved_tensors
        M, N = ctx.flatten_shape
        d_inputs = torch.empty_like(inputs)
        d_weight = torch.empty_like(weight)
        d_bias = torch.empty_like(weight)

        # %% Separated kernels, similar to Inductor.
        # 1. dX.
        grid = lambda kwargs: (triton.cdiv(kwargs["M"], kwargs["M_BLOCK"]),)
        if inputs.is_contiguous():
            _layer_norm_backward_dx[grid](
                dy_ptr=d_y,
                x_ptr=inputs,
                w_ptr=weight,
                x_invstd_ptr=x_invstd,
                x_mean_ptr=x_mean,
                dx_ptr=d_inputs,
                M=M,
                N=N,
            )
        else:
            D0, D1, D2, D3 = inputs.shape
            S0, S1, S2, S3 = inputs.stride()
            _layer_norm_backward_dx_strided[grid](
                dy_ptr=d_y,
                x_ptr=inputs,
                w_ptr=weight,
                x_invstd_ptr=x_invstd,
                x_mean_ptr=x_mean,
                dx_ptr=d_inputs,
                M=M,
                N=N,
                D0=D0,
                D1=D1,
                D2=D2,
                D3=D3,
                S0=S0,
                S1=S1,
                S2=S2,
                S3=S3,
            )
        # 2. dW and db.
        key = (M, N, inputs.is_contiguous())
        M_BUFSIZE = _M_BUFSIZE_CACHE.get(key, triton.cdiv(M, PARTIAL_REDUCE_MIN))
        dw_partial_buf = torch.empty([N, M_BUFSIZE], dtype=torch.float32, device=d_y.device)
        db_partial_buf = torch.empty([N, M_BUFSIZE], dtype=torch.float32, device=d_y.device)
        grid = lambda kwargs: (
            triton.cdiv(M, kwargs["M_PARTIAL_REDUCE"]),
            triton.cdiv(N, kwargs["N_BLOCK"]),
        )
        if inputs.is_contiguous():
            _layer_norm_backward_dw_db_partial[grid](
                dy_ptr=d_y,
                x_ptr=inputs,
                x_invstd_ptr=x_invstd,
                x_mean_ptr=x_mean,
                dw_partial_buf_ptr=dw_partial_buf,
                db_partial_buf_ptr=db_partial_buf,
                M=M,
                N=N,
                BUF_N_STRIDE=M_BUFSIZE,
            )
            M_PARTIAL_REDUCE = _layer_norm_backward_dw_db_partial.best_config.kwargs[
                "M_PARTIAL_REDUCE"
            ]
        else:
            _layer_norm_backward_dw_db_partial_strided[grid](
                dy_ptr=d_y,
                x_ptr=inputs,
                x_invstd_ptr=x_invstd,
                x_mean_ptr=x_mean,
                dw_partial_buf_ptr=dw_partial_buf,
                db_partial_buf_ptr=db_partial_buf,
                M=M,
                N=N,
                BUF_N_STRIDE=M_BUFSIZE,
                D0=D0,
                D1=D1,
                D2=D2,
                D3=D3,
                S0=S0,
                S1=S1,
                S2=S2,
                S3=S3,
            )
            M_PARTIAL_REDUCE = _layer_norm_backward_dw_db_partial_strided.best_config.kwargs[
                "M_PARTIAL_REDUCE"
            ]
        # 2.1. Reduce partial buffers, which can be overlapped.
        M_BUFSIZE = triton.cdiv(M, M_PARTIAL_REDUCE)
        _M_BUFSIZE_CACHE[key] = M_BUFSIZE
        grid = (triton.next_power_of_2(N),)
        _layer_norm_backward_buf_reduce[grid](
            partial_buf_ptr=dw_partial_buf,
            output_ptr=d_weight,
            N=N,
            M=M_BUFSIZE,
            N_STRIDE=dw_partial_buf.stride(0),
            M_STRIDE=dw_partial_buf.stride(1),
            num_warps=1,
        )
        _layer_norm_backward_buf_reduce[grid](
            partial_buf_ptr=db_partial_buf,
            output_ptr=d_bias,
            N=N,
            M=M_BUFSIZE,
            N_STRIDE=db_partial_buf.stride(0),
            M_STRIDE=db_partial_buf.stride(1),
            num_warps=1,
        )

        return d_inputs, None, d_weight, d_bias, None


================================================
FILE: apex/contrib/openfold_triton/mha.py
================================================
# © 2023 NVIDIA CORPORATION & AFFILIATES

import math
from typing import Optional

import torch
import triton
from einops import rearrange

from apex.contrib.openfold_triton._mha_kernel import (
    _attention_core,
    _bwd_kernel,
    _bwd_preprocess,
)

# whether TRITON MHA is enabled or not
_TRI_MHA_ENABLED = False


def is_enabled() -> Optional[bool]:
    global _TRI_MHA_ENABLED
    return _TRI_MHA_ENABLED


def enable() -> None:
    global _TRI_MHA_ENABLED
    _TRI_MHA_ENABLED = True


def disable() -> None:
    global _TRI_MHA_ENABLED
    _TRI_MHA_ENABLED = False


# TODO: support q.shape [1, 1024, 8, 256, 8]
def CanSchTriMHA(in_shape, has_bias=True, inf=1e9, training=True):
    if has_bias == False:  # skip bias is None
        return False
    if inf != 1e9:  # skip inf != 1e9
        return False

    lst_3d = in_shape[-3:]
    skip_neg2_dim = in_shape[:3] + in_shape[-1:]
    if not training and (
        in_shape == [1, 538, 4, 538, 16]
        or in_shape == [1, 585, 4, 585, 16]
        or in_shape == [1, 538, 4, 538, 32]
        or in_shape == [1, 585, 4, 585, 32]
        or in_shape == [1, 128, 8, 585, 32]
        or in_shape == [1, 128, 8, 538, 32]
        or lst_3d == [8, 128, 32]
        or skip_neg2_dim == [1, 1024, 8, 8]
        or skip_neg2_dim == [1, 128, 4, 32]
        or skip_neg2_dim == [1, 128, 8, 32]
    ):  # eval
        return False  # skip eval
    if (
        in_shape == [1, 256, 4, 256, 16]
        or in_shape == [1, 128, 4, 256, 16]
        or in_shape == [1, 64, 4, 256, 16]
        or in_shape == [1, 32, 4, 256, 16]
    ):  # 7.26%
        return True
    elif (
        in_shape == [1, 128, 8, 256, 32]
        or in_shape == [1, 64, 8, 256, 32]
        or in_shape == [1, 32, 8, 256, 32]
        or in_shape == [1, 16, 8, 256, 32]
    ):  # 21.77%
        return True
    elif (
        in_shape == [1, 256, 8, 128, 32]
        or in_shape == [1, 128, 8, 128, 32]
        or in_shape == [1, 64, 8, 128, 32]
        or in_shape == [1, 32, 8, 128, 32]
    ):  # 21.77% no bias
        return True
    elif (
        in_shape == [1, 256, 4, 256, 32]
        or in_shape == [1, 128, 4, 256, 32]
        or in_shape == [1, 64, 4, 256, 32]
        or in_shape == [1, 32, 4, 256, 32]
    ):  # 47.17%
        return True
    else:  # not support
        return False


# tune hyper params for each workload
def schedule_triton_mha(in_shape, fwd=True):
    # default
    ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 0]
    if in_shape == [256, 4, 256, 16]:
        ret = [64, 32, 2, 4] if fwd else [64, 64, 4, 0]
    elif in_shape == [128, 4, 256, 16]:
        ret = [64, 32, 2, 4] if fwd else [64, 64, 4, 0]
    elif in_shape == [64, 4, 256, 16]:
        ret = [64, 32, 2, 4] if fwd else [64, 64, 4, 0]
    elif in_shape == [32, 4, 256, 16]:
        ret = [64, 32, 2, 4] if fwd else [64, 64, 4, 0]
    # [*, 8, 256, 32]
    elif in_shape == [128, 8, 256, 32]:  # DAP1
        ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 1]
    elif in_shape == [64, 8, 256, 32]:  # DAP2
        ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 1]
    elif in_shape == [32, 8, 256, 32]:  # DAP4
        ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 1]
    elif in_shape == [16, 8, 256, 32]:  # DAP8
        ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 1]
    # [*, 8, 128, 32]
    elif in_shape == [256, 8, 128, 32]:  # DAP1
        ret = [64, 64, 4, 3] if fwd else [128, 64, 4, 1]
    elif in_shape == [128, 8, 128, 32]:  # DAP2
        ret = [128, 64, 4, 2] if fwd else [64, 64, 2, 0]
    elif in_shape == [64, 8, 128, 32]:  # DAP4
        ret = [128, 64, 4, 2] if fwd else [64, 64, 2, 0]
    elif in_shape == [32, 8, 128, 32]:  # DAP8
        ret = [128, 64, 4, 2] if fwd else [64, 64, 2, 0]
    # [*, 4, 256, 32]
    elif in_shape == [256, 4, 256, 32]:  # DAP1
        ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 0]
    elif in_shape == [128, 4, 256, 32]:  # DAP2
        ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 1]
    elif in_shape == [64, 4, 256, 32]:  # DAP4
        ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 1]
    elif in_shape == [32, 4, 256, 32]:  # DAP8
        ret = [64, 32, 2, 3] if fwd else [128, 64, 8, 0]
    return ret[0], ret[1], ret[2], ret[3]


class FusedAttenionCoreFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, q, k, v, mask=None, bias=None, inf=1000000000.0, is_training=True):
        q_ori_size = len(q.size())
        if q_ori_size == 5:
            q = rearrange(q, "1 b2 h n d -> (1 b2) h n d")
            k = rearrange(k, "1 b2 h n d -> (1 b2) h n d")
            v = rearrange(v, "1 b2 h n d -> (1 b2) h n d")
        if bias is not None:
            if len(bias.size()) == 5:
                bias = rearrange(bias, "1 b2 h n d -> (1 b2) h n d")

        if mask is not None and len(mask.size()) == 5:
            mask = rearrange(mask, "1 b 1 1 e -> b 1 1 e")

        batch = 1
        sm_scale = 1.0 / math.sqrt(q.size(-1))
        # q *= sm_scale
        # shape constraints
        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
        assert Lq == Lk and Lk == Lv

        if not is_training:
            Lk = max(triton.next_power_of_2(Lk), 16)

        assert Lk in {16, 32, 64, 128}

        o = torch.empty_like(q)

        Z, H, N_CTX, H_DIM = q.shape
        grid = lambda META: (triton.cdiv(N_CTX, META["BLOCK_M"]), Z * H)
        l = torch.empty(
            (q.shape[-4], q.shape[-3], q.shape[-2]),
            device=q.device,
            dtype=torch.float32,
        )
        m = torch.empty(
            (q.shape[-4], q.shape[-3], q.shape[-2]),
            device=q.device,
            dtype=torch.float32,
        )
        # BLOCK_M, BLOCK_N, num_warps, num_stages  = 64, 64, 2, 3
        BLOCK_M, BLOCK_N, num_warps, num_stages = schedule_triton_mha(list(q.shape), fwd=True)
        if bias != None:
            bias = bias.expand(Z, H, N_CTX, N_CTX)
        bias_strides = (
            (bias.stride(0), bias.stride(1), bias.stride(2), bias.stride(3))
            if bias is not None
            else (0, 0, 0, 0)
        )
        if mask != None:
            mask = mask.expand(-1, q.shape[1], q.shape[2], -1)
        mask_strides = (
            (mask.stride(0), mask.stride(1), mask.stride(2), mask.stride(3))
            if mask is not None
            else (0, 0, 0, 0)
        )

        _attention_core[grid](
            q,
            k,
            v,
            mask,
            bias,
            sm_scale,
            l,
            m,
            o,
            q.stride(0),
            q.stride(1),
            q.stride(2),
            q.stride(3),
            k.stride(0),
            k.stride(1),
            k.stride(2),
            k.stride(3),
            v.stride(0),
            v.stride(1),
            v.stride(2),
            v.stride(3),
            o.stride(0),
            o.stride(1),
            o.stride(2),
            o.stride(3),
            *bias_strides,
            *mask_strides,
            q.shape[0],
            q.shape[1],
            q.shape[2],
            q.shape[3],
            batch,  # 256 8 128 1
            inf=inf,
            IS_TRAINING=is_training,
            BLOCK_M=BLOCK_M,
            BLOCK_N=BLOCK_N,
            BLOCK_DMODEL=Lk,
            use_mask=(mask != None),
            use_bias=(bias != None),
            num_warps=num_warps,
            num_stages=num_stages,
        )
        o = o.contiguous()
        # print(h.asm["ttgir"])
        if is_training:
            ctx.save_for_backward(q, k, v, o, m, l, bias)
            ctx.grid = grid
            ctx.sm_scale = sm_scale
            ctx.BLOCK_DMODEL = Lk
            ctx.mask = mask
            ctx.inf = inf
        if q_ori_size == 5:
            o = rearrange(o, "a b c d -> 1 a b c d")
        return o

    @staticmethod
    def backward(ctx, do):
        q, k, v, o, m, l, bias = ctx.saved_tensors
        ori_do_size = len(do.size())
        if ori_do_size == 5:
            do = rearrange(do, "1 a b c d -> a b c d")
        do = do.contiguous()
        dq = torch.zeros_like(q, dtype=torch.float32)
        dk = torch.empty_like(k)
        dv = torch.empty_like(v)
        # bias.dtype
        Z, H, N_CTX, H_DIM = q.shape[-4], q.shape[-3], q.shape[-2], q.shape[-1]
        dp = torch.zeros((Z, H, N_CTX, N_CTX), dtype=torch.float32, device="cuda")

        do_scaled = torch.empty_like(do)
        delta = torch.empty_like(l)
        mask = ctx.mask
        inf = ctx.inf

        BLOCK = 128
        BLOCK_HEADDIM = max(triton.next_power_of_2(H_DIM), 16)
        grid = (triton.cdiv(N_CTX, BLOCK) * Z * H, 1)
        _bwd_preprocess[grid](
            o,
            do,
            l,
            do_scaled,
            delta,
            o.stride(0),
            o.stride(1),
            o.stride(2),
            o.stride(3),
            do.stride(0),
            do.stride(1),
            do.stride(2),
            do.stride(3),
            BLOCK_M=BLOCK,
            D_HEAD=BLOCK_HEADDIM,
        )

        if bias is not None:
            assert bias.dtype in [q.dtype, torch.float]
            assert bias.is_cuda
            assert bias.dim() == 4
            assert bias.stride(-1) == 1
            bias = bias.expand(Z, H, N_CTX, N_CTX)

        # if mask is not None:
        #    mask = mask.expand(Z, H, N_CTX, N_CTX)

        bias_strides = (
            (bias.stride(0), bias.stride(1), bias.stride(2), bias.stride(3))
            if bias is not None
            else (0, 0, 0, 0)
        )
        mask_strides = (
            (mask.stride(0), mask.stride(1), mask.stride(2), mask.stride(3))
            if mask is not None
            else (0, 0, 0, 0)
        )

        # BLOCK_M, BLOCK_N = 128, 64
        BLOCK_M, BLOCK_N, num_warps, num_stages = schedule_triton_mha(list(q.shape), fwd=False)
        # grid = lambda META: (triton.cdiv(N_CTX, META["BLOCK_N"]), Z * H)
        # grid = lambda META: (Z * H, triton.cdiv(N_CTX, META["BLOCK_N"]))
        # grid = lambda META: (triton.cdiv(N_CTX, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1,
        #            Z * H)
        grid = lambda META: (Z * H,)
        _bwd_kernel[grid](
            q,
            k,
            v,
            mask,
            bias,
            ctx.sm_scale,
            o,
            do_scaled,
            dq,
            dk,
            dv,
            dp,
            l,
            m,
            delta,
            q.stride(0),
            q.stride(1),
            q.stride(2),
            q.stride(3),
            k.stride(0),
            k.stride(1),
            k.stride(2),
            k.stride(3),
            v.stride(0),
            v.stride(1),
            v.stride(2),
            v.stride(3),
            *mask_strides,
            *bias_strides,
            dp.stride(0),
            dp.stride(1),
            dp.stride(2),
            dp.stride(3),
            do.stride(0),
            do.stride(1),
            do.stride(2),
            do.stride(3),
            dq.stride(0),
            dq.stride(1),
            dq.stride(2),
            dq.stride(3),
            dk.stride(0),
            dk.stride(1),
            dk.stride(2),
            dk.stride(3),
            dv.stride(0),
            dv.stride(1),
            dv.stride(2),
            dv.stride(3),
            q.shape[0],
            q.shape[1],
            q.shape[2],
            q.shape[3],
            # ctx.grid[0], # to delete
            inf=inf,
            BLOCK_M=BLOCK_M,
            BLOCK_N=BLOCK_N,
            BLOCK_DMODEL=ctx.BLOCK_DMODEL,
            use_mask=(mask != None),
            use_bias=(bias != None),
            num_warps=num_warps,
            num_stages=num_stages,
            SEQUENCE_PARALLEL=False,
        )
        dB = None
        if bias is not None:
            dB = torch.sum(dp, dim=-4, keepdim=True)
            if len(bias.size()) == 4:
                dB = rearrange(dB, "b2 h n d -> 1 b2 h n d")
        # print(h.asm["ttgir"])

        if ori_do_size == 5:
            dq = rearrange(dq, "b2 h n d -> 1 b2 h n d")
            dk = rearrange(dk, "b2 h n d -> 1 b2 h n d")
            dv = rearrange(dv, "b2 h n d -> 1 b2 h n d")

        return dq, dk, dv, None, dB, None, None


AttnTri = FusedAttenionCoreFunc.apply


def _attention_bias(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.Tensor,
    bias: Optional[torch.Tensor],
    inf: float,
) -> torch.Tensor:
    # query:  [*, num_heads, Q, c_hidden]
    # key:    [*, num_heads, K, c_hidden]
    # value:  [*, num_heads, V, c_hidden]
    # mask:   Logit mask tensor broadcastable to [*, num_heads, Q, K]
    # bias:   Optional logit bias tensor broadcastable to [*, num_heads, Q, K]
    # inf:    Safe infinity value.
    # assuming K == V

    key = torch.swapdims(key, -2, -1)
    # key: [*, num_heads, c_hidden, K]

    scaling = 1.0 / math.sqrt(query.size(-1))
    a = torch.matmul(query * scaling, key)
    # a: [*, num_heads, Q, K]

    a += (mask - 1.0) * inf
    # a: [*, num_heads, Q, K]

    a += bias
    # a: [*, num_heads, Q, K]

    a = torch.softmax(a, dim=-1)
    # a: [*, num_heads, Q, K]

    a = torch.matmul(a, value)
    # a: [*, num_heads, Q, c_hidden]

    return a


def _attention_no_bias(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.Tensor,
    inf: float,
) -> torch.Tensor:
    # query:  [*, num_heads, Q, c_hidden]
    # key:    [*, num_heads, K, c_hidden]
    # value:  [*, num_heads, V, c_hidden]
    # mask:   Logit mask tensor broadcastable to [*, num_heads, Q, K]
    # bias:   Optional logit bias tensor broadcastable to [*, num_heads, Q, K]
    # inf:    Safe infinity value.
    # assuming K == V

    key = torch.swapdims(key, -2, -1)
    # key: [*, num_heads, c_hidden, K]

    scaling = 1.0 / math.sqrt(query.size(-1))
    a = torch.matmul(query * scaling, key)
    # a: [*, num_heads, Q, K]

    a += (mask - 1.0) * inf
    # a: [*, num_heads, Q, K]

    a = torch.softmax(a, dim=-1)
    # a: [*, num_heads, Q, K]

    a = torch.matmul(a, value)
    # a: [*, num_heads, Q, c_hidden]

    return a


AttnBiasJIT = torch.compile(_attention_bias)
AttnNoBiasJIT = torch.compile(_attention_no_bias)


================================================
FILE: apex/contrib/optimizers/__init__.py
================================================
from .fp16_optimizer import FP16_Optimizer
from .fused_adam import FusedAdam
from .fused_lamb import FusedLAMB


================================================
FILE: apex/contrib/optimizers/distributed_fused_adam.py
================================================
import collections
import contextlib
from dataclasses import dataclass
import enum
import inspect
import io
import itertools
import threading
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Set,
    Tuple,
    Union,
)
import warnings

import torch
from torch.distributed.distributed_c10d import _get_default_group

try:
    import apex.contrib.nccl_allocator as nccl_allocator
except ImportError:
    nccl_allocator = None

from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
import distributed_adam_cuda

# Fallback to private functions if using PyTorch <1.13.0
try:
    from torch.distributed.distributed_c10d import get_global_rank
except ImportError:
    from torch.distributed.distributed_c10d import _get_global_rank

    get_global_rank = _get_global_rank
try:
    from torch.distributed.distributed_c10d import reduce_scatter_tensor
except ImportError:
    from torch.distributed.distributed_c10d import _reduce_scatter_base

    reduce_scatter_tensor = _reduce_scatter_base
try:
    from torch.distributed.distributed_c10d import all_gather_into_tensor
except ImportError:
    from torch.distributed.distributed_c10d import _all_gather_base

    all_gather_into_tensor = _all_gather_base

# Import context manager to coalesce NCCL calls
# Note: Replace these backward compatibility shims once PyTorch
# exposes a stable public API for coalescing communication.
from torch.distributed.distributed_c10d import _coalescing_manager

if "device" not in inspect.signature(_coalescing_manager).parameters:
    # PyTorch <=1.13.1 does not have device arg
    _coalescing_manager_no_device_arg = _coalescing_manager

    @contextlib.contextmanager
    def _coalescing_manager(group, device, reqs):
        with _coalescing_manager_no_device_arg(group, reqs):
            yield


if "reqs" in inspect.signature(_coalescing_manager).parameters:
    # PyTorch <=2.0.1 handles synchronization externally to coalescing
    # manager
    _coalescing_manager_with_reqs_arg = _coalescing_manager

    class _CoalescingManager:
        def __init__(self):
            self.works: List[torch.distributed.Work] = []

        def append(self, work: torch.distributed.Work) -> None:
            if work:
                self.works.append(work)

        def wait(self) -> None:
            for work in self.works:
                work.wait()

    @contextlib.contextmanager
    def _coalescing_manager(
        group: Optional[torch.distributed.ProcessGroup] = None,
        device: Optional[torch.device] = None,
        async_ops: bool = False,
    ) -> contextlib.AbstractContextManager:
        assert device is not None
        cm = _CoalescingManager()
        with _coalescing_manager_with_reqs_arg(
            group,
            device,
            cm.works,
        ):
            yield cm
        if not async_ops:
            cm.wait()

    def _coalescing_manager_append_work(
        cm: _CoalescingManager,
        work: torch.distributed.Work,
    ) -> None:
        """Add asynchronous request to coalescing manager"""
        cm.append(work)

else:
    # PyTorch >2.0.1 handles synchronization within coalescing
    # manager
    def _coalescing_manager_append_work(
        cm: torch.distributed._CoalescingManager,
        work: torch.distributed.Work,
    ) -> None:
        """Dummy function for backward compatibility

        Coalescing manager already keeps track of asynchronous
        communication.

        """
        pass


# Import optional CUDA kernels
_FOUND_DEPRECATED_FUSED_ADAM: bool = False
try:
    import fused_adam_cuda

    _FOUND_DEPRECATED_FUSED_ADAM = True
except ImportError:
    warnings.warn(
        "Could not find recommended CUDA kernels when importing "
        "`DistributedFusedAdam`. "
        "For best performance, Apex should be installed with "
        "`--deprecated_fused_adam`."
    )


def _round_to_multiple(
    number: int,
    multiple: int,
    round_up: bool = True,
) -> int:
    """Assumes arguments are positive integers"""
    return (number + multiple - 1 if round_up else number) // multiple * multiple


def _devices_match(device1: torch.device, device2: torch.device) -> bool:
    """Whether two PyTorch devices are equivalent"""
    device1 = torch.device(device1)
    device2 = torch.device(device2)
    if device1.type != device2.type:
        return False
    if device1.type == "cuda":
        index1 = device1.index
        index2 = device2.index
        if index1 is None:
            index1 = torch.cuda.current_device()
        if index2 is None:
            index2 = torch.cuda.current_device()
        if index1 != index2:
            return False
    return True


def _multi_tensor_copy(
    buffers_in: List[torch.Tensor],
    buffers_out: List[torch.Tensor],
    dummy_overflow_buf: Optional[torch.Tensor] = None,
) -> None:
    """Copy between corresponding buffers

    Uses fused copy kernel if possible.
    """

    # Group buffers by device and dtype
    buffer_groups = collections.defaultdict(list)
    for buf_in, buf_out in zip(buffers_in, buffers_out):
        if buf_in.data_ptr() == buf_out.data_ptr() or buf_in.numel() == 0:
            # Nothing to be done if input and output buffers are same
            # or have no entries
            continue
        if buf_in.dtype == buf_out.dtype:
            # Just copy bytes if dtypes are same
            buf_in = buf_in.view(torch.uint8)
            buf_out = buf_out.view(torch.uint8)
        is_cuda = _devices_match(buf_in.device, "cuda") and _devices_match(buf_out.device, "cuda")
        is_contiguous = buf_in.is_contiguous() and buf_out.is_contiguous()
        key = (
            buf_in.dtype,
            buf_out.dtype,
            is_cuda,
            is_contiguous,
        )
        buffer_groups[key].append((buf_in, buf_out))

    # Copy each group of buffers
    for key, buffers in buffer_groups.items():
        # Check if buffers support fused kernel
        dtype_in, dtype_out, is_cuda, is_contiguous = key
        supported_dtypes = (torch.float32, torch.float16)
        use_fused_kernel = (dtype_in in supported_dtypes and dtype_out in supported_dtypes) or (
            dtype_in == torch.uint8 and dtype_out == torch.uint8
        )
        use_fused_kernel = use_fused_kernel and is_cuda and is_contiguous

        # Copy buffers
        if use_fused_kernel and _FOUND_DEPRECATED_FUSED_ADAM:
            if dummy_overflow_buf is None:
                dummy_overflow_buf = torch.zeros([1], dtype=torch.int32, device="cuda")
            multi_tensor_applier(
                fused_adam_cuda.maybe_cast_mt,
                dummy_overflow_buf,
                list(zip(*buffers)),
            )
        else:
            # Warning: dummy_overflow_buf was not set in such case
            for buf_in, buf_out in buffers:
                buf_out.copy_(buf_in)


@contextlib.contextmanager
def _disable_pre_forward_hook(
    param: torch.nn.Parameter,
) -> contextlib.AbstractContextManager:
    """Prevent parameter from calling pre-forward hook"""
    hook_is_enabled = getattr(
        param,
        "_pre_forward_hook_is_enabled",
        False,
    )
    if hook_is_enabled:
        param._pre_forward_hook_is_enabled = False
    try:
        yield
    finally:
        if hook_is_enabled:
            param._pre_forward_hook_is_enabled = True


@torch.no_grad()
def _bf16_rem_to_fp32(
    bf16: torch.Tensor,
    rem: torch.Tensor,
    fp32: torch.Tensor,
) -> None:
    """Pack BF16 tensor and 16-bit remainders into FP32 tensor"""

    # Check inputs
    assert bf16.size() == rem.size() == fp32.size(), (
        "Tensor dimensions do not match: "
        f"bf16={list(bf16.size())}, "
        f"rem={list(rem.size())}, "
        f"fp32={list(fp32.size())}, "
    )
    assert bf16.dtype is torch.bfloat16, f"bf16 buffer has invalid dtype ({bf16.dtype})"
    assert rem.dtype is torch.int16, f"rem buffer has invalid dtype ({rem.dtype})"
    assert fp32.dtype is torch.float32, f"fp32 buffer has invalid dtype ({fp32.dtype})"

    # Undo bf16 rounding
    bf16 = bf16.view(torch.int16) - torch.where(rem < 0, 1, 0)

    # Pack bf16 and remainder into little-endian fp32
    fp32 = fp32.unsqueeze(-1).view(torch.int16)
    fp32 = torch.stack((rem, bf16), dim=-1, out=fp32)


class DistributedFusedAdam(torch.optim.Optimizer):
    """Adam optimizer with ZeRO algorithm.

    Currently GPU-only. Requires Apex to be installed via
    ``python setup.py install --cuda_ext --cpp_ext --distributed_adam --deprecated_fused_adam``.

    This implements the ZeRO-2 algorithm, which distributes the
    optimizer state and gradients between parallel processes. In
    particular, the parameters are flattened, grouped into fixed-size
    buckets, and the optimizer state for each bucket is sharded over
    the parallel processes. Options are provided to overlap the
    gradient synchronization with the backward pass compute.

    Adam was proposed in `Adam: A Method for Stochastic
    Optimization`_, AdamW in `Decoupled Weight Decay Regularization`_,
    and ZeRO in `ZeRO: Memory Optimizations Toward Training Trillion
    Parameter Models`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts
            defining parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        bias_correction (bool, optional): apply correction factor to
            moment estimates. (default: True)
        betas (Tuple[float, float], optional): coefficients used for
            computing running averages of gradient and its square.
            (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        adam_w_mode (boolean, optional): Decouple weight decay
            regularization (also known as AdamW algorithm) (default:
            True)
        weight_decay (float, optional): weight decay (L2 penalty)
            (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad
            variant of this algorithm from the paper
            `On the Convergence of Adam and Beyond`_ (default: False).
            This is not yet supported.
        dtype (torch.dtype, optional): datatype for optimizer state
            (default: torch.float32)
        grad_sync_dtype (torch.dtype, optional): datatype for gradient
            synchronization (default: same as dtype)
        param_sync_dtype (torch.dtype, optional): datatype for
            parameter synchronization (default: same as dtype)
        device (torch.device, optional): device for optimizer state
            (default: cuda). Currently only supports GPU with one GPU
            per process.
        process_group (torch.distributed.ProcessGroup, optional):
            parallel processes participating in optimizer (default:
            default group in torch.distributed). This group is
            interpreted as a 2D grid with dimensions
            distributed_size x redundant_size.
        distributed_process_group (torch.distributed.ProcessGroup,
            optional): parallel processes to distribute optimizer
            state over (default: same as process_group)
        redundant_process_group (torch.distributed.ProcessGroup,
            optional): parallel processes to replicate optimizer state
            over (default: group only containing calling process)
        average_grad_sync (bool, optional): whether to use average
            reduction for gradient synchronization rather than sum
            (default: True)
        overlap_grad_sync (boolean, optional): whether to overlap
            gradient synchronization with backward pass compute
            (default: True)
        overlap_param_sync (boolean, optional): whether to overlap
            parameter synchronization with forward pass compute
            (default: False). This is an experimental feature.
        bucket_cap_mb (float, optional): bucket size in megabytes
            (default: 100)
        pipeline_size (int, optional): number of buckets to process
            simultaneously in optimizer step (default: 2)
        contiguous_param_buffer (bool, optional): convert parameters
            into views into large persistent buffers (default: False).
            This enables some performance optimizations (e.g. avoiding
            some memory copies), but may add memory overhead (e.g. if
            the memory allocator can't reuse the original parameter
            buffers).
        contiguous_grad_buffer (bool, optional): allocate gradient
            buckets out of a large persistent buffers (default:
            False). This allows individual parameter gradients to be
            accessed externally (see grad_buffer_view function). It
            enables some performance optimizations (e.g. avoiding some
            memory copies), but prevents some memory optimizations
            (e.g. the memory allocator can't reuse buffers for
            gradient buckets).
        store_params (bool, optional): store a distributed copy of the
            parameters as optimizer state (default: True). This may be
            desirable if the optimizer dtype has higher precision than
            the parameter dtype.
        store_param_remainders (bool, optional): if model is BF16 and
            optimizer is FP32, store bits required to reconstruct FP32
            params (default: False). This is an experimental feature.
        with_scaled_states (bool, optional): apply per-tensor scaling
            factors to the optimizer state (default: False). As
            discussed in `FP8-LM: Training FP8 Large Language
            Models`_, this helps maintain a reasonable dynamic range
            even when the state is in a low-precision datatype like
            FP16.
        nccl_ub (bool, optional): enable NCCL user buffers for zero-copy
            (default: False). It allows the collectives to use only 1 SM
            when IB SHARP is enabled in a one-rank-per-node communication
            group. This will help speedup the gemms overlapped with data-
            parallel communications.
        capturable (bool, optional): whether to use the version of the
            optimizer that can be used with CUDA Graphs. (default: False).

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    .. _Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
    .. _ZeRO\: Memory Optimizations Toward Training Trillion Parameter Models:
        https://arxiv.org/abs/1910.02054
    .. _FP8-LM\: Training FP8 Large Language Models:
        https://arxiv.org/pdf/2310.18313v2.pdf

    """

    @dataclass
    class ParameterFragment:
        """Buffer ranges for a parameter fragment

        Describes corresponding regions in parameter buffer and
        parameter bucket.

        """

        # Parameter group index
        param_group_id: int
        # Parameter index within parameter group
        param_id: int
        # Bucket index
        bucket_id: int
        # Range within flattened parameter buffer
        param_range: Tuple[int, int]
        # Range within bucket
        bucket_range: Tuple[int, int]
        # Whether fragment is in local shard of bucket
        in_local_shard: bool
        # Range within local shard
        shard_range: Optional[Tuple[int, int]]
        # Range of local fragment shard within bucket
        shard_bucket_range: Optional[Tuple[int, int]]
        # Range of local fragment shard within parameter
        shard_param_range: Optional[Tuple[int, int]]

    class StateBucket:
        """Optimizer state for a bucket"""

        def __init__(
            self,
            bucket_size: int,
            shard_size: int,
            dtype: torch.dtype,
            device: torch.device,
            grad_sync_dtype: torch.dtype,
            param_sync_dtype: torch.dtype,
            contiguous_buffer_offset: int = 0,
            store_params: bool = False,
            store_param_remainders: bool = False,
        ):
            # Size of parameter bucket
            self.bucket_size: int = bucket_size
            # Size of local shard of parameter bucket
            self.shard_size: int = shard_size
            # Data type for state
            self.dtype = dtype
            # Data type for gradient synchronization
            self.grad_sync_dtype = grad_sync_dtype
            # Data type for parameter synchronization
            self.param_sync_dtype = param_sync_dtype
            # Size of the filled region in the bucket
            self.filled_size: int = 0
            # Is it able to continue filling
            self.able_to_fill: bool = True
            # Offset to bucket in contiguous buffers
            self.contiguous_buffer_offset: int = contiguous_buffer_offset
            # Buffer ranges corresponding to parameter fragments
            self.fragments: List[ParameterFragment] = []
            # Local shard of parameters
            self.params_shard: Optional[torch.Tensor] = None
            if store_params:
                self.params_shard = torch.zeros(
                    [shard_size],
                    dtype=self.dtype,
                    device=device,
                )
            # Local shard of parameter remainders
            self.param_remainders_shard: Optional[torch.Tensor] = None
            if store_param_remainders:
                self.param_remainders_shard = torch.zeros(
                    [shard_size],
                    dtype=torch.int16,
                    device=device,
                )
            # Local shard of first moment estimate
            self.exp_avg_shard: torch.Tensor = torch.zeros(
                [shard_size],
                dtype=self.dtype,
                device=device,
            )
            # Local shard of second moment estimate
            self.exp_avg_sq_shard: torch.Tensor = torch.zeros(
                [shard_size],
                dtype=self.dtype,
                device=device,
            )

        def dtypes(self) -> Tuple[torch.dtype, torch.dtype, torch.dtype]:
            """Datatypes for the bucket's compute and communication"""
            return (
                self.dtype,
                self.grad_sync_dtype,
                self.param_sync_dtype,
            )

    class GradientStatus(enum.Enum):
        """Status of gradients within a bucket"""

        # Gradients are ready to use
        READY = enum.auto()
        # Bucket is partially filled with unreduced gradients
        PARTIALLY_FILLED = enum.auto()
        # Bucket is fully filled with unreduced gradients
        FULLY_FILLED = enum.auto()
        # Asynchronous reduction is in progress
        SYNCING = enum.auto()

    class GradientBucket:
        """Gradient buffers and state for a bucket"""

        def __init__(self):
            # Local shard of gradients
            self.grads_shard: Optional[torch.Tensor] = None
            # Local contribution to gradients
            self.grads_bucket: Optional[torch.Tensor] = None
            # Buffer for gradient reduce-scatter
            self.sync_grads_shard: Optional[torch.Tensor] = None
            # Status of gradients
            self.status: GradientStatus = DistributedFusedAdam.GradientStatus.READY
            # Params that have generated grads
            self.grads_generated: Set[torch.nn.Parameter] = set()

    class ParameterStatus(enum.Enum):
        """Status of parameters within a bucket"""

        # Parameters are sharded between processes
        SHARDED = enum.auto()
        # Asynchronous communication is in progress
        SYNCING = enum.auto()
        # Parameters are ready to use
        READY = enum.auto()

    class ParameterBucket:
        """Parameter buffers and state for a bucket"""

        def __init__(self):
            # Local shard of parameters
            self.params_shard: Optional[torch.Tensor] = None
            # Gathered parameter values
            self.params_bucket: Optional[torch.Tensor] = None
            # Status of parameters
            self.status: ParameterStatus = DistributedFusedAdam.ParameterStatus.SHARDED
            # Params that have been updated
            self.params_updated: Set[torch.nn.Parameter] = set()

    # Enable custom logic for AMP grad scaling
    _step_supports_amp_scaling: bool = True
    _custom_amp_unscale_grads: bool = True

    def __init__(
        self,
        params: Union[Iterable[torch.nn.Parameter], Iterable[dict]],
        lr: float = 1e-3,
        bias_correction: bool = True,
        betas: Tuple[float, float] = (0.9, 0.999),
        eps: float = 1e-8,
        adam_w_mode: bool = True,
        weight_decay: float = 0.0,
        amsgrad: bool = False,
        dtype: torch.dtype = torch.float32,
        grad_sync_dtype: Optional[torch.dtype] = None,
        param_sync_dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = "cuda",
        process_group: Optional[torch.distributed.ProcessGroup] = None,
        distributed_process_group: Optional[torch.distributed.ProcessGroup] = None,
        redundant_process_group: Optional[torch.distributed.ProcessGroup] = None,
        average_grad_sync: bool = True,
        overlap_grad_sync: bool = True,
        overlap_param_sync: bool = False,
        bucket_cap_mb: float = 100.0,
        pipeline_size: int = 2,
        contiguous_param_buffer: bool = False,
        contiguous_grad_buffer: bool = False,
        store_params: bool = True,
        store_param_remainders: bool = False,
        with_scaled_states: bool = False,
        nccl_ub: bool = False,
        capturable: bool = False,
    ):
        if (with_scaled_states or store_param_remainders) and capturable:
            raise Exception(
                f"{self.__class__.__name__} with scaled states "
                "or storing param remainders doesn't support CUDA graph yet."
            )

        if capturable and not _FOUND_DEPRECATED_FUSED_ADAM:
            raise Exception(
                f"Capturable {self.__class__.__name__} relies on "
                "multi_tensor_copy to set dummy_overflow_buf to indicate "
                "whether there's gradient Inf/NaN, build APEX with "
                "`--deprecated_fused_adam` is essential."
            )

        # If capturable for CUDA graph
        self.capturable: bool = capturable
        # If the optimizer is capturable then LR should be a tensor (on GPU)
        if capturable:
            lr = torch.tensor(lr, dtype=torch.float32, device=device)

        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
        )
        super().__init__(params, defaults)

        # Adam options
        self.adam_w_mode: bool = adam_w_mode
        self.amsgrad: bool = amsgrad
        if amsgrad:
            raise RuntimeError("DistributedFusedAdam does not support the AMSGrad variant.")

        # Datatype options
        if grad_sync_dtype is None:
            grad_sync_dtype = dtype
        if param_sync_dtype is None:
            param_sync_dtype = dtype
        supported_dtypes = (torch.float32, torch.float16, torch.bfloat16)
        if dtype not in supported_dtypes or grad_sync_dtype not in supported_dtypes:
            raise ValueError(
                "Unsupported dtypes for DistributedFusedAdam "
                f"(dtype={dtype}, "
                f"grad_sync_dtype={grad_sync_dtype}, "
                f"param_sync_dtype={param_sync_dtype}))"
            )
        self.dtype: torch.dtype = dtype
        self.grad_sync_dtype: torch.dtype = grad_sync_dtype
        self.param_sync_dtype: torch.dtype = param_sync_dtype

        # Device options
        if not _devices_match(device, "cuda"):
            raise RuntimeError(f"Invalid device for DistributedFusedAdam (device={device})")
        self.device: torch.device = torch.device("cuda", torch.cuda.current_device())

        # Process groups
        self.process_group: torch.distributed.ProcessGroup = (
            _get_default_group() if process_group is None else process_group
        )
        self.distributed_process_group: torch.distributed.ProcessGroup = (
            self.process_group if distributed_process_group is None else distributed_process_group
        )
        self.redundant_process_group: Optional[torch.distributed.ProcessGroup] = (
            redundant_process_group
        )
        self.process_group_size: int = torch.distributed.get_world_size(self.process_group)
        self.distributed_rank: int = torch.distributed.get_rank(self.distributed_process_group)
        self.distributed_size: int = torch.distributed.get_world_size(
            self.distributed_process_group
        )
        self.redundant_size: int = (
            1
            if self.redundant_process_group is None
            else torch.distributed.get_world_size(self.redundant_process_group)
        )
        if self.process_group_size != self.distributed_size * self.redundant_size:
            raise RuntimeError(
                "Invalid process group configuration "
                f"(process group size = {self.process_group_size}, "
                f"distributed process group size = {self.distributed_size}, "
                f"redundant process group size = {self.redundant_size})"
            )
        self.process_group_root: int = get_global_rank(self.process_group, 0)

        # Use average reduction for grad sync
        self.average_grad_sync: bool = average_grad_sync
        # Copy param grads to bucket as soon as available
        self.greedy_grad_copy: bool = True
        # Synchronize grad buckets as soon as their grads are available
        self.overlap_grad_sync: bool = overlap_grad_sync
        # Try synchronizing param buckets just before param is needed
        self.overlap_param_sync: bool = overlap_param_sync
        # Number of buckets to synchronize at a time
        self.pipeline_size: int = pipeline_size

        # Store params or param remainders
        if store_param_remainders:
            if store_params:
                raise RuntimeError(
                    "Attempted to construct DistributedFusedAdam "
                    "with store_params=True and store_param_remainders=True"
                )
            if self.dtype != torch.float32 or self.param_sync_dtype != torch.bfloat16:
                raise RuntimeError(
                    "DistributedFusedAdam requires "
                    "BF16 params and FP32 optimizer state "
                    "when storing parameter remainders "
                    f"(dtype={self.dtype}, "
                    f"param_sync_dtype={self.param_sync_dtype}))"
                )
        self.store_params: bool = store_params
        self.store_param_remainders: bool = store_param_remainders

        # Whether to scale optimizer state
        self.with_scaled_states: bool = with_scaled_states
        if self.with_scaled_states:
            if not self.store_params:
                raise RuntimeError(
                    "Attempted to construct DistributedFusedAdam "
                    "with with_scaled_state=True and store_params=False"
                )
            if self.store_param_remainders:
                raise RuntimeError(
                    "Attempted to construct DistributedFusedAdam "
                    "with with_scaled_state=True and store_params_remainders=True"
                )
            if self.dtype not in (torch.float16, torch.bfloat16):
                raise RuntimeError(
                    "Attempted to construct DistributedFusedAdam "
                    f"with with_scaled_state=True and dtype={self.dtype} "
                    "(only fp16 and bf16 are supported)"
                )
            if self.param_sync_dtype == torch.float32:
                # _local_step_with_scaled_states applies Adam kernel
                # to fp32 workspace buffer and relies on
                # _check_params_shard_dtypes to copy to param sync
                # workspace buffer. However,
                # _check_params_shard_dtypes does nothing if
                # param_sync_dtype is fp32.
                raise RuntimeError(
                    "Attempted to construct DistributedFusedAdam "
                    f"with with_scaled_state=True and param_sync_dtype={self.param_sync_dtype}"
                )
        # Scaling factors to apply to recover unscaled optimizer state
        self._state_scales: dict = {}

        # Determine bucket sizes
        dtype_size = torch.finfo(self.grad_sync_dtype).bits // 8
        self.alignment: int = 128 // dtype_size
        self.bucket_cap_mb: float = bucket_cap_mb
        bucket_size = 1024 * 1024 * bucket_cap_mb / dtype_size
        shard_size = int(bucket_size / self.distributed_size)
        shard_size = _round_to_multiple(shard_size, self.alignment, round_up=False)
        shard_size = max(shard_size, self.alignment)
        self.default_shard_size: int = shard_size

        # Optimizer state
        self.state["buckets"]: List[StateBucket] = []
        self.state["step"]: torch.Tensor | int = (
            torch.tensor([0], dtype=torch.int, device=self.device) if self.capturable else 0
        )

        # Gradient state
        self._grads_buckets: Dict[int, GradientBucket] = collections.defaultdict(
            self.GradientBucket
        )
        # Param state
        self._params_buckets: Dict[int, ParameterBucket] = collections.OrderedDict()

        # Whether to allocate contiguous buffers for parameters
        self.contiguous_param_buffer: bool = contiguous_param_buffer
        # Whether to allocate contiguous buffers for gradients
        self.contiguous_grad_buffer: bool = contiguous_grad_buffer
        # Whether to use NCCL User Buffer
        self.nccl_ub: bool = nccl_ub
        # Contiguous buffers for parameters
        self._param_buffers: Dict[Tuple[torch.dtype, torch.dtype, torch.dtype], torch.Tensor] = {}
        # Contiguous buffers for gradients
        self._grad_buffers: Dict[Tuple[torch.dtype, torch.dtype, torch.dtype], torch.Tensor] = {}
        # Output buffer for gradient shards, only required for NCCL user buffer
        if self.nccl_ub:
            if not nccl_allocator:
                raise RuntimeError("NCCL allocator importing failed but nccl ub is still requested")
            elif not self.contiguous_grad_buffer:
                raise RuntimeError("NCCL user buffers require contiguous grad buffers")
            else:
                self._shard_grad_buffers: Dict[
                    Tuple[torch.dtype, torch.dtype, torch.dtype], torch.Tensor
                ] = {}

        # Side streams for state dict communication
        self._pipeline_streams: List[torch.cuda.Stream] = [
            torch.cuda.Stream() for _ in range(self.pipeline_size)
        ]
        # Side streams for gradients and parameters communication
        self._comm_streams: List[torch.cuda.Stream] = [
            torch.cuda.Stream() for _ in range(self.pipeline_size)
        ]
        self._last_comm_stream_id: int = -1

        # Scale by factor before optimizer step. Used for grad
        # clipping and gradient scaler.
        self._grad_scale: torch.Tensor = torch.full(
            [], 1.0, dtype=torch.float32, device=self.device
        )
        # Norm of parameter gradients. Used for gradient clipping and
        # gradient scaler.
        self._grad_norm: Optional[torch.Tensor] = None

        # Dummy flag for multi-tensor kernels
        # Note: Apex multi-tensor kernels have a noop_flag argument
        # that is intended to detect non-finite values. It shouldn't
        # have any effect with the kernels used in the optimizer, but
        # we still set it to zero out of an abundance of caution.
        self._dummy_overflow_buf: torch.Tensor = torch.zeros(
            [1], dtype=torch.int32, device=self.device
        )

        # Check if collectives have no_copy option
        self._gather_no_copy: bool = (
            "no_copy" in inspect.getfullargspec(torch.distributed.gather).args
        )

        # Make sure parameter values are same across processes
        self._broadcast_params()

        # Lock for callbacks
        self._lock: threading.Lock = threading.Lock()
        # Attach hooks for gradient synchronization
        self._register_post_backward_hooks()
        # Attach hooks for param synchronization
        if self.overlap_param_sync:
            self._register_pre_forward_hooks()

        # Move LR to device
        if capturable:
            for idx, group in enumerate(self.param_groups):
                if len(group["params"]) == 0:
                    continue
                for item in ["lr"]:
                    if torch.is_tensor(group[item]):
                        self.param_groups[idx][item] = group[item].to(device=self.device)
                    else:
                        self.param_groups[idx][item] = torch.tensor(group[item], device=self.device)

        # For better representation string
        arg_names = inspect.getfullargspec(DistributedFusedAdam.__init__).args
        arg_names.remove("self")
        arg_names.remove("params")
        for i, group in enumerate(self.param_groups):
            for key in sorted(group.keys()):
                if key in arg_names:
                    arg_names.remove(key)
        self.args_dict = {name: getattr(self, name) for name in arg_names}

    def __repr__(self) -> str:
        # Based on: https://github.com/pytorch/pytorch/blob/v2.3.0-rc12/torch/optim/optimizer.py#L315
        format_string = self.__class__.__name__ + " ("
        for i, group in enumerate(self.param_groups):
            format_string += "\n"
            format_string += f"Parameter Group {i}\n"
            for key in sorted(group.keys()):
                if key != "params":
                    format_string += f"    {key}: {group[key]}\n"

        for key, val in self.args_dict.items():
            if "process_group" in key and val:
                format_string += f"{key}: {hex(id(val))}, world size {val.size()}\n"
            else:
                format_string += f"{key}: {val}\n"

        format_string += ")"
        return format_string

    @torch.no_grad()
    def _broadcast_params(self) -> None:
        """Broadcast parameter values from root rank"""
        process_group = self.process_group
        with _coalescing_manager(process_group, self.device, async_ops=True) as cm:
            for param_group in self.param_groups:
                for param in param_group["params"]:
                    _coalescing_manager_append_work(
                        cm,
                        torch.distributed.broadcast(
                            param,
                            src=self.process_group_root,
                            group=process_group,
                            async_op=True,
                        ),
                    )
        cm.wait()

    def _make_post_backward_hook(
        self,
        param: torch.nn.Parameter,
        param_group_id: int,
        param_id: int,
    ) -> Callable:
        """Create callback function to call after param generates grad

        Lazily initialize parameter and try launching grad sync.

        """

        def post_backward_hook(*unused) -> None:
            if getattr(param, "_pre_forward_hook_is_enabled", False):
                raise RuntimeError(
                    "A parameter called its post-backward hook "
                    "before its pre-forward hook. "
                    "Please manually interact with the parameter "
                    "before the forward pass (e.g. by calling data_ptr) "
                    "or run DistributedFusedAdam with overlap_param_sync=False."
                )
            with self._lock:
                need_to_initialize = "fragments" not in self.state[param]
                if need_to_initialize:
                    self._init_param_state(param, param_group_id, param_id)
                if self.greedy_grad_copy:
                    self._grad_copy(param)
                    if self.overlap_grad_sync:
                        self._try_start_bucket_grad_sync(
                            params=[param],
                            ignore_last_bucket=need_to_initialize,
                        )

        return post_backward_hook

    def _register_post_backward_hooks(self) -> None:
        """Attach hooks for gradient synchronization"""
        self._grad_accs = []
        for param_group_id, group in enumerate(self.param_groups):
            for param_id, param in enumerate(group["params"]):
                if param.requires_grad:
                    param_tmp = param.expand_as(param)
                    grad_acc = param_tmp.grad_fn.next_functions[0][0]
                    hook = self._make_post_backward_hook(
                        param,
                        param_group_id,
                        param_id,
                    )
                    grad_acc.register_hook(hook)
                    self._grad_accs.append(grad_acc)

    def _make_pre_forward_hook(
        self,
        param: torch.nn.Parameter,
        param_group_id: int,
        param_id: int,
    ) -> Callable:
        """Create callback function to call before param forward pass

        Make sure param has been synchronized and try launching next
        param sync.

        """

        def pre_forward_hook(*unused) -> None:
            with self._lock:
                if "fragments" not in self.state[param]:
                    return
                self._param_copy(param)
                if self.overlap_param_sync:
                    self._try_start_bucket_param_sync()

        return pre_forward_hook

    def _register_pre_forward_hooks(self) -> None:
        """Attach hooks for parameter synchronization

        If _pre_forward_hook_is_enabled is set in a parameter, then
        the callback will be called the first time any of its
        attributes are accessed. This is hackily done by
        monkey-patching the parameter class, so proceed with caution.

        """
        for param_group_id, group in enumerate(self.param_groups):
            for param_id, param in enumerate(group["params"]):
                # Monkey-patch parameter class
                cls = param.__class__
                if not getattr(cls, "_has_pre_forward_hook", False):
                    # Monkey-patch magic methods to call __getattribute__
                    special_funcs = [
                        "__abs__",
                        "__add__",
                        "__and__",
                        "__bool__",
                        "__complex__",
                        "__contains__",
                        "__deepcopy__",
                        "__delitem__",
                        "__div__",
                        "__eq__",
                        "__float__",
                        "__floordiv__",
                        "__ge__",
                        "__getitem__",
                        "__gt__",
                        "__iadd__",
                        "__iand__",
                        "__idiv__",
                        "__ifloordiv__",
                        "__ilshift__",
                        "__imod__",
                        "__imul__",
                        "__index__",
                        "__int__",
                        "__invert__",
                        "__ior__",
                        "__ipow__",
                        "__irshift__",
                        "__isub__",
                        "__iter__",
                        "__itruediv__",
                        "__ixor__",
                        "__le__",
                        "__len__",
                        "__long__",
                        "__lshift__",
                        "__lt__",
                        "__matmul__",
                        "__mod__",
                        "__mul__",
                        "__neg__",
                        "__nonzero__",
                        "__or__",
                        "__pos__",
                        "__pow__",
                        "__radd__",
                        "__rand__",
                        "__rdiv__",
                        "__reduce__",
                        "__reduce_ex__",
                        "__reversed__",
                        "__rfloordiv__",
                        "__rlshift__",
                        "__rmatmul__",
                        "__rmod__",
                        "__rmul__",
                        "__ror__",
                        "__rpow__",
                        "__rrshift__",
                        "__rshift__",
                        "__rsub__",
                        "__rtruediv__",
                        "__rxor__",
                        "__setitem__",
                        "__sizeof__",
                        "__sub__",
                        "__truediv__",
                        "__xor__",
                    ]
                    for func_name in special_funcs:

                        def make_augmented_func() -> Callable:
                            base_func_name = f"_base_{func_name}"

                            def augmented_func(self, *args, **kwargs):
                                return getattr(self, base_func_name)(*args, **kwargs)

                            return augmented_func

                        setattr(cls, f"_base_{func_name}", getattr(cls, func_name))
                        setattr(cls, func_name, make_augmented_func())

                    # Monkey-patch __getattribute__ to call pre-forward hook
                    def make_getattribute() -> Callable[[str], Any]:
                        special_attrs = {
                            "_pre_forward_hook_is_enabled",
                            "_pre_forward_hook",
                            "__del__",
                            "__delattr__",
                            "__dir__",
                            "__getattr__",
                            "__getattribute__",
                            "__hash__",
                            "__init__",
                            "__new__",
                            "__setattr__",
                        }

                        def getattribute_with_pre_forward_hook(self, name: str):
                            """Variant of __getattribute__ that can call pre-forward hook"""
                            if name not in special_attrs:
                                if getattr(self, "_pre_forward_hook_is_enabled", False):
                                    self._pre_forward_hook_is_enabled = False
                                    self._pre_forward_hook()
                            return object.__getattribute__(self, name)

                        return getattribute_with_pre_forward_hook

                    cls.__getattribute__ = make_getattribute()
                    cls._has_pre_forward_hook = True

                # Register pre-forward callback
                param._pre_forward_hook_is_enabled = False
                param._pre_forward_hook = self._make_pre_forward_hook(
                    param,
                    param_group_id,
                    param_id,
                )

    @torch.no_grad()
    def init_param_buffer(self) -> None:
        """Allocate contiguous buffers for param buckets

        This converts the parameters into views into contiguous
        buffers. This enables some performance optimizations (e.g.
        avoiding some memory copies), but may add memory overhead
        (e.g. if the memory allocator can't reuse the original
        parameter buffers). To minimize memory overhead, this buffer
        should be initialized before the first training step.

        """

        # Make sure all params are initialized
        self.contiguous_param_buffer = True
        self.init_params()

        # Construct param buffers
        buffer_sizes = collections.defaultdict(lambda: 0)
        for bucket in self.state["buckets"]:
            dtypes = bucket.dtypes()
            buffer_sizes[dtypes] = max(
                bucket.contiguous_buffer_offset + bucket.bucket_size,
                buffer_sizes[dtypes],
            )
        for dtypes, buffer_size in buffer_sizes.items():
            _, _, param_sync_dtype = dtypes
            self._param_buffers[dtypes] = torch.zeros(
                [buffer_size],
                dtype=param_sync_dtype,
                device=self.device,
            )

        # Figure out corresponding positions in params and param buffer
        params = list(self.parameters())
        param_flat_views = []
        param_buffer_views = []
        for i, param in enumerate(params):
            fragment = self.state[param]["fragments"][0]
            bucket_id = fragment.bucket_id
            bucket = self.state["buckets"][bucket_id]
            param_size = param.numel()
            bucket_start, _ = fragment.bucket_range
            buffer_offset = bucket.contiguous_buffer_offset
            buffer_start = buffer_offset + bucket_start
            buffer_end = buffer_start + param_size
            param_buffer = self._param_buffers[bucket.dtypes()]
            param_buffer_view = param_buffer[buffer_start:buffer_end].detach()
            if not _devices_match(param_buffer_view.device, param.device):
                raise RuntimeError(
                    "Attempted to change a parameter with device={param.device} "
                    f"into a buffer view with device={param_buffer_view.device}"
                )
            if param_buffer_view.dtype != param.dtype:
                if (
                    not torch.is_floating_point(param_buffer_view)
                    and param_buffer_view.element_size() == param.element_size()
                ):
                    param_buffer_view = param_buffer_view.view(dtype=param.dtype)
                else:
                    raise RuntimeError(
                        f"Attempted to change a parameter with dtype={param.dtype} "
                        f"into a buffer view with dtype={param_buffer_view.dtype}"
                    )
            if param.is_contiguous(memory_format=torch.channels_last):
                param = param.permute(0, 2, 3, 1)
            param_flat_views.append(param.detach().view(-1))
            param_buffer_views.append(param_buffer_view)

        # Copy values into param buffer
        _multi_tensor_copy(
            param_flat_views,
            param_buffer_views,
            dummy_overflow_buf=self._dummy_overflow_buf,
        )

        # Make all params a view into the param buffer
        for param, buffer_view in zip(params, param_buffer_views):
            # Preserve memory format for param here, i.e. NHWC tensors
            # `param.data.set_()` failed to change storage.
            # `param.set_()` invalidates bprop hook.
            param.data = buffer_view.as_strided(param.size(), param.stride())

    def _init_grad_buffer(self) -> None:
        """Allocate contiguous buffer for grad buckets"""

        # Make sure all params are initialized
        self.contiguous_grad_buffer = True
        self.init_params()

        # Construct grad buffers
        buffer_sizes = collections.defaultdict(lambda: 0)
        for bucket in self.state["buckets"]:
            dtypes = bucket.dtypes()
            buffer_sizes[dtypes] = max(
                bucket.contiguous_buffer_offset + bucket.bucket_size,
                buffer_sizes[dtypes],
            )
        for dtypes, buffer_size in buffer_sizes.items():
            _, grad_sync_dtype, _ = dtypes
            if not self.nccl_ub:
                self._grad_buffers[dtypes] = torch.zeros(
                    [buffer_size],
                    dtype=grad_sync_dtype,
                    device=self.device,
                )
            else:
                pool = nccl_allocator.create_nccl_mem_pool()
                with nccl_allocator.nccl_mem(pool):
                    self._grad_buffers[dtypes] = torch.zeros(
                        [buffer_size],
                        dtype=grad_sync_dtype,
                        device=self.device,
                    )
                shard_buffer_size = buffer_size // self.distributed_size
                with nccl_allocator.nccl_mem(pool):
                    self._shard_grad_buffers[dtypes] = torch.zeros(
                        [shard_buffer_size],
                        dtype=grad_sync_dtype,
                        device=self.device,
                    )

    def parameters(self) -> Iterable[torch.nn.Parameter]:
        """Returns an iterator over optimizer parameters"""
        return itertools.chain.from_iterable(group["params"] for group in self.param_groups)

    def parameter(
        self,
        *args: Union[int, ParameterFragment],
    ) -> torch.nn.Parameter:
        """Get optimizer parameter

        Can either accept two ints or one
        DistributedFusedAdam.ParameterFragment.

        Arguments:
            param_group_id (int): Parameter group index
            param_id (int): Parameter index within parameter group

        """
        if len(args) == 2 and isinstance(args[0], int) and isinstance(args[1], int):
            param_group_id = args[0]
            param_id = args[1]
        elif len(args) == 1 and isinstance(args[0], self.ParameterFragment):
            fragment = args[0]
            param_group_id = fragment.param_group_id
            param_id = fragment.param_id
        else:
            raise TypeError(
                "Expected input types are "
                "[int, int] or [DistributedFusedAdam.ParameterFragment], "
                f"but found {[type(arg).__name__ for arg in args]}"
            )
        return self.param_groups[param_group_id]["params"][param_id]

    def init_params(
        self,
        params: Optional[Iterable[torch.nn.Parameter]] = None,
        dtype: Optional[torch.dtype] = None,
        grad_sync_dtype: Optional[torch.dtype] = None,
        param_sync_dtype: Optional[torch.dtype] = None,
    ) -> None:
        """Initialize optimizer state for parameters

        Ignores parameters that have already been initialized.

        Arguments:
            params (iterable, optional): parameters to initialize
                (default: all parameters)

        """

        # Default cases
        if params is None:
            params = self.parameters()
        elif isinstance(params, torch.Tensor):
            params = [params]

        # Ignore parameters that have already been initialized
        params = [param for param in params if "fragments" not in self.state[param]]
        if not params:
            return

        # Get indices corresponding to parameters
        id_map = dict()
        for param_group_id, group in enumerate(self.param_groups):
            for param_id, param in enumerate(group["params"]):
                id_map[param] = (param_group_id, param_id)

        # Initialize parameters
        for param in params:
            if param in id_map:
                param_group_id, param_id = id_map[param]
                self._init_param_state(
                    param,
                    param_group_id,
                    param_id,
                    dtype=dtype,
                    grad_sync_dtype=grad_sync_dtype,
                    param_sync_dtype=param_sync_dtype,
                )

    def init_params_bucket(
        self,
        params: Iterable[torch.nn.Parameter],
        dtype: Optional[torch.dtype] = None,
        grad_sync_dtype: Optional[torch.dtype] = None,
        param_sync_dtype: Optional[torch.dtype] = None,
    ) -> None:
        """Initialize optimizer state for parameters in one effective bucket

        The buckets corresponding to the provided parameters are
        configured so they all perform communication together. Ignores
        parameters that have already been initialized.

        Arguments:
            params (iterable): parameters to initialize

        """

        # Ignore parameters that have already been initialized
        if isinstance(params, torch.Tensor):
            params = [params]
        params = [param for param in params if "fragments" not in self.state[param]]
        if not params:
            return

        # Get indices corresponding to parameters
        id_map = dict()
        for param_group_id, group in enumerate(self.param_groups):
            for param_id, param in enumerate(group["params"]):
                id_map[param] = [param_group_id, param_id]
        param_ids = [tuple([param] + id_map[param]) for param in params]

        # Mark existings bucket as fully filled
        for bucket in self.state["buckets"]:
            bucket.able_to_fill = False

        # Initialize optimizer state for parameters
        start_bucket_id = len(self.state["buckets"])
        self.init_params(
            params,
            dtype=dtype,
            grad_sync_dtype=grad_sync_dtype,
            param_sync_dtype=param_sync_dtype,
        )
        end_bucket_id = len(self.state["buckets"])

        # Make sure all added buckets depend on provided params
        for bucket_id in range(start_bucket_id, end_bucket_id):
            bucket = self.state["buckets"][bucket_id]
            bucket_size = bucket.bucket_size
            bucket.able_to_fill = False
            ids_in_bucket = set(
                (fragment.param_group_id, fragment.param_id) for fragment in bucket.fragments
            )
            for param, param_group_id, param_id in param_ids:
                if (param_group_id, param_id) not in ids_in_bucket:
                    param_size = param.numel()
                    fragment = self.ParameterFragment(
                        param_group_id=param_group_id,
                        param_id=param_id,
                        bucket_id=bucket_id,
                        param_range=(param_size, param_size),
                        bucket_range=(bucket_size, bucket_size),
                        in_local_shard=False,
                        shard_range=None,
                        shard_bucket_range=None,
                        shard_param_range=None,
                    )
                    self.state[param]["fragments"].append(fragment)
                    bucket.fragments.append(fragment)

    @torch.no_grad()
    def _init_param_state(
        self,
        param: torch.nn.Parameter,
        param_group_id: int,
        param_id: int,
        dtype: Optional[torch.dtype] = None,
        grad_sync_dtype: Optional[torch.dtype] = None,
        param_sync_dtype: Optional[torch.dtype] = None,
    ) -> None:
        """Initialize optimizer state for a parameter"""

        # Return immediately if already initialized
        if "fragments" in self.state[param]:
            return
        self.state[param]["fragments"] = []

        # Data type configuration
        if dtype is None:
            dtype = self.dtype
        if grad_sync_dtype is None:
            grad_sync_dtype = self.grad_sync_dtype
        if param_sync_dtype is None:
            param_sync_dtype = self.param_sync_dtype
        if dtype != self.dtype:
            raise ValueError("Optimizer states with non-default dtypes are not supported")
        supported_dtypes = (torch.float32, torch.float16, torch.bfloat16)
        if dtype not in supported_dtypes or grad_sync_dtype not in supported_dtypes:
            raise ValueError(
                "Unsupported dtypes for DistributedFusedAdam "
                f"(dtype={dtype}, "
                f"grad_sync_dtype={grad_sync_dtype}, "
                f"param_sync_dtype={param_sync_dtype}))"
            )

        # Store params or param remainders
        store_params = (
            self.store_params or dtype != self.dtype or param_sync_dtype != self.param_sync_dtype
        )
        store_param_remainders = (
            self.store_param_remainders
            and dtype == self.dtype
            and param_sync_dtype == self.param_sync_dtype
        )

        def last_bucket_id() -> int:
            """Index of last optimizer state bucket with desired dtypes

            -1 if there are no such buckets.

            """
            dtypes = (dtype, grad_sync_dtype, param_sync_dtype)
            bucket_id = len(self.state["buckets"]) - 1
            while bucket_id > 0:
                bucket = self.state["buckets"][bucket_id]
                if bucket.dtypes() == dtypes:
                    break
                bucket_id -= 1
            return bucket_id

        def make_bucket(
            bucket_size: int,
            shard_size: int,
            buffer_offset: int,
        ) -> None:
            """Construct new optimizer state bucket"""
            self.state["buckets"].append(
                self.StateBucket(
                    bucket_size,
                    shard_size,
                    dtype,
                    self.device,
                    grad_sync_dtype,
                    param_sync_dtype,
                    contiguous_buffer_offset=buffer_offset,
                    store_params=store_params,
                    store_param_remainders=store_param_remainders,
                )
            )

        # Make sure there is at least one bucket with expected dtypes
        if last_bucket_id() < 0:
            shard_size = self.default_shard_size
            bucket_size = shard_size * self.distributed_size
            buffer_offset = 0
            make_bucket(bucket_size, shard_size, buffer_offset)

        # Split parameter values into fragments
        # Note: Each fragment resides within a bucket
        param_start = 0
        param_size = param.numel()
        while param_start < param_size:
            # Get current bucket
            bucket_id = last_bucket_id()
            bucket = self.state["buckets"][bucket_id]
            fragment_id = len(bucket.fragments)
            bucket_size = bucket.bucket_size
            shard_size = bucket.shard_size

            # Determine fragment position within bucket
            bucket_start = _round_to_multiple(
                bucket.filled_size,
                self.alignment,
                round_up=True,
            )
            fragment_size = min(param_size - param_start, bucket_size - bucket_start)
            param_end = param_start + fragment_size
            bucket_end = bucket_start + fragment_size

            # Create new bucket if current one is full
            if fragment_size <= 0 or not bucket.able_to_fill:
                shard_size = self.default_shard_size
                bucket_size = shard_size * self.distributed_size
                buffer_offset = bucket.contiguous_buffer_offset + bucket.bucket_size
                make_bucket(bucket_size, shard_size, buffer_offset)
                continue

            # Fragment position within local shard
            shard_id = self.distributed_rank
            shard_start = bucket_start - shard_size * shard_id
            shard_end = bucket_end - shard_size * shard_id
            shard_start = min(max(shard_start, 0), shard_size)
            shard_end = min(max(shard_end, 0), shard_size)
            in_local_shard = shard_start < shard_end
            shard_range = None
            shard_bucket_range = None
            shard_param_range = None
            if in_local_shard:
                shard_range = (shard_start, shard_end)
                shard_bucket_start = shard_start + shard_size * shard_id
                shard_bucket_end = shard_bucket_start + shard_end - shard_start
                shard_bucket_range = (shard_bucket_start, shard_bucket_end)
                shard_param_start = shard_bucket_start - bucket_start + param_start
                shard_param_end = shard_param_start + shard_end - shard_start
                shard_param_range = (shard_param_start, shard_param_end)

            # Record fragment info
            fragment = self.ParameterFragment(
                param_group_id=param_group_id,
                param_id=param_id,
                bucket_id=bucket_id,
                param_range=(param_start, param_end),
                bucket_range=(bucket_start, bucket_end),
                in_local_shard=in_local_shard,
                shard_range=shard_range,
                shard_bucket_range=shard_bucket_range,
                shard_param_range=shard_param_range,
            )
            self.state[param]["fragments"].append(fragment)
            bucket.fragments.append(fragment)
            bucket.filled_size = bucket_end
            param_start = param_end

        # Initialize optimizer state scaling factors if needed
        if self.with_scaled_states:
            for fragment in self.state[param]["fragments"]:
                if not fragment.in_local_shard:
                    continue
                bucket_id = fragment.bucket_id
                self._state_scales[(param_group_id, param_id, bucket_id)] = dict(
                    param=torch.zeros([1], dtype=torch.float32, device=self.device),
                    exp_avg=torch.zeros([1], dtype=torch.float32, device=self.device),
                    exp_avg_sq=torch.zeros([1], dtype=torch.float32, device=self.device),
                )

        # Initialize main param buffer
        if store_params:
            for fragment in self.state[param]["fragments"]:
                if not fragment.in_local_shard:
                    continue
                bucket_id = fragment.bucket_id
                bucket = self.state["buckets"][bucket_id]
                # If param is channels last, i.e. tensor with shape (N, C, H, W)
                # and stride (HWC, 1, WC, C), then we will turn it into a tensor
                # with shape (N, H, W, C) and stride (HWC, WC, C, 1). The purppose
                # is to avoid failures when flattening the tensor (`.view(-1)`)
                # and stepping the optimizer.
                if param.is_contiguous(memory_format=torch.channels_last):
                    param = param.permute(0, 2, 3, 1)
                param_range = slice(*fragment.shard_param_range)
                shard_range = slice(*fragment.shard_range)
                model_param_fragment = param.detach().view(-1)[param_range]
                if self.with_scaled_states:
                    model_param_fragment = torch.empty_like(
                        model_param_fragment,
                        dtype=torch.float32,
                    ).copy_(model_param_fragment)
                    self._apply_state_scale(
                        model_param_fragment,
                        self._state_scales[(param_group_id, param_id, bucket_id)]["param"],
                    )
                main_param_fragment = bucket.params_shard[shard_range]
                main_param_fragment.copy_(model_param_fragment)

        # Check if buckets are underutilized
        if all("fragments" in self.state[param] for param in self.parameters()):
            bucket_size = sum(bucket.bucket_size for bucket in self.state["buckets"])
            filled_size = sum(bucket.filled_size for bucket in self.state["buckets"])
            buckets_utilization = filled_size / bucket_size
            if buckets_utilization < 0.7:
                warnings.warn(
                    f"Only {buckets_utilization:.1%} of buckets are used. "
                    "Consider decreasing the bucket_cap_mb argument."
                )

    def zero_grad(self, set_to_none: bool = False) -> None:
        """Clear parameter gradients"""

        # Reset bucket buffers
        self._grads_buckets.clear()

        # Construct views into contiguous grad buffer, if needed
        if self.contiguous_grad_buffer:
            if not self._grad_buffers:
                self._init_grad_buffer()
            for grad_buffer in self._grad_buffers.values():
                grad_buffer.zero_()
            for bucket_id, bucket in enumerate(self.state["buckets"]):
                bucket_size = bucket.bucket_size
                buffer_start = bucket.contiguous_buffer_offset
                buffer_end = buffer_start + bucket_size
                grad_buffer = self._grad_buffers[bucket.dtypes()]
                self._grads_buckets[bucket_id].grads_bucket = grad_buffer[buffer_start:buffer_end]
                if self.nccl_ub:
                    shard_size = bucket.shard_size
                    shard_buffer_start = bucket.contiguous_buffer_offset // self.distributed_size
                    shard_buffer_end = shard_buffer_start + shard_size
                    shard_grad_buffer = self._shard_grad_buffers[bucket.dtypes()]
                    self._grads_buckets[bucket_id].sync_grads_shard = shard_grad_buffer[
                        shard_buffer_start:shard_buffer_end
                    ]

        # Reset param grads
        for param in self.parameters():
            with _disable_pre_forward_hook(param):
                need_to_zero = True
                if set_to_none:
                    param.grad = None
                elif self.contiguous_grad_buffer:
                    bucket_id = self.state[param]["fragments"][0].bucket_id
                    bucket = self.state["buckets"][bucket_id]
                    if param.dtype == bucket.grad_sync_dtype and _devices_match(
                        param.device, self.device
                    ):
                        param.grad = self.grad_buffer_view(param)
                        need_to_zero = False
                if need_to_zero and param.grad is not None:
                    param.grad.zero_()

        # Reset other state
        self._grad_scale.fill_(1.0)
        self._grad_norm = None
        self._dummy_overflow_buf.zero_()

    def _grad_copy(self, param: torch.nn.Parameter) -> None:
        """Copy parameter gradients to gradient buckets

        Initializes gradient buckets if needed. The original parameter
        gradient is set to None.

        """

        # Initialize parameter if needed
        if "fragments" not in self.state[param]:
            for param_group_id, group in enumerate(self.param_groups):
                for param_id, param_ in enumerate(group["params"]):
                    if param is param_:
                        self._init_param_state(param, param_group_id, param_id)
            if "fragments" not in self.state[param]:
                raise RuntimeError("Could not initialize DistributedFusedAdam with parameter")

        # Copy param grad to buckets
        for fragment in self.state[param]["fragments"]:
            # Get fragment position
            bucket_id = fragment.bucket_id
            bucket = self._grads_buckets[bucket_id]
            bucket_size = self.state["buckets"][bucket_id].bucket_size
            grad_sync_dtype = self.state["buckets"][bucket_id].grad_sync_dtype
            grad_start, grad_end = fragment.param_range
            bucket_start, bucket_end = fragment.bucket_range

            # Set reduction status
            if bucket.status == self.GradientStatus.SYNCING:
                self._finish_bucket_grad_sync()
            bucket.status = self.GradientStatus.PARTIALLY_FILLED

            # Allocate gradient buffer if needed
            if bucket.grads_bucket is None and self.contiguous_grad_buffer:
                if not self._grad_buffers:
                    self._init_grad_buffer()
                state_bucket = self.state["buckets"][bucket_id]
                buffer_start = state_bucket.contiguous_buffer_offset
                buffer_end = buffer_start + bucket_size
                grad_buffer = self._grad_buffers[state_bucket.dtypes()]
                grad_buffer = grad_buffer[buffer_start:buffer_end]
                if (
                    bucket.grads_shard is None
                    or bucket.grads_shard.storage().data_ptr() != grad_buffer.storage().data_ptr()
                ):
                    bucket.grads_bucket = grad_buffer
                    bucket.grads_bucket.zero_()
            if bucket.grads_bucket is None:
                bucket.grads_bucket = torch.zeros(
                    [bucket_size],
                    dtype=grad_sync_dtype,
                    device=self.device,
                )

            # Copy param grad to bucket
            if param.grad is not None:
                if param.grad.is_contiguous(memory_format=torch.channels_last):
                    grad_in = param.grad.permute(0, 2, 3, 1)
                else:
                    grad_in = param.grad
                grad_in = grad_in.detach().view(-1)[grad_start:grad_end]
                grad_out = bucket.grads_bucket[bucket_start:bucket_end]
                if grad_in.data_ptr() != grad_out.data_ptr():
                    grad_out.add_(grad_in)

        # Free param grad buffer
        param.grad = None

    def _param_copy(
        self,
        params: Union[torch.nn.Parameter, Iterable[torch.nn.Parameter]],
    ) -> None:
        """Update parameters with values from parameter buckets

        Synchronizes and deletes parameter buckets as needed.

        """

        # Get parameter fragments to be synchronized
        if isinstance(params, torch.Tensor):
            params = [params]
        fragments = []
        for param in params:
            if "fragments" in self.state[param]:
                fragments.extend(
                    fragment
                    for fragment in self.state[param]["fragments"]
                    if fragment.bucket_id in self._params_buckets
                )

        # Return immediately if no fragments need to be synchronized
        if not fragments:
            return

        # Make sure all needed buckets have been synchronized
        buckets = collections.OrderedDict()
        for fragment in fragments:
            bucket_id = fragment.bucket_id
            bucket = self._params_buckets[bucket_id]
            buckets[bucket] = bucket.status
        if any(status != self.ParameterStatus.READY for bucket, status in buckets.items()):
            self._start_bucket_param_sync(buckets.keys())
            self._finish_bucket_param_sync()

        # Copy values from bucket buffers to params
        self._param_copy_fragments(fragments)

        # Delete buckets if possible
        for fragment in fragments:
            bucket_id = fragment.bucket_id
            bucket = self._params_buckets[bucket_id]
            bucket.params_updated.add(self.parameter(fragment))
            bucket_fragments = self.state["buckets"][bucket_id].fragments
            if len(bucket.params_updated) == len(bucket_fragments):
                del self._params_buckets[bucket_id]

    def _param_copy_fragments(
        self,
        fragments: Iterable[ParameterFragment],
    ) -> None:
        """Update parameter fragments with values from parameter buckets"""

        # Figure out corresponding positions in param buckets and params
        buffers_in = []
        buffers_out = []
        for fragment in fragments:
            # Check if fragment needs to be updated
            bucket_id = fragment.bucket_id
            bucket_start, bucket_end = fragment.bucket_range
            param_start, param_end = fragment.param_range
            if param_end <= param_start or bucket_id not in self._params_buckets:
                continue

            # Corresponding positions in param bucket and param
            bucket = self._params_buckets[bucket_id]
            param = self.parameter(fragment)

            # Conv with NHWC layout, i.e. shape (N, C, H, W) and stride
            # (HWC, 1, WC, C), can't `.view(-1)`. Here to turn it to
            # tensor with shape (N, H, W, C) and stride (HWC, WC, C, 1).
            if param.is_contiguous(memory_format=torch.channels_last):
                param = param.permute(0, 2, 3, 1)

            buffer_in = bucket.params_bucket[bucket_start:bucket_end]
            buffer_out = param.detach().view(-1)[param_start:param_end]

            if torch.is_floating_point(buffer_in) and torch.is_floating_point(buffer_out):
                # Cast between floating-point dtypes
                buffers_in.append(buffer_in)
                buffers_out.append(buffer_out)
            else:
                # Copy most significant bytes for non-floating-point
                # dtypes
                # Note: Assume dtypes are little-endian
                in_bytes = buffer_in.unsqueeze(-1).view(torch.uint8)
                out_bytes = buffer_out.unsqueeze(-1).view(torch.uint8)
                copy_size = min(in_bytes.size(-1), out_bytes.size(-1))
                buffers_in.append(in_bytes[..., -copy_size:])
                buffers_out.append(out_bytes[..., -copy_size:])
                if copy_size < out_bytes.size(-1):
                    out_bytes[..., :-copy_size].zero_()

        # Copy data from parameter buckets to parameters
        _multi_tensor_copy(
            buffers_in,
            buffers_out,
            dummy_overflow_buf=self._dummy_overflow_buf,
        )

    def grad_buffer_view(self, param: torch.nn.Parameter) -> torch.Tensor:
        """Construct view into grad buffer corresponding to param

        Assumes optimizer is using a contiguous grad buffer.

        """

        # Initialize contiguous grad buffers if needed
        assert self.contiguous_grad_buffer
        if not self._grad_buffers:
            self._init_grad_buffer()

        # Figure out corresponding position in grad buffer
        fragment = self.state[param]["fragments"][0]
        bucket_id = fragment.bucket_id
        bucket = self.state["buckets"][bucket_id]
        bucket_start, _ = fragment.bucket_range
        buffer_offset = bucket.contiguous_buffer_offset
        buffer_start = buffer_offset + bucket_start
        buffer_end = buffer_start + param.numel()

        # Construct view into grad buffer
        # Preserve memory format for gradient here
        flat_buffer = self._grad_buffers[bucket.dtypes()]
        flat_buffer = flat_buffer[buffer_start:buffer_end]
        return flat_buffer.detach().as_strided(param.size(), param.stride())

    def _force_bucket_grad_sync(self) -> None:
        """Ensure that all gradient buckets are synchronized"""

        # Synchronize all unsynchronized buckets
        Status = self.GradientStatus
        buckets = []
        for bucket_id, grads_bucket in sorted(self._grads_buckets.items()):
            if grads_bucket.status not in (Status.READY, Status.SYNCING):
                buckets.append(grads_bucket)
                if grads_bucket.grads_bucket is None:
                    state_bucket = self.state["buckets"][bucket_id]
                    grads_bucket.grads_bucket = torch.zeros(
                        [state_bucket.bucket_size],
                        dtype=state_bucket.grad_sync_dtype,
                        device=self.device,
                    )
        if buckets:
            self._start_bucket_grad_sync(buckets)
        self._finish_bucket_grad_sync()

        # Fill any unsynchronized gradients with zeros
        for bucket_id in range(len(self.state["buckets"])):
            grads_bucket = self._grads_buckets[bucket_id]
            if grads_bucket.grads_shard is None:
                state_bucket = self.state["buckets"][bucket_id]
                grads_bucket.grads_shard = torch.zeros(
                    [state_bucket.shard_size],
                    dtype=state_bucket.grad_sync_dtype,
                    device=self.device,
                )

    def _try_start_bucket_grad_sync(
        self,
        params: Optional[Iterable[torch.nn.Parameter]] = None,
        ignore_last_bucket: bool = False,
    ) -> None:
        """Attempt to launch gradient synchronization

        Launches gradient synchronization if any bucket has receieved
        all its expected gradients. Gradient synchronization is
        asynchronous.

        Arguments:
            params (iterable): parameters that have had their
                gradients copied to buckets
            ignore_last_bucket (bool): avoid synchronizing last bucket
                until all gradients have been generated. This avoids
                excessive synchronization when initializing buckets in
                the first backward pass.

        """

        # Register params that have generated grads
        if params is None:
            params = []
        for param in params:
            for fragment in self.state[param]["fragments"]:
                bucket_id = fragment.bucket_id
                grads_bucket = self._grads_buckets[bucket_id]
                state_bucket = self.state["buckets"][bucket_id]
                bucket_fragments = state_bucket.fragments
                grads_bucket.grads_generated.add(param)
                if len(grads_bucket.grads_generated) == len(bucket_fragments):
                    grads_bucket.status = self.GradientStatus.FULLY_FILLED
                    if grads_bucket.grads_bucket is None:
                        grads_bucket.grads_bucket = torch.zeros(
                            [state_bucket.bucket_size],
                            dtype=state_bucket.grad_sync_dtype,
                            device=self.device,
                        )

        # Launch reductions if enough buckets are ready
        filled_buckets = []
        for bucket_id, bucket in sorted(self._grads_buckets.items()):
            if ignore_last_bucket and bucket_id == len(self.state["buckets"]) - 1:
                continue
            if bucket.status == self.GradientStatus.FULLY_FILLED:
                filled_buckets.append(bucket)
        if filled_buckets:
            self._start_bucket_grad_sync(filled_buckets)

    def _start_bucket_grad_sync(self, buckets: List[GradientBucket]) -> None:
        """Synchronize gradient buckets

        Gradient synchronization is asynchronous. Involves
        reduce-scatter over distributed process group and allreduce
        over redundant process group. Assumes grad bucket buffers are
        already initialized.

        """

        # Complete any outstanding grad syncs
        # Note: Not needed with contiguous grad buffer since there is
        # no memory benefit from eagerly freeing grad buffers.
        if not self.contiguous_grad_buffer:
            self._finish_bucket_grad_sync()

        # Reduction operation
        if self.average_grad_sync and not self.nccl_ub:
            reduce_op = torch.distributed.ReduceOp.AVG
        else:
            reduce_op = torch.distributed.ReduceOp.SUM

        # Initialize grad state and buffers
        for bucket in buckets:
            if bucket.status == self.GradientStatus.SYNCING:
                self._finish_bucket_grad_sync()
            bucket.status = self.GradientStatus.SYNCING
            bucket.grads_generated.clear()
            if self.distributed_size == 1:
                bucket.sync_grads_shard = bucket.grads_bucket
            elif bucket.sync_grads_shard is None:
                bucket_size = bucket.grads_bucket.numel()
                shard_size = bucket_size // self.distributed_size
                bucket.sync_grads_shard = torch.empty(
                    [shard_size],
                    dtype=bucket.grads_bucket.dtype,
                    device=bucket.grads_bucket.device,
                )

            # Handle case with multiple grad accumulation steps
            if bucket.grads_shard is not None:
                if bucket.sync_grads_shard.data_ptr() == bucket.grads_shard.data_ptr():
                    bucket.grads_shard = bucket.grads_shard.clone()

        # Side stream for communication
        # If new bucket is ready before last bucket communication finishes, use multiple
        # communication streams could help pipeline reduce-scatter and all-reduce.
        main_stream = torch.cuda.current_stream()
        self._last_comm_stream_id = (self._last_comm_stream_id + 1) % len(self._comm_streams)
        comm_stream = self._comm_streams[self._last_comm_stream_id]
        comm_stream.wait_stream(main_stream)

        # Reduce-scatter over distributed process group
        if buckets and self.distributed_size > 1:
            with torch.cuda.stream(comm_stream):
                group = self.distributed_process_group
                with _coalescing_manager(group, self.device, async_ops=True) as cm:
                    for bucket in buckets:
                        if self.average_grad_sync and self.nccl_ub:
                            bucket.grads_bucket /= self.distributed_size
                        _coalescing_manager_append_work(
                            cm,
                            reduce_scatter_tensor(
                                bucket.sync_grads_shard,
                                bucket.grads_bucket,
                                op=reduce_op,
                                group=group,
                                async_op=True,
                            ),
                        )
                cm.wait()

        # All-reduce over redundant process group
        if buckets and self.redundant_size > 1:
            with torch.cuda.stream(comm_stream):
                group = self.redundant_process_group
                with _coalescing_manager(group, self.device, async_ops=True) as cm:
                    for bucket in buckets:
                        _coalescing_manager_append_work(
                            cm,
                            torch.distributed.all_reduce(
                                bucket.sync_grads_shard,
                                op=reduce_op,
                                group=group,
                                async_op=True,
                            ),
                        )
                cm.wait()

    def _finish_bucket_grad_sync(self) -> None:
        """Wait for any gradient synchronizations that are in progress"""
        main_stream = torch.cuda.current_stream()
        for comm_stream in self._comm_streams:
            main_stream.wait_stream(comm_stream)
        for bucket_id, bucket in sorted(self._grads_buckets.items()):
            if bucket.status == self.GradientStatus.SYNCING:
                # Accumulate gradient in local shard
                if bucket.grads_shard is None:
                    bucket.grads_shard = bucket.sync_grads_shard
                else:
                    bucket.grads_shard.add_(bucket.sync_grads_shard)
                bucket.grads_bucket = None

                # Reset status
                bucket.status = self.GradientStatus.READY

                # Cached gradient norm has been invalidated
                self._grad_norm = None

    def _try_start_bucket_param_sync(
        self,
        params: Iterable[torch.nn.Parameter] = None,
    ) -> None:
        """Attempt to launch parameter synchronization

        Launches parameter synchronization for buckets corresponding
        to provided parameters, if needed. If parameters are not
        provided and no other synchronizations are in progress,
        attempts to find a parameter that still requires
        synchronization. Parameter synchronization is asynchronous.

        Arguments:
            params (iterable, optional): parameters to synchronize

        """

        # Default behavior: only launch param sync if no other syncs
        # are in progress
        if params is None:
            params = []
            if any(
                bucket.status == self.ParameterStatus.SYNCING
                for bucket in self._params_buckets.values()
            ):
                return
            for bucket_id, bucket in self._params_buckets.items():
                if bucket.status == self.ParameterStatus.SHARDED:
                    params.append(self.parameter(self.state["buckets"][bucket_id].fragments[-1]))
                    break

        # Find buckets corresponding to params
        bucket_ids = set()
        for param in params:
            bucket_ids.update(fragment.bucket_id for fragment in self.state[param]["fragments"])
        buckets = [
            self._params_buckets[bucket_id]
            for bucket_id in sorted(bucket_ids)
            if bucket_id in self._params_buckets
        ]
        buckets = [bucket for bucket in buckets if bucket.status == self.ParameterStatus.SHARDED]

        # Launch param sync if needed
        if buckets:
            self._start_bucket_param_sync(buckets)

    def _start_bucket_param_sync(self, buckets: List[ParameterBucket]) -> None:
        """Synchronize parameter buckets

        Parameter synchronization is asynchronous. Involves all-gather
        over distributed process group. Assumes param shard buffers
        are already initialized.

        """

        # Complete any outstanding param syncs
        self._finish_bucket_param_sync()

        # Initialize param state and buffers
        buckets = [bucket for bucket in buckets if bucket.status == self.ParameterStatus.SHARDED]
        for bucket in buckets:
            bucket.status = self.ParameterStatus.SYNCING
            if bucket.params_bucket is not None:
                pass
            elif self.distributed_size == 1:
                bucket.params_bucket = bucket.params_shard
            else:
                shard_size = bucket.params_shard.numel()
                bucket_size = shard_size * self.distributed_size
                bucket.params_bucket = torch.empty(
                    [bucket_size],
                    dtype=bucket.params_shard.dtype,
                    device=bucket.params_shard.device,
                )

        # Side stream for communication
        main_stream = torch.cuda.current_stream()
        self._last_comm_stream_id = (self._last_comm_stream_id + 1) % len(self._comm_streams)
        comm_stream = self._comm_streams[self._last_comm_stream_id]
        comm_stream.wait_stream(main_stream)

        # All-gather over distributed process group
        if buckets and self.distributed_size > 1:
            with torch.cuda.stream(comm_stream):
                group = self.distributed_process_group
                with _coalescing_manager(group, self.device, async_ops=True) as cm:
                    for bucket in buckets:
                        _coalescing_manager_append_work(
                            cm,
                            all_gather_into_tensor(
                                bucket.params_bucket,
                                bucket.params_shard,
                                group=group,
                                async_op=True,
                            ),
                        )
                cm.wait()

    def _finish_bucket_param_sync(self) -> None:
        """Wait for any param synchronizations that are in progress"""
        main_stream = torch.cuda.current_stream()
        for comm_stream in self._comm_streams:
            main_stream.wait_stream(comm_stream)
        for bucket_id, bucket in self._params_buckets.items():
            if bucket.status == self.ParameterStatus.SYNCING:
                bucket.params_shard = None
                bucket.status = self.ParameterStatus.READY

    @contextlib.contextmanager
    def no_sync(
        self,
        greedy_grad_copy: None = False,
    ) -> contextlib.AbstractContextManager:
        """Disable overlapped gradient synchronization

        Context manager that is similar to
        torch.nn.parallel.DistributedDataParallel.no_sync. The
        gradients can be synchronized by calling grad_sync or step. If
        overlapped gradient synchronization is enabled, gradients can
        also be synchronized by leaving the context and performing a
        backward pass.

        Arguments:
            greedy_grad_copy (bool, optional): copy parameter
                gradients to buckets as soon as they are generated
                (default: False)

        """
        old_greedy_grad_copy = self.greedy_grad_copy
        old_overlap_grad_sync = self.overlap_grad_sync
        self.greedy_grad_copy = greedy_grad_copy
        self.overlap_grad_sync = False
        try:
            yield
        finally:
            self.greedy_grad_copy = old_greedy_grad_copy
            self.overlap_grad_sync = old_overlap_grad_sync

    def grad_sync(self) -> None:
        """Ensure that all gradients are synchronized"""
        for bucket in self.state["buckets"]:
            for fragment in bucket.fragments:
                param = self.parameter(fragment)
                if param.grad is not None:
                    self._grad_copy(param)
                    if not self.contiguous_grad_buffer:
                        self._try_start_bucket_grad_sync(
                            params=[param],
                            ignore_last_bucket=False,
                        )
        self._force_bucket_grad_sync()

    def param_sync(self) -> None:
        """Ensure that all parameters are synchronized"""
        if self.contiguous_param_buffer:
            self._param_copy(self.parameters())
        else:
            while self._params_buckets:
                bucket_id, bucket = next(iter((self._params_buckets.items())))
                for fragment in reversed(self.state["buckets"][bucket_id].fragments):
                    self._param_copy(self.parameter(fragment))
        self._params_buckets.clear()

    @torch.no_grad()
    def _local_grad_norm(
        self,
        parameters: Optional[Iterable[torch.nn.Parameter]] = None,
        norm_type: float = 2.0,
    ) -> torch.Tensor:
        """Local contribution to parameter gradient norm

        Returns square of 2-norm. Other norms are not yet supported.

        If no parameters are provided, the norm is computed for all
        parameters in optimizer. Provided parameters are assumed to be
        in optimizer and to require gradients.

        """
        norm_type = float(norm_type)
        assert norm_type == 2.0

        # Make sure that gradients have been reduced
        self.grad_sync()

        # Check if provided parameters are subset of all parameters
        if parameters is not None:
            parameters = list(parameters)
            params_set = set(parameters)
            all_params_set = set()
            for bucket in self.state["buckets"]:
                for fragment in bucket.fragments:
                    all_params_set.add(self.parameter(fragment))
            if not params_set.issubset(all_params_set):
                raise RuntimeError(
                    "Attempted to compute gradient norm for a parameter "
                    "that is not managed by DistributedFusedAdam"
                )
            if params_set == all_params_set:
                parameters = None

        # Group grads by dtype
        grad_groups = collections.defaultdict(list)
        if parameters is None:
            # Compute norm of all local gradients
            for bucket_id, grads_bucket in self._grads_buckets.items():
                state_bucket = self.state["buckets"][bucket_id]
                dtype = state_bucket.grad_sync_dtype
                grad_groups[dtype].append(grads_bucket.grads_shard)
        else:
            # Compute norm of selected local gradients
            for param in parameters:
                if "fragments" not in self.state[param]:
                    continue
                for fragment in self.state[param]["fragments"]:
                    if not fragment.in_local_shard:
                        continue
                    shard_start, shard_end = fragment.shard_range
                    if shard_end <= shard_start:
                        continue
                    bucket_id = fragment.bucket_id
                    grads_bucket = self._grads_buckets[bucket_id]
                    state_bucket = self.state["buckets"][bucket_id]
                    grad_groups[state_bucket.grad_sync_dtype].append(
                        grads_bucket.grads_shard[shard_start:shard_end]
                    )

        # Compute norm of each group of grads
        grad_norm_sq = None
        for grad_group in grad_groups.values():
            grad_group_norm_sq = (
                multi_tensor_applier(
                    amp_C.multi_tensor_l2norm,
                    self._dummy_overflow_buf,
                    [grad_group],
                    False,
                )[0]
                ** 2
            )
            if grad_norm_sq is None:
                grad_norm_sq = grad_group_norm_sq
            else:
                grad_norm_sq += grad_group_norm_sq
        if grad_norm_sq is None:
            grad_norm_sq = torch.zeros([], dtype=torch.float32, device=self.device)

        # Interpret norm as scalar
        grad_norm_sq = grad_norm_sq.to(dtype=torch.float32, device=self.device)
        grad_norm_sq = grad_norm_sq.view([])
        return grad_norm_sq

    def grad_norm(
        self,
        parameters: Optional[Iterable[torch.nn.Parameter]] = None,
        norm_type: float = 2.0,
        force: bool = False,
    ) -> torch.Tensor:
        """Gradient norm of parameters in optimizer

        The norm is computed over all gradients together, as if they
        were concatenated into a single vector. All provided
        parameters must be managed by optimizer.

        The computed value is cached to avoid redundant communication.

        Arguments:
            parameters (iterable, optional): an iterable of parameters
                in optimizer (default: all parameters in optimizer).
            norm_type (float, optional): type of the used p-norm
                (default: 2). Only 2-norm is currently supported.
            force (bool, optional): ignore cached value and force norm
                computation (default: False).

        """
        if force or self._grad_norm is None:
            norm_type = float(norm_type)
            assert norm_type == 2.0
            grad_norm_sq = self._local_grad_norm(
                parameters=parameters,
                norm_type=norm_type,
            )
            torch.distributed.all_reduce(
                grad_norm_sq,
                op=torch.distributed.ReduceOp.SUM,
                group=self.distributed_process_group,
            )
            self._grad_norm = grad_norm_sq.sqrt()
        grad_norm = self._grad_norm * self._grad_scale
        return grad_norm.detach()

    def clip_grad_norm(
        self,
        max_norm: float,
        parameters: Optional[Iterable[torch.nn.Parameter]] = None,
        norm_type: float = 2.0,
    ) -> torch.Tensor:
        """Clips gradient norm of parameters in optimizer

        The norm is computed over all gradients together, as if they
        were concatenated into a single vector. The scaling is
        deferred until the optimizer step, which should be called
        immediately after this function.

        The computed grad norm is cached to avoid redundant
        communication.

        Arguments:
            max_norm (float): max norm of the gradients
            parameters (iterable, optional): an iterable of parameters
                in optimizer (default: all parameters in optimizer).
            norm_type (float, optional): type of the used
                p-norm (default: 2)

        """
        assert max_norm > 0
        total_norm = self.grad_norm(parameters=parameters, norm_type=norm_type)
        clip_coef = max_norm / (total_norm + 1e-6)
        clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
        self._grad_scale *= clip_coef_clamped
        return total_norm

    @torch.no_grad
    def unscale_grads(
        self,
        *args: Union[Optional[torch.Tensor], Any],
        inv_scale: Optional[torch.Tensor] = None,
        grad_scaler: Optional[torch.cuda.amp.GradScaler] = None,
    ) -> None:
        """Custom unscale function for use by AMP gradient scaler

        Either inv_scale or grad_scaler must be provided, but not
        both. If grad_scaler is provided, this is equivalent to
        calling its unscale_ function.

        Arguments:
            inv_scale (torch.Tensor, optional): factor to multiply
                gradients. May be provided either as a kwarg or as the
                first positional arg.
            grad_scaler (torch.cuda.amp.GradScaler): gradient scaler
                (default: None)

        """

        # inv_scale is either kwarg or first positional arg
        if inv_scale is None and len(args) >= 1:
            inv_scale = args[0]

        # Check for non-finite values
        # Note: We compute gradient norm to check for non-finite
        # values. This is more conservative and compute intensive than
        # directly checking, but it avoids extra communication if we
        # have already computed gradient norm e.g. for gradient
        # clipping.
        found_inf = torch.logical_not(torch.isfinite(self.grad_norm()))
        found_inf_per_device = {found_inf.device: found_inf.float()}

        # Get inv_scale from GradScaler if provided
        if grad_scaler is not None and grad_scaler._enabled:
            grad_scaler_state = grad_scaler._per_optimizer_states[id(self)]
            GradScalerOptState = torch.cuda.amp.grad_scaler.OptState
            if grad_scaler_state["stage"] is GradScalerOptState.UNSCALED:
                raise RuntimeError(
                    "unscale_grads has already been called since the last GradScaler update"
                )
            if grad_scaler_state["stage"] is GradScalerOptState.STEPPED:
                raise RuntimeError("unscale_grads is being called after optimizer step")
            if grad_scaler._scale is None:
                raise RuntimeError("Attempted unscale_grads with GradScaler that is missing _scale")
            if inv_scale is not None:
                raise ValueError(
                    "unscale_grads is being called with both scale_inv and grad_scaler"
                )
            inv_scale = grad_scaler._scale.double().reciprocal()
            inv_scale = inv_scale.to(dtype=torch.float32, device=self.device)
            grad_scaler_state["found_inf_per_device"] = found_inf_per_device
            grad_scaler_state["stage"] = GradScalerOptState.UNSCALED

        # Apply inv_scale to grad_scale
        if inv_scale is None:
            raise ValueError("unscale_grads is being called with neither scale_inv and grad_scaler")
        self._grad_scale *= inv_scale.view([])
        return found_inf_per_device

    def step(
        self,
        closure: Optional[Callable] = None,
        *,
        grad_scaler: Optional[torch.cuda.amp.GradScaler] = None,
    ):
        """Apply Adam optimizer step

        Arguments:
            closure (callable, optional): closure to recompute loss
                (default: None)
            grad_scaler (torch.cuda.amp.GradScaler, optional):
                gradient scaler (default: None)

        """

        # Apply closure
        loss = None
        if closure is not None:
            loss = closure()

        # Make sure params are initialized
        self.init_params()

        # Make sure that parameters and gradients are synchronized
        self.param_sync()
        self.grad_sync()

        # Apply gradient scaler if provided
        if grad_scaler is not None and grad_scaler._enabled:
            grad_scaler_state = grad_scaler._per_optimizer_states[id(self)]
            GradScalerOptState = torch.cuda.amp.grad_scaler.OptState
            if grad_scaler_state["stage"] is GradScalerOptState.READY:
                self.unscale_grads(grad_scaler=grad_scaler)
            found_inf = grad_scaler_state["found_inf_per_device"][self.device]
            if self.capturable:
                self._dummy_overflow_buf.copy_(found_inf)
            elif found_inf.item():
                return
        self._grad_scale = self._grad_scale.to(dtype=torch.float32, device=self.device)

        # Initialize buffers for param syncs
        num_buckets = len(self.state["buckets"])
        for bucket_id in reversed(range(num_buckets)):
            self._params_buckets[bucket_id] = self.ParameterBucket()
            params_bucket = self._params_buckets[bucket_id]
            state_bucket = self.state["buckets"][bucket_id]
            shard_size = state_bucket.shard_size
            dtype = state_bucket.dtype
            param_sync_dtype = state_bucket.param_sync_dtype

            if self.contiguous_param_buffer:
                # Construct views into contiguous param buffer
                if not self._param_buffers:
                    self.init_param_buffer()
                bucket_size = state_bucket.bucket_size
                buffer_start = state_bucket.contiguous_buffer_offset
                buffer_end = buffer_start + bucket_size
                param_buffer = self._param_buffers[state_bucket.dtypes()]
                params_bucket.params_bucket = param_buffer[buffer_start:buffer_end]
                bucket_start = self.distributed_rank * shard_size
                bucket_end = bucket_start + shard_size
                params_bucket.params_shard = params_bucket.params_bucket[bucket_start:bucket_end]

            # Initialize param shard buffer
            if self.with_scaled_states:
                # Use FP32 workspace buffer with scaled optimizer state
                params_bucket.params_shard = None
            elif not param_sync_dtype.is_floating_point:
                # Make sure param shard buffer is floating-point
                if state_bucket.params_shard is not None and dtype.is_floating_point:
                    params_bucket.params_shard = state_bucket.params_shard
                else:
                    params_bucket.params_shard = torch.empty(
                        [shard_size],
                        dtype=self.dtype,
                        device=self.device,
                    )
            else:
                # Allocate param shard buffer if needed
                if params_bucket.params_shard is not None:
                    pass
                elif state_bucket.params_shard is not None and dtype == param_sync_dtype:
                    params_bucket.params_shard = state_bucket.params_shard
                else:
                    params_bucket.params_shard = torch.empty(
                        [shard_size],
                        dtype=param_sync_dtype,
                        device=self.device,
                    )

        # Apply optimizer step
        self.state["step"] += (
            1 if not self.capturable else (self._dummy_overflow_buf != 1).to(torch.int)
        )
        overlap_first_bucket = (
            self.distributed_size > 1 and self.overlap_param_sync and self.state["buckets"]
        )
        if overlap_first_bucket:
            # Local step and non-blocking param sync
            # Note: Overlap param sync of first buckets with optimizer
            # step of remaining buckets.

            # Get buckets containing "first" parameter
            first_param = self.parameter(self.state["buckets"][-1].fragments[-1])
            first_bucket_ids = sorted(
                fragment.bucket_id for fragment in self.state[first_param]["fragments"]
            )

            # Local step and launch param sync for first buckets
            self._local_step(first_bucket_ids)
            self._start_bucket_param_sync(
                self._params_buckets[bucket_id] for bucket_id in first_bucket_ids
            )

            # Local step for remaining buckets
            first_bucket_ids = set(first_bucket_ids)
            self._local_step(
                [bucket_id for bucket_id in range(num_buckets) if bucket_id not in first_bucket_ids]
            )

        else:
            # Local step
            self._local_step(list(range(num_buckets)))

        # Synchronize params
        if self.distributed_size > 1 and self.overlap_param_sync:
            # Asynchronous param sync
            self._try_start_bucket_param_sync()
            for param in self.parameters():
                param._pre_forward_hook_is_enabled = True
        else:
            # Blocking param sync
            self.param_sync()

        return loss

    def _local_step(self, bucket_ids: List[int]) -> None:
        """Apply optimizer step to local shard of parameter buckets

        Arguments:
            bucket_ids (list): bucket indices

        """

        # Implementation with scaled optimizer state
        if self.with_scaled_states:
            self._local_step_with_scaled_states(bucket_ids)
            return

        # Optimized implementation with BF16 params and 16-bit param
        # remainders
        if self.store_param_remainders:
            bf16_rem_buckets = set()
            for bucket_id in bucket_ids:
                state_bucket = self.state["buckets"][bucket_id]
                if state_bucket.param_remainders_shard is not None:
                    bf16_rem_buckets.add(bucket_id)
            if bf16_rem_buckets:
                self._local_step_with_param_remainders(sorted(bf16_rem_buckets))
            bucket_ids = [
                bucket_id for bucket_id in bucket_ids if bucket_id not in bf16_rem_buckets
            ]
            if not bucket_ids:
                return

        # Find param fragments for each bucket
        buffers = collections.defaultdict(list)  # p_in, m, v, g, p_out
        for bucket_id in bucket_ids:
            state_bucket = self.state["buckets"][bucket_id]
            grads_bucket = self._grads_buckets[bucket_id]
            params_bucket = self._params_buckets[bucket_id]

            # Optimizer state buffers for local shard
            fragments = state_bucket.fragments
            exp_avg = state_bucket.exp_avg_shard
            exp_avg_sq = state_bucket.exp_avg_sq_shard
            grads = grads_bucket.grads_shard
            params_out = params_bucket.params_shard

            # Find param fragments in local shard
            for fragment in fragments:
                if not fragment.in_local_shard:
                    continue
                shard_start, shard_end = fragment.shard_range
                if shard_end <= shard_start:
                    continue
                shard_range = slice(shard_start, shard_end)
                if state_bucket.params_shard is None:
                    param = self.parameter(fragment)
                    if param.is_contiguous(memory_format=torch.channels_last):
                        param = param.permute(0, 2, 3, 1)
                    param_range = slice(*fragment.shard_param_range)
                    param_fragment = param.detach().view(-1)[param_range]
                    param_fragment = param_fragment.to(dtype=state_bucket.dtype, device=self.device)
                else:
                    params_shard = state_bucket.params_shard
                    param_fragment = params_shard[shard_range]
                buffers_key = (
                    fragment.param_group_id,
                    state_bucket.dtype,
                    state_bucket.grad_sync_dtype,
                    state_bucket.param_sync_dtype,
                )
                buffers[buffers_key].append(
                    [
                        param_fragment,
                        exp_avg[shard_range],
                        exp_avg_sq[shard_range],
                        grads[shard_range],
                        params_out[shard_range],
                    ]
                )

        # Apply optimizer step to each param group
        adam_func = (
            distributed_adam_cuda.multi_tensor_fused_adam_capturable
            if self.capturable
            else distributed_adam_cuda.multi_tensor_fused_adam
        )
        for (group_id, _, _, _), group_buffers in buffers.items():
            group = self.param_groups[group_id]
            beta1, beta2 = group["betas"]
            multi_tensor_applier(
                adam_func,
                self._dummy_overflow_buf,
                list(zip(*group_buffers)),
                self._grad_scale,
                group["lr"],
                beta1,
                beta2,
                group["eps"],
                self.state["step"],
                1 if self.adam_w_mode else 0,
                1 if group["bias_correction"] else 0,
                group["weight_decay"],
            )

        # Make sure param sync buffer has correct dtype
        self._check_params_shard_dtypes(
            {bucket_id: self._params_buckets[bucket_id] for bucket_id in bucket_ids}
        )

    def _local_step_with_param_remainders(
        self,
        bucket_ids: List[int],
    ) -> None:
        """Apply optimizer step to local shard of parameter bucket

        This is an experimental implementation that expects
        store_params=False and store_param_remainders=True. The
        optimizer dtype must be FP32 and the params must all be BF16
        and GPU.

        Arguments:
            bucket_ids (list): bucket indices

        """

        # Find param fragments for each bucket
        buffers = collections.defaultdict(list)  # p_in, p_rem, m, v, g, p_out
        for bucket_id in bucket_ids:
            state_bucket = self.state["buckets"][bucket_id]
            grads_bucket = self._grads_buckets[bucket_id]
            params_bucket = self._params_buckets[bucket_id]

            # State buffers for local shard
            fragments = state_bucket.fragments
            param_remainders_shard = state_bucket.param_remainders_shard
            exp_avg = state_bucket.exp_avg_shard
            exp_avg_sq = state_bucket.exp_avg_sq_shard
            grads = grads_bucket.grads_shard
            params_out = params_bucket.params_shard

            # Find param fragments in local shard
            for fragment in fragments:
                if not fragment.in_local_shard:
                    continue
                shard_start, shard_end = fragment.shard_range
                if shard_end <= shard_start:
                    continue
                shard_range = slice(shard_start, shard_end)
                buffers_key = (
                    fragment.param_group_id,
                    state_bucket.grad_sync_dtype,
                )
                param = self.parameter(fragment)
                param_range = slice(*fragment.shard_param_range)
                param_fragment = param.detach().view(-1)[param_range]
                param_fragment = param_fragment.to(dtype=torch.bfloat16, device=self.device)
                buffers[buffers_key].append(
                    [
                        param_fragment,
                        param_remainders_shard[shard_range],
                        exp_avg[shard_range],
                        exp_avg_sq[shard_range],
                        grads[shard_range],
                        params_out[shard_range],
                    ]
                )

        # Apply optimizer step to each param group
        for (group_id, _), group_buffers in buffers.items():
            group = self.param_groups[group_id]
            beta1, beta2 = group["betas"]
            multi_tensor_applier(
                distributed_adam_cuda.multi_tensor_fused_adam_with_param_remainders,
                self._dummy_overflow_buf,
                list(zip(*group_buffers)),
                self._grad_scale,
                group["lr"],
                beta1,
                beta2,
                group["eps"],
                self.state["step"],
                1 if self.adam_w_mode else 0,
                1 if group["bias_correction"] else 0,
                group["weight_decay"],
            )

        # Make sure param sync buffer has correct dtype
        self._check_params_shard_dtypes(
            {bucket_id: self._params_buckets[bucket_id] for bucket_id in bucket_ids}
        )

    @torch.no_grad()
    def _local_step_with_scaled_states(
        self,
        bucket_ids: List[int],
    ) -> None:
        for bucket_id in bucket_ids:
            state_bucket = self.state["buckets"][bucket_id]
            grads_bucket = self._grads_buckets[bucket_id]
            params_bucket = self._params_buckets[bucket_id]
            params_bucket.params_shard = torch.empty_like(
                state_bucket.params_shard,
                dtype=torch.float32,
            )

            # Find param fragments in local shard
            group_buffers = collections.defaultdict(list)  # p_in, m, v, g, p_out
            scaled_buffers = []
            unscaled_buffers = []
            buffer_scales = []
            for fragment in state_bucket.fragments:
                if not fragment.in_local_shard:
                    continue
                shard_start, shard_end = fragment.shard_range
                if shard_end <= shard_start:
                    continue
                shard_range = slice(shard_start, shard_end)
                param_group_id = fragment.param_group_id
                param_id = fragment.param_id
                scaled_param = state_bucket.params_shard[shard_range]
                scaled_exp_avg = state_bucket.exp_avg_shard[shard_range]
                scaled_exp_avg_sq = state_bucket.exp_avg_sq_shard[shard_range]
                grads = grads_bucket.grads_shard[shard_range]
                param = params_bucket.params_shard[shard_range]
                exp_avg = torch.empty_like(scaled_exp_avg, dtype=torch.float32)
                exp_avg_sq = torch.empty_like(scaled_exp_avg_sq, dtype=torch.float32)
                scales = self._state_scales[(param_group_id, param_id, bucket_id)]
                group_buffers[param_group_id].append((param, exp_avg, exp_avg_sq, grads, param))
                scaled_buffers.extend((scaled_param, scaled_exp_avg, scaled_exp_avg_sq))
                unscaled_buffers.extend((param, exp_avg, exp_avg_sq))
                buffer_scales.extend((scales["param"], scales["exp_avg"], scales["exp_avg_sq"]))

            # Unscale optimizer state
            _multi_tensor_copy(
                scaled_buffers,
                unscaled_buffers,
                dummy_overflow_buf=self._dummy_overflow_buf,
            )
            for buf, scale in zip(unscaled_buffers, buffer_scales):
                buf.mul_(scale)

            # Apply optimizer step to each param group
            for group_id, buffers in group_buffers.items():
                group = self.param_groups[group_id]
                beta1, beta2 = group["betas"]
                multi_tensor_applier(
                    distributed_adam_cuda.multi_tensor_fused_adam,
                    self._dummy_overflow_buf,
                    list(zip(*buffers)),
                    self._grad_scale,
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    self.state["step"],
                    1 if self.adam_w_mode else 0,
                    1 if group["bias_correction"] else 0,
                    group["weight_decay"],
                )
            del group_buffers

            # Make sure param sync buffer has correct dtype
            self._check_params_shard_dtypes({bucket_id: params_bucket})

            # Scale optimizer state
            for buf, scale in zip(unscaled_buffers, buffer_scales):
                self._apply_state_scale(buf, scale)
            _multi_tensor_copy(
                unscaled_buffers,
                scaled_buffers,
                dummy_overflow_buf=self._dummy_overflow_buf,
            )
            del scaled_buffers, unscaled_buffers, buffer_scales

    @torch.no_grad()
    def _check_params_shard_dtypes(
        self,
        params_buckets: Dict[int, ParameterBucket],
    ) -> None:
        """Make sure local shards of parameters are in expected datatypes

        The Adam kernel only supports floating-point datatypes. If we
        want to perform parameter synchronization with
        non-floating-point dtypes, we need to allocate temporary
        buffers that can accommodate the Adam kernel. This function is
        responsible for converting these temporary buffers to the
        parameter synchronization datatype.

        """

        # Find param shards that require dtype conversion
        buffers_in = []
        buffers_out = []
        for bucket_id, param_bucket in params_buckets.items():
            # Check if param shard is already in expected dtype
            state_bucket = self.state["buckets"][bucket_id]
            param_sync_dtype = state_bucket.param_sync_dtype
            if param_bucket.params_shard.dtype == param_sync_dtype:
                continue

            # Allocate buffer with required dtype
            buffer_in = param_bucket.params_shard
            buffer_out = torch.empty_like(
                param_bucket.params_shard,
                dtype=param_sync_dtype,
            )
            param_bucket.params_shard = buffer_out

            if torch.is_floating_point(buffer_in) and torch.is_floating_point(buffer_out):
                # Cast between floating-point dtypes
                buffers_in.append(buffer_in)
                buffers_out.append(buffer_out)
            else:
                # Copy most significant bytes for non-floating-point
                # dtypes
                # Note: Assume dtypes are little-endian
                in_bytes = buffer_in.unsqueeze(-1).view(torch.uint8)
                out_bytes = buffer_out.unsqueeze(-1).view(torch.uint8)
                copy_size = min(in_bytes.size(-1), out_bytes.size(-1))
                buffers_in.append(in_bytes[..., -copy_size:])
                buffers_out.append(out_bytes[..., -copy_size:])
                if copy_size < out_bytes.size(-1):
                    out_bytes[..., :-copy_size].zero_()

        # Perform dtype conversions
        _multi_tensor_copy(
            buffers_in,
            buffers_out,
            dummy_overflow_buf=self._dummy_overflow_buf,
        )

    @torch.no_grad()
    def _apply_state_scale(
        self,
        tensor: torch.Tensor,
        scale: torch.Tensor,
    ) -> None:
        """Compute and apply scaling factor for scaled optimizer state

        The scaling factor is chosen to maximize the dynamic range
        while avoiding numerical overflows. The returned tensors are
        the scale (used to unscale the optimizer state) and the
        scale-reciprocal (used to generate the scaled optimizer
        state). The input tensors are updated in-place.

        """
        if not hasattr(self, "_max_scaled_state"):
            self._max_scaled_state = torch.full(
                [1],
                torch.finfo(self.dtype).max / 2,
                dtype=torch.float32,
                device=self.device,
            )
        min_val, max_val = torch.aminmax(tensor)
        absmax = torch.maximum(-min_val, max_val)
        absmax = absmax.to(dtype=torch.float32, device=self.device)
        torch.div(absmax, self._max_scaled_state, out=scale)
        rscale = torch.where(scale > 0, scale.reciprocal(), 0.0)
        tensor.mul_(rscale)

    def state_dict(
        self,
        *,
        state_dict_format: Optional[int] = None,
        gather_on_root: Optional[bool] = None,
    ) -> Optional[dict]:
        """Get dictionary containing optimizer state

        All ranks in the process group must call this function since
        it performs communication. The same optimizer state is
        returned on all ranks.

        Arguments:
            state_dict_format (int, optional): Tag for custom or
                deprecated state dict format.
            gather_on_root (bool, optional): Option for deprecated v1
                format.

        """

        # Default state dict format
        if state_dict_format is None:
            state_dict_format = 2

        # Construct state dict
        state_dict = None
        if state_dict_format == 1:
            # Deprecated v1 format
            kwargs = {}
            if gather_on_root is not None:
                kwargs["gather_on_root"] = gather_on_root
            state_dict = self._state_dict_v1(**kwargs)
        elif state_dict_format == 2:
            # Default v2 format
            state_dict = self._state_dict_v2()
        else:
            # Unrecognized format
            raise ValueError(f"Unrecognized state dict format ({state_dict_format})")

        # Add format tag to state dict
        if state_dict is not None:
            state_dict["format"] = state_dict_format

        return state_dict

    def _state_dict_v1(self, gather_on_root: bool = True) -> Optional[dict]:
        """Get dictionary containing optimizer state (deprecated v1 format)

        Default behavior is to perform communication so that the
        entire optimizer state is returned on the root rank in the
        process group. In this case, all ranks in the process group
        must enter this function and no value is returned on non-root
        ranks.

        Arguments:
            gather_on_root (bool, optional): Gather state from all
                ranks on the root rank (default: True)

        """
        warnings.warn(
            "Making optimizer state dictionary in deprecated v1 format. "
            "Future support is not guaranteed."
        )
        if self.with_scaled_states:
            raise NotImplementedError("Deprecated v1 format does not support scaled state")

        state_dict = super().state_dict()
        if not gather_on_root:
            return state_dict

        # Finish any asynchronous communication
        self.grad_sync()
        self.param_sync()

        # Export local state to byte string
        state_bytes = io.BytesIO()
        torch.save(state_dict, state_bytes)
        state_bytes.seek(0)
        state_bytes_view = state_bytes.getbuffer()

        # Get data sizes on all ranks
        local_state_size = len(state_bytes_view)
        state_sizes = [None] * self.distributed_size
        torch.distributed.all_gather_object(
            state_sizes,
            local_state_size,
            group=self.process_group,
        )
        max_state_size = max(state_sizes)

        # Construct workspace buffers
        chunk_size = self.default_shard_size * torch.finfo(self.grad_sync_dtype).bits // 8
        if self.distributed_rank == 0:
            gathered_state_bytes = [
                torch.empty([size], dtype=torch.uint8, device="cpu") for size in state_sizes
            ]
            gathered_state_bytes[0].copy_(torch.frombuffer(state_bytes_view, dtype=torch.uint8))
            gathered_chunks_buffers = [
                torch.empty(
                    [chunk_size * self.distributed_size],
                    dtype=torch.uint8,
                    device=self.device,
                )
                for _ in range(self.pipeline_size)
            ]
        else:
            chunk_buffers = [
                torch.empty(
                    [chunk_size],
                    dtype=torch.uint8,
                    device=self.device,
                )
                for _ in range(self.pipeline_size)
            ]

        # Split data into chunks and gather on root rank
        # Note: Assuming we are using the NCCL backend, communication
        # must happen on the GPU. We split the data into fixed-size
        # chunks to limit GPU memory usage.
        main_stream = torch.cuda.current_stream()
        for stream in self._pipeline_streams:
            stream.wait_stream(main_stream)
        for stream_id, offset in enumerate(range(0, max_state_size, chunk_size)):
            stream_id %= self.pipeline_size
            stream = self._pipeline_streams[stream_id]
            with torch.cuda.stream(stream):
                # Buffers for chunk
                if self.distributed_rank == 0:
                    gathered_chunks = [
                        gathered_chunks_buffers[stream_id][i * chunk_size : (i + 1) * chunk_size]
                        for i in range(self.distributed_size)
                    ]
                else:
                    chunk = chunk_buffers[stream_id]

                # Copy to GPU
                if self.distributed_rank != 0 and offset < local_state_size:
                    local_chunk_size = min(chunk_size, local_state_size - offset)
                    chunk[:local_chunk_size].copy_(
                        torch.frombuffer(
                            state_bytes_view,
                            dtype=torch.uint8,
                            count=local_chunk_size,
                            offset=offset,
                        ),
                        non_blocking=True,
                    )

                # Gather on root
                # Note: Call in main stream to avoid memory pool
                # overheads from internal memory allocations in
                # gather.
                main_stream.wait_stream(stream)
                with torch.cuda.stream(main_stream):
                    if self.distributed_rank == 0:
                        if self._gather_no_copy:
                            no_copy_kwarg = {"no_copy": True}
                        else:
                            no_copy_kwarg = {}
                        torch.distributed.gather(
                            gathered_chunks[0],
                            gathered_chunks,
                            dst=self.process_group_root,
                            group=self.process_group,
                            **no_copy_kwarg,
                        )
                    else:
                        torch.distributed.gather(
                            chunk,
                            dst=self.process_group_root,
                            group=self.process_group,
                        )
                stream.wait_stream(main_stream)

                # Copy back to CPU
                if self.distributed_rank == 0:
                    for rank in range(1, self.distributed_size):
                        rank_chunk_start = offset
                        rank_chunk_end = min(offset + chunk_size, state_sizes[rank])
                        rank_chunk_size = rank_chunk_end - rank_chunk_start
                        if rank_chunk_size > 0:
                            src = gathered_chunks[rank][:rank_chunk_size]
                            dst = gathered_state_bytes[rank][rank_chunk_start:rank_chunk_end]
                            dst.copy_(src, non_blocking=True)

        # Synchronize GPU
        for stream in self._pipeline_streams:
            main_stream.wait_stream(stream)
        main_stream.synchronize()

        # Return gathered state data on root rank
        if self.distributed_rank == 0:
            return {"gathered_states": gathered_state_bytes}
        else:
            return None

    @torch.no_grad()
    def _state_dict_v2(self) -> Optional[dict]:
        """Get dictionary containing optimizer state (default v2 format)

        All ranks in the process group must call this function since
        it performs communication. The same optimizer state is
        returned on all ranks.

        """

        # Make sure params are initialized
        self.init_params()

        # Finish any asynchronous communication
        self.grad_sync()
        self.param_sync()

        # Output tensor format
        dtype = torch.float32 if self.with_scaled_states else self.dtype
        device = torch.device("cpu")

        # Get state dict from base class
        state_dict = super().state_dict()
        state_dict["state"] = {"step": state_dict["state"]["step"]}

        # Initialize state dict with CPU buffers
        for param in self.parameters():
            # Get param index in state dict
            fragment = self.state[param]["fragments"][0]
            param_group_id = fragment.param_group_id
            param_id = fragment.param_id
            index = state_dict["param_groups"][param_group_id]["params"][param_id]

            # Construct CPU buffers with optimizer state
            state_dict["state"][index] = dict(
                param=torch.zeros_like(param, dtype=dtype, device=device),
                exp_avg=torch.zeros_like(param, dtype=dtype, device=device),
                exp_avg_sq=torch.zeros_like(param, dtype=dtype, device=device),
            )

        # Workspace buffers for gathering shards on root rank
        num_buckets = len(self.state["buckets"])
        max_bucket_size = max(bucket.bucket_size for bucket in self.state["buckets"])
        bucket_buffers = [
            torch.empty(
                [max_bucket_size],
                dtype=dtype,
                device=self.device,
            )
            for _ in range(self.pipeline_size)
        ]
        if self.store_param_remainders:
            max_shard_size = max(bucket.shard_size for bucket in self.state["buckets"])
            shard_bf16_buffers = [
                torch.empty([max_shard_size], dtype=torch.bfloat16, device=self.device)
                for _ in range(self.pipeline_size)
            ]

        # Synchronize streams
        main_stream = torch.cuda.current_stream()
        for stream in self._pipeline_streams:
            stream.wait_stream(main_stream)

        def get_workspace_shard(bucket_id: int) -> torch.Tensor:
            """Workspace buffer for local shard"""
            bucket = self.state["buckets"][bucket_id]
            shard_size = bucket.shard_size
            stream_id = bucket_id % self.pipeline_size
            shard_range = slice(
                shard_size * self.distributed_rank,
                shard_size * (self.distributed_rank + 1),
            )
            return bucket_buffers[stream_id][shard_range]

        def unscale_shard(
            bucket_id: int,
            shard: torch.Tensor,
            state_key: str,
        ) -> torch.Tensor:
            """Unscale local shard if needed

            If state buffers are scaled, then the shard is unscaled
            and output to a workspace buffer. Otherwise, the shard is
            immediately returned.

            """
            if not self.with_scaled_states:
                return shard
            out = get_workspace_shard(bucket_id)
            bucket = self.state["buckets"][bucket_id]
            stream_id = bucket_id % self.pipeline_size
            stream = self._pipeline_streams[stream_id]
            with torch.cuda.stream(stream):
                for fragment in bucket.fragments:
                    if not fragment.in_local_shard:
                        continue
                    param_group_id = fragment.param_group_id
                    param_id = fragment.param_id
                    shard_range = slice(*fragment.shard_range)
                    scale = self._state_scales[(param_group_id, param_id, bucket_id)][state_key]
                    out[shard_range].copy_(shard[shard_range]).mul_(scale)
            return out

        def pack_param_shard(bucket_id: int) -> torch.Tensor:
            """Pack local shard of param values into contiguous buffer"""

            # Stream objects
            stream_id = bucket_id % self.pipeline_size
            stream = self._pipeline_streams[stream_id]

            # Bucket objects
            bucket = self.state["buckets"][bucket_id]
            shard_size = bucket.shard_size

            # Case 1: Param state is already packed
            if bucket.params_shard is not None:
                return unscale_shard(bucket_id, bucket.params_shard, "param")

            # Case 2: Pack BF16 model params with 16-bit remainders
            if bucket.param_remainders_shard is not None:
                with torch.cuda.stream(stream):
                    # Pack bf16 param values
                    shard_bf16 = shard_bf16_buffers[stream_id][:shard_size]
                    buffers_in = []
                    buffers_out = []
                    for fragment in bucket.fragments:
                        if not fragment.in_local_shard:
                            continue
                        param_range = slice(*fragment.shard_param_range)
                        shard_range = slice(*fragment.shard_range)
                        param = self.parameter(fragment)
                        buffers_in.append(param.view(-1)[param_range])
                        buffers_out.append(shard_bf16[shard_range])
                    _multi_tensor_copy(
                        buffers_in,
                        buffers_out,
                        dummy_overflow_buf=self._dummy_overflow_buf,
                    )

                    # Reconstruct fp32 from bf16 and remainders
                    shard_fp32 = get_workspace_shard(bucket_id)
                    _bf16_rem_to_fp32(
                        shard_bf16,
                        bucket.param_remainders_shard,
                        shard_fp32,
                    )
                    return shard_fp32

            # Case 3: Pack model params
            with torch.cuda.stream(stream):
                shard = get_workspace_shard(bucket_id)
                buffers_in = []
                buffers_out = []
                for fragment in bucket.fragments:
                    if not fragment.in_local_shard:
                        continue
                    param_range = slice(*fragment.shard_param_range)
                    shard_range = slice(*fragment.shard_range)
                    param = self.parameter(fragment)
                    buffers_in.append(param.view(-1)[param_range])
                    buffers_out.append(shard[shard_range])
                _multi_tensor_copy(
                    buffers_in,
                    buffers_out,
                    dummy_overflow_buf=self._dummy_overflow_buf,
                )
                return shard

        def start_all_gather(bucket_id: int, shard: torch.Tensor) -> None:
            """Launch all-gather on bucket shards

            Communication is done on main stream to ensure consistent
            ordering.

            """

            # Stream objects
            stream_id = bucket_id % self.pipeline_size
            stream = self._pipeline_streams[stream_id]

            # Workspace buffer
            bucket = self.state["buckets"][bucket_id]
            bucket_size = bucket.bucket_size
            bucket_buffer = bucket_buffers[stream_id][:bucket_size]

            # All-gather shards
            main_stream.wait_stream(stream)
            all_gather_into_tensor(
                bucket_buffer,
                shard,
                group=self.distributed_process_group,
            )
            stream.wait_stream(main_stream)

        def finish_all_gather(bucket_id: int, state_dict_key: str) -> None:
            """Finish all-gather on bucket shards

            Data is copied into state dict CPU buffers.

            Splitting the NCCL all-gather and the CPU memcpys into
            separate stages helps achieve good overlap when kernel
            launches are serialized with
            CUDA_DEVICE_MAX_CONNECTIONS=1. In particular, the pipeline
            calls start_all_gather(bucket_id+1) before
            finish_all_gather(bucket_id).

            """

            # Stream objects
            stream_id = bucket_id % self.pipeline_size
            stream = self._pipeline_streams[stream_id]

            # Bucket objects
            bucket = self.state["buckets"][bucket_id]
            bucket_size = bucket.bucket_size
            bucket_buffer = bucket_buffers[stream_id][:bucket_size]

            # Update state dict
            with torch.cuda.stream(stream):
                for fragment in bucket.fragments:
                    param_range = slice(*fragment.param_range)
                    bucket_range = slice(*fragment.bucket_range)
                    param_group_id = fragment.param_group_id
                    param_id = fragment.param_id
                    index = state_dict["param_groups"][param_group_id]["params"][param_id]
                    state_buffer = state_dict["state"][index][state_dict_key]
                    state_fragment = state_buffer.view(-1)[param_range]
                    bucket_fragment = bucket_buffer[bucket_range]
                    state_fragment.copy_(bucket_fragment, non_blocking=True)

        # All-gather param state
        for bucket_id in range(num_buckets):
            shard = pack_param_shard(bucket_id)
            start_all_gather(bucket_id, shard)
            if bucket_id > 0:
                finish_all_gather(bucket_id - 1, "param")
            if bucket_id == num_buckets - 1:
                finish_all_gather(bucket_id, "param")

        # All-gather exp_avg state
        for bucket_id in range(num_buckets):
            shard = unscale_shard(
                bucket_id,
                self.state["buckets"][bucket_id].exp_avg_shard,
                "exp_avg",
            )
            start_all_gather(bucket_id, shard)
            if bucket_id > 0:
                finish_all_gather(bucket_id - 1, "exp_avg")
            if bucket_id == num_buckets - 1:
                finish_all_gather(bucket_id, "exp_avg")

        # All-gather exp_avg_sq state
        for bucket_id in range(num_buckets):
            shard = unscale_shard(
                bucket_id,
                self.state["buckets"][bucket_id].exp_avg_sq_shard,
                "exp_avg_sq",
            )
            start_all_gather(bucket_id, shard)
            if bucket_id > 0:
                finish_all_gather(bucket_id - 1, "exp_avg_sq")
            if bucket_id == num_buckets - 1:
                finish_all_gather(bucket_id, "exp_avg_sq")

        # Synchronize GPU and return
        for stream in self._pipeline_streams:
            main_stream.wait_stream(stream)
        main_stream.synchronize()
        return state_dict

    def load_state_dict(self, state_dict: dict) -> None:
        """Load optimizer state"""

        # Figure out state dict format
        state_dict_format = state_dict.pop("format", None)
        if state_dict_format is None:
            if "buckets" in state_dict or "gathered_states" in state_dict:
                state_dict_format = 1
            else:
                state_dict_format = 2

        # Load state dict
        if state_dict_format == 1:
            # Deprecated v1 format
            self._load_state_dict_v1(state_dict)
        elif state_dict_format == 2:
            # Default v2 format
            self._load_state_dict_v2(state_dict)
        else:
            # Unrecognized format
            raise ValueError(f"Unrecognized state dict format ({state_dict_format})")

    def _load_state_dict_v1(self, state_dict: dict) -> None:
        """Load optimizer state (deprecated v1 format)

        Parallel configuration (e.g. process group sizes) and
        optimizer options must match between saving and loading the
        optimizer state.

        """
        warnings.warn(
            "Loading checkpoint in deprecated v1 format. Future support is not guaranteed."
        )
        if self.with_scaled_states:
            raise NotImplementedError("Deprecated v1 format does not support scaled state")

        # Get state dict for current rank
        if "gathered_states" in state_dict:
            # Deallocate distributed optimizer state to reduce GPU
            # memory usage
            if "buckets" in self.state:
                del self.state["buckets"]

            # Get state for current rank and parse byte string
            state_bytes = state_dict["gathered_states"][self.distributed_rank]
            state_bytes = io.BytesIO(state_bytes.numpy())
            state_dict = torch.load(state_bytes)

        # Load state dict
        super().load_state_dict(state_dict)

        # Handle old state dicts without per-bucket dtypes
        for bucket in self.state["buckets"]:
            if getattr(bucket, "dtype", None) is None:
                bucket.dtype = self.dtype
            if getattr(bucket, "grad_sync_dtype", None) is None:
                bucket.grad_sync_dtype = self.grad_sync_dtype
            if getattr(bucket, "param_sync_dtype", None) is None:
                bucket.param_sync_dtype = self.param_sync_dtype

            if bucket.params_shard is not None:
                bucket.params_shard = bucket.params_shard.to(self.device)
            if bucket.param_remainders_shard is not None:
                bucket.param_remainders_shard = bucket.param_remainders_shard.to(self.device)
            bucket.exp_avg_shard = bucket.exp_avg_shard.to(self.device)
            bucket.exp_avg_sq_shard = bucket.exp_avg_sq_shard.to(self.device)

    @torch.no_grad()
    def _load_state_dict_v2(self, state_dict: dict) -> None:
        """Load optimizer state (default v2 format)

        The parallel configuration and optimizer options are allowed
        to differ between saving and loading the model.

        """

        # Make sure params are initialized
        self.init_params()

        # Finish any asynchronous communication
        self.grad_sync()
        self.param_sync()

        # Load general state
        # Note: State includes bucketing scheme (e.g.
        # self.state["buckets"] and self.state[param]["fragments"]).
        # This was needed for v1 checkpoints, but not for v2. As a
        # kludge, we temporarily set state to dummy dict to avoid
        # messing up the bucketing scheme.
        state = self.state
        self.state = {}
        super().load_state_dict(
            {
                "state": {},
                "param_groups": state_dict["param_groups"],
            }
        )
        self.state = state
        self.state["step"] = state_dict["state"]["step"]

        # Load state for each param
        for param in self.parameters():
            # Get param index in state dict
            fragment = self.state[param]["fragments"][0]
            param_id = fragment.param_id
            param_group_id = fragment.param_group_id
            index = state_dict["param_groups"][param_group_id]["params"][param_id]

            # Buffers in state dict
            param_state = state_dict["state"][index]["param"].view(-1)
            exp_avg = state_dict["state"][index]["exp_avg"].view(-1)
            exp_avg_sq = state_dict["state"][index]["exp_avg_sq"].view(-1)

            # Copy to local shard of state buckets
            for fragment in self.state[param]["fragments"]:
                if not fragment.in_local_shard:
                    continue
                bucket_id = fragment.bucket_id
                bucket = self.state["buckets"][bucket_id]
                param_range = slice(*fragment.shard_param_range)
                shard_range = slice(*fragment.shard_range)
                if self.with_scaled_states:
                    scales = self._state_scales[(param_group_id, param_id, bucket_id)]
                    temp = torch.empty_like(
                        param_state[param_range],
                        dtype=torch.float32,
                        device=self.device,
                    )
                    temp.copy_(param_state[param_range], non_blocking=True)
                    self._apply_state_scale(temp, scales["param"])
                    bucket.params_shard[shard_range].copy_(temp)
                    temp.copy_(exp_avg[param_range], non_blocking=True)
                    self._apply_state_scale(temp, scales["exp_avg"])
                    bucket.exp_avg_shard[shard_range].copy_(temp)
                    temp.copy_(exp_avg_sq[param_range], non_blocking=True)
                    self._apply_state_scale(temp, scales["exp_avg_sq"])
                    bucket.exp_avg_sq_shard[shard_range].copy_(temp)
                else:
                    if bucket.params_shard is not None:
                        bucket.params_shard[shard_range].copy_(
                            param_state[param_range],
                            non_blocking=True,
                        )
                    if bucket.param_remainders_shard is not None:
                        param_state_int16 = param_state.unsqueeze(-1).view(torch.int16)
                        bucket.param_remainders_shard[shard_range].copy_(
                            param_state_int16[param_range, 0],
                            non_blocking=True,
                        )
                    bucket.exp_avg_shard[shard_range].copy_(
                        exp_avg[param_range],
                        non_blocking=True,
                    )
                    bucket.exp_avg_sq_shard[shard_range].copy_(
                        exp_avg_sq[param_range],
                        non_blocking=True,
                    )

        # Synchronize GPU
        torch.cuda.current_stream().synchronize()


================================================
FILE: apex/contrib/optimizers/distributed_fused_lamb.py
================================================
import os
import inspect
import torch
import importlib
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

import torch.distributed.distributed_c10d as c10d

# Fallback to private fields if using older PyTorch version
try:
    import torch.distributed.distributed_c10d.get_process_group_ranks
except ImportError:

    def get_process_group_ranks(group):
        return list(c10d._pg_group_ranks[group].keys())


_make_nccl_premul_sum = getattr(torch.distributed, "_make_nccl_premul_sum", None)
# Ref: https://github.com/pytorch/pytorch/pull/81272
if _make_nccl_premul_sum is None:
    if hasattr(torch.distributed, "make_nccl_premul_sum"):
        _make_nccl_premul_sum = torch.distributed.make_nccl_premul_sum


class DistributedFusedLAMB(torch.optim.Optimizer):
    """Implements LAMB algorithm.

    Currently GPU-only.  Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

    This version of fused LAMB implements 2 fusions.

      * Fusion of the LAMB update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.optimizers.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer::

        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedLAMB` may be used with or without Amp.  If you wish to use :class:`FusedLAMB` with Amp,
    you may choose any ``opt_level``::

        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()

    In general, ``opt_level="O1"`` is recommended.

    LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            NOT SUPPORTED now! (default: False)
        adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay(also known as AdamW) (default: True)
        grad_averaging (bool, optional): whether apply (1-beta2) to grad when
            calculating running averages of gradient. (default: True)
        set_grad_none (bool, optional): whether set grad to None when zero_grad()
            method is called. (default: True)
        max_grad_norm (float, optional): value used to clip global grad norm
            (default: 1.0)
        use_nvlamb (boolean, optional): Apply adaptive learning rate to 0.0
            weight decay parameter (default: False)
        step_supports_amp_scaling(boolean, optional): whether to use customized
            gradient unscaling logic (default: True)

    .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    class AtomicCounter(object):
        def __init__(self):
            self.value = 0
            self.order = []
            import threading

            self._lock = threading.Lock()

        def add(self, idx):
            with self._lock:
                self.value += 1
                self.order.append(idx)

    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        grad_averaging=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0.0,
        max_grad_norm=0.0,
        adam_w_mode=True,
        use_nvlamb=False,
        step_supports_amp_scaling=True,
        overlap_reductions=True,
        dwu_group_size=0,
        dwu_num_blocks=4,
        dwu_num_chunks=4,
        dwu_num_rs_pg=1,
        dwu_num_ar_pg=4,
        dwu_num_ag_pg=0,
        fused_norm=False,
        e5m2_allgather=False,
        verbose=False,
        clip_after_ar=True,
        full_ar=False,
        set_param_views_to_flat_buffer=False,
        skip_allgather=False,
        fuse_scale=False,
        param_order=None,
        nccl_allgather_channels=0,
    ):
        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            grad_averaging=grad_averaging,
            max_grad_norm=max_grad_norm,
        )

        super(DistributedFusedLAMB, self).__init__(params, defaults)

        global fused_adam_cuda, distributed_lamb_cuda
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
        distributed_lamb_cuda = importlib.import_module("distributed_lamb_cuda")

        self._overflow_buf = torch.cuda.IntTensor([0])
        self._has_overflow = False
        self.multi_tensor_lamb_compute_update_term = (
            distributed_lamb_cuda.multi_tensor_lamb_compute_update_term
        )
        self.multi_tensor_lamb_update_weights = (
            distributed_lamb_cuda.multi_tensor_lamb_update_weights
        )
        import amp_C

        self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm

        self._grad_averaging = grad_averaging
        self._adam_w_mode = 1 if adam_w_mode else 0
        self._use_nvlamb = use_nvlamb
        self._step_supports_amp_scaling = step_supports_amp_scaling
        self._is_accumulation_step = False
        self._last_step = False
        self._overlap_reductions = overlap_reductions
        self._global_scale = None
        self._num_blocks = dwu_num_blocks
        self._num_chunks = dwu_num_chunks
        self._e5m2_allgather = e5m2_allgather
        self._verbose = verbose
        self._clip_after_ar = clip_after_ar
        self._full_ar = full_ar
        self._fuse_scale = fuse_scale
        self._L2_grad_norm = None
        self._set_flat_param_view = set_param_views_to_flat_buffer
        self._skip_ag = skip_allgather
        self._fused_norm = fused_norm if not clip_after_ar else False
        self._current_process_group = c10d._get_default_group()
        self._available_ranks = get_process_group_ranks(self._current_process_group)
        self._group_size = torch.cuda.device_count() if dwu_group_size <= 0 else dwu_group_size
        self._world_size = torch.distributed.get_world_size()
        self._num_groups = self._world_size // self._group_size
        self._rank_in_group = torch.distributed.get_rank() % self._group_size

        self._lr = torch.tensor(0.0, dtype=torch.float32, device="cuda")

        self._resume_from_checkpoint = False
        self._step = torch.cuda.IntTensor([0])

        # Master weight, moment, gradient buffers
        self._fp32_p, self._fp32_m, self._fp32_v, self._fp16_p, self._fp16_g = (
            None,
            None,
            None,
            None,
            None,
        )

        # Check if collectives have no_copy option
        self._reduce_scatter_no_copy = (
            "no_copy" in inspect.getfullargspec(torch.distributed.reduce_scatter).args
        )
        self._all_gather_no_copy = (
            "no_copy" in inspect.getfullargspec(torch.distributed.all_gather).args
        )

        if "reduce_scatter_tensor" not in dir(torch.distributed):
            torch.distributed.reduce_scatter_tensor = torch.distributed._reduce_scatter_base
        if "all_gather_into_tensor" not in dir(torch.distributed):
            torch.distributed.all_gather_into_tensor = torch.distributed._all_gather_base

        self._num_rs_pg = dwu_num_rs_pg
        self._num_ar_pg = dwu_num_ar_pg
        self._num_ag_pg = dwu_num_ag_pg

        if self._full_ar:  # full all reduce, only need AR and AG groups
            # l2_grad_norm may be reduced within a node to limit from memory reads
            for group_i in range(self._num_groups):
                ranks = [group_i * self._group_size + j for j in range(self._group_size)]
                l2_grad_norm_pg = torch.distributed.new_group(ranks=ranks)
                if torch.distributed.get_rank() in ranks:
                    self._l2_grad_norm_pg = l2_grad_norm_pg

            self._ar_pg = []
            # consider all the ranks
            ranks = list(range(0, self._world_size))
            for i in range(self._num_ar_pg):
                if self._verbose:
                    print(f"creating new AR group {i}: {ranks}")
                grp = torch.distributed.new_group(ranks=ranks)
                if grp != torch.distributed.GroupMember.NON_GROUP_MEMBER:
                    if self._verbose:
                        print(f"group {i}: init barrier (device: {torch.cuda.current_device()})")
                    torch.distributed.barrier(group=grp, device_ids=[torch.cuda.current_device()])
                if self._verbose:
                    print(f"created new AR group {i}: {ranks}")

                if torch.distributed.get_rank() in ranks:
                    self._ar_pg.append(grp)
            self._ar_st = [torch.cuda.Stream() for _ in range(self._num_ar_pg)]
            if nccl_allgather_channels > 0:
                os.putenv("NCCL_MAX_NCHANNELS", str(nccl_allgather_channels))
            if self._num_ag_pg == 0:
                self._ag_pg = self._ar_pg
                self._ag_st = self._ar_st
                self._num_ag_pg = self._num_ar_pg
            else:
                self._ag_pg = []
                ranks = []
                stride = torch.cuda.device_count()
                for i in range(self._num_groups):
                    rs = list(range(i * stride, (i + 1) * stride))
                    ranks.append(rs)
                for rs in ranks:
                    for i in range(self._num_ag_pg):
                        grp = torch.distributed.new_group(ranks=rs)
                        if torch.distributed.get_rank() in rs:
                            if self._verbose:
                                print(f"creating AG group {i}: {rs}")
                            self._ag_pg.append(grp)

                self._ag_st = [torch.cuda.Stream() for _ in range(self._num_ag_pg)]
        else:  # reduce-scatter + all-reduce, need RS, AR, AG groups
            if self._num_groups > 1:
                self._ar_pg = []
                for dev_i in range(self._group_size):
                    ranks = [dev_i + j * self._group_size for j in range(self._num_groups)]
                    for i in range(self._num_ar_pg):
                        if self._verbose:
                            print(f"creating new AR group {i}: {ranks}")
                        grp = torch.distributed.new_group(ranks=ranks)
                        if grp != torch.distributed.GroupMember.NON_GROUP_MEMBER:
                            if self._verbose:
                                print(
                                    f"group {i}: init barrier (device: {torch.cuda.current_device()})"
                                )
                            torch.distributed.barrier(
                                group=grp, device_ids=[torch.cuda.current_device()]
                            )
                        if self._verbose:
                            print(f"created new AR group {i}: {ranks}")

                        if torch.distributed.get_rank() in ranks:
                            self._ar_pg.append(grp)
                self._ar_st = [torch.cuda.Stream() for _ in range(self._num_ar_pg)]
            rs_ranks = []
            for group_i in range(self._num_groups):
                rs_ranks.append([group_i * self._group_size + j for j in range(self._group_size)])
            self._rs_pg = []
            for group_i in range(self._num_groups):
                ranks = rs_ranks[group_i]
                for i in range(self._num_rs_pg):
                    grp = torch.distributed.new_group(ranks=ranks)
                    if torch.distributed.get_rank() in ranks:
                        self._rs_pg.append(grp)
                        if self._verbose:
                            print(f"creating RS group : {ranks}")
                l2_grad_norm_pg = torch.distributed.new_group(ranks=ranks)
                if torch.distributed.get_rank() in ranks:
                    self._l2_grad_norm_pg = l2_grad_norm_pg
            self._rs_st = [torch.cuda.Stream() for _ in range(self._num_rs_pg)]
            if self._num_ag_pg == 0:
                self._ag_pg = self._rs_pg
                self._ag_st = self._rs_st
                self._num_ag_pg = self._num_rs_pg
            else:
                self._ag_pg = []
                for group_i in range(self._num_groups):
                    ranks = rs_ranks[group_i]
                    for i in range(self._num_ag_pg):
                        grp = torch.distributed.new_group(ranks=ranks)
                        if torch.distributed.get_rank() in ranks:
                            self._ag_pg.append(grp)
                            if self._verbose:
                                print(f"creating AG group : {ranks}")
                self._ag_st = [torch.cuda.Stream() for _ in range(self._num_ag_pg)]
        for ag_pg in self._ag_pg:
            torch.distributed.barrier(group=ag_pg)

        self._l2_grad_norm_st = torch.cuda.Stream()
        self._completion_st = torch.cuda.Stream()
        self._step.record_stream(self._completion_st)

        self._reductions_works = [None] * self._num_blocks
        self._allgather_works = [None] * self._num_blocks

        self._one = torch.cuda.IntTensor([1])

        self._first_step = True
        self._lazy_init_stage1_done, self._lazy_init_stage2_done = False, False
        self._param_order = self.AtomicCounter()

        p_offset = 0
        p_i = 0
        self._model_params = []
        self._grad_accs = []
        self._group_properties = []
        for group in self.param_groups:
            prev = None
            beta1, beta2 = group["betas"]
            beta3 = 1.0 - beta1 if self._grad_averaging else 1.0
            bias_correction = 1 if group["bias_correction"] else 0
            eps = group["eps"]
            weight_decay = group["weight_decay"]
            for p in group["params"]:
                if not p.requires_grad:
                    continue
                self._model_params.append(p)
                self._group_properties.append(
                    (weight_decay, bias_correction, beta1, beta2, beta3, eps)
                )
                p_grads_size = p.numel()
                if self._set_flat_param_view:
                    if param_order:
                        # this is executed when param_order is specified by the user
                        self._param_order.add(param_order[p])
                    else:
                        self._param_order.add(p_i)
                p_offset += p_grads_size
                # Only enforce 128b alignment (64 * fp16) for non-consecutive parameters
                # RNN is one example of consecutive parameters:
                # (weight_ih, weight_hh, bias_ih, bias_hh)
                if prev is not None and (
                    prev.data_ptr() + prev.numel() * prev.element_size() != p.data_ptr()
                ):
                    p_offset = ((p_offset + 63) // 64) * 64
                prev = p
                p_i += 1
        if param_order:
            self._param_order.order = torch.argsort(torch.tensor(self._param_order.order)).tolist()
        self._grads_generated = [False] * len(self._model_params)
        self._grads_fp16, self._grads_fp32 = [], []
        if self._overlap_reductions:
            self._current_block = self._num_blocks

        self._net_total_param_size = p_offset
        self._total_param_size = p_offset
        dwu_min_page_size = 256 * self._num_blocks * self._num_chunks * self._group_size
        self._total_param_size = (
            (self._total_param_size + dwu_min_page_size - 1) // dwu_min_page_size
        ) * dwu_min_page_size
        self._new_params = torch.zeros(
            [self._total_param_size],
            dtype=torch.uint8 if self._e5m2_allgather else torch.float16,
            device="cuda",
        )

    def _lazy_init_stage1(self):
        if self._lazy_init_stage1_done:
            return

        p_i = 0
        # self._model_params = []
        # self._grad_accs = []
        # self._group_properties = []
        for group in self.param_groups:
            for p in group["params"]:
                torch.distributed.broadcast(p, 0)
                if not p.requires_grad:
                    continue

                def wrapper(param, param_i):
                    param_tmp = param.expand_as(param)
                    grad_acc = param_tmp.grad_fn.next_functions[0][0]

                    def allreduce_hook(*unused):
                        if not self._set_flat_param_view:
                            if self._first_step:
                                # first time
                                self._param_order.add(param_i)
                            else:
                                idx = self._param_order.order.index(param_i)
                                self._do_overlapped_reduction(idx, param)
                        else:
                            if not self._first_step:
                                idx = self._param_order.order.index(param_i)
                                self._do_overlapped_reduction(idx, param)

                    grad_acc.register_hook(allreduce_hook)
                    self._grad_accs.append(grad_acc)

                wrapper(p, p_i)
                p_i += 1

        self._block_size = self._total_param_size // self._num_blocks
        self._chunk_size = self._block_size // self._num_chunks
        self._shard_size = self._chunk_size // self._group_size

        self._flat_grads = torch.zeros([self._total_param_size], dtype=torch.float16, device="cuda")
        self._mega_shard_size = self._num_blocks * self._num_chunks * self._shard_size
        # initialize master weights, moments buffers if not loaded from checkpoint
        if self._fp32_p is None:
            self._fp32_p = torch.zeros([self._mega_shard_size], dtype=torch.float32, device="cuda")
            self._fp32_m = torch.zeros([self._mega_shard_size], dtype=torch.float32, device="cuda")
            self._fp32_v = torch.zeros([self._mega_shard_size], dtype=torch.float32, device="cuda")
            self._fp32_u = torch.zeros([self._mega_shard_size], dtype=torch.float32, device="cuda")
        # FIXME: Rethink fp16 label since it's either uint8 or fp16
        self._fp16_p = torch.zeros(
            [self._mega_shard_size],
            dtype=torch.uint8 if self._e5m2_allgather else torch.float16,
            device="cuda",
        )
        self._fp16_g = torch.zeros([self._mega_shard_size], dtype=torch.float16, device="cuda")

        def _flat_split(p):
            def __blockify(p):
                return [
                    p[block_id * self._block_size : (block_id + 1) * self._block_size]
                    for block_id in range(self._num_blocks)
                ]

            def __chunkify(p):
                return [
                    p[chunk_id * self._chunk_size : (chunk_id + 1) * self._chunk_size]
                    for chunk_id in range(self._num_chunks)
                ]

            def __shardify(p):
                return [
                    p[shard_id * self._shard_size : (shard_id + 1) * self._shard_size]
                    for shard_id in range(self._group_size)
                ]

            list_of_blocks = __blockify(p)
            list_of_list_of_chunks = [__chunkify(block) for block in list_of_blocks]
            list_of_list_of_list_of_shards = [
                [__shardify(chunk) for chunk in chunks] for chunks in list_of_list_of_chunks
            ]
            return (
                list_of_blocks,
                list_of_list_of_chunks,
                list_of_list_of_list_of_shards,
            )

        # note(crcrpar): the function below doesn't seem to be used at all.
        # def _flat_split_no_shards(p):
        #     def __blockify(p):
        #         return [p[block_id*self._block_size:(block_id+1)*self._block_size] for block_id in range(self._num_blocks)]
        #     def __chunkify(p):
        #         return [p[chunk_id*self._chunk_size:(chunk_id+1)*self._chunk_size] for chunk_id in range(self._num_chunks)]
        #     list_of_blocks = __blockify(self._flat_grads)
        #     list_of_list_of_chunks = [__chunkify(block) for block in list_of_blocks]
        #     return list_of_blocks, list_of_list_of_chunks

        def _full_packed_split(p):
            def __shardify(p):
                return [
                    p[mega_shard * self._mega_shard_size : (mega_shard + 1) * self._mega_shard_size]
                    for mega_shard in range(self._group_size)
                ]

            def __blockify(p):
                return [
                    p[
                        block_id * self._num_chunks * self._shard_size : (block_id + 1)
                        * self._num_chunks
                        * self._shard_size
                    ]
                    for block_id in range(self._num_blocks)
                ]

            def __chunkify(p):
                return [
                    p[chunk_id * self._shard_size : (chunk_id + 1) * self._shard_size]
                    for chunk_id in range(self._num_chunks)
                ]

            list_of_mega_shards = __shardify(p)
            list_of_list_of_mega_blocks = [
                __blockify(mega_shard) for mega_shard in list_of_mega_shards
            ]
            list_of_list_of_list_of_mega_chunks = [
                [__chunkify(mega_block) for mega_block in mega_blocks]
                for mega_blocks in list_of_list_of_mega_blocks
            ]
            return (
                list_of_mega_shards,
                list_of_list_of_mega_blocks,
                list_of_list_of_list_of_mega_chunks,
            )

        def _packed_split(p):
            def __packed_blockify(p):
                packed_block_size = self._num_chunks * self._shard_size
                return [
                    p[block_id * packed_block_size : (block_id + 1) * packed_block_size]
                    for block_id in range(self._num_blocks)
                ]

            def __packed_chunkify(p):
                # in the packed format, each chunk contains one shard, so packed_chunk_size == self._shard_size
                return [
                    p[chunk_id * self._shard_size : (chunk_id + 1) * self._shard_size]
                    for chunk_id in range(self._num_chunks)
                ]

            list_of_blocks = __packed_blockify(p)
            list_of_list_of_chunks = [__packed_chunkify(block) for block in list_of_blocks]
            return list_of_blocks, list_of_list_of_chunks

        def _split_assign(shards):
            packed_block_size = self._num_chunks * self._shard_size
            list_of_list_of_chunks = []
            for block_id in range(self._num_blocks):
                list_of_chunks = []
                for chunk_id in range(self._num_chunks):
                    # self._fp16_g[block_id*packed_block_size+chunk_id*self._shard_size:block_id*packed_block_size+(chunk_id+1)*self._shard_size] = shards[block_id][chunk_id][self._rank_in_group]
                    list_of_chunks.append(shards[block_id][chunk_id][self._rank_in_group])
                list_of_list_of_chunks.append(list_of_chunks)
            return list_of_list_of_chunks

        (
            self._new_params_mega_shards,
            self._new_params_mega_blocks,
            self._new_params_mega_chunks,
        ) = _full_packed_split(self._new_params)
        # this splitting scheme is needed when allgather needs to be split into multiple chunks in a contiguous way
        self._new_params2_blocks, self._new_params2_chunks, self._new_params2_shards = _flat_split(
            self._new_params
        )

        self._fp32_p_blocks, self._fp32_p_chunks = _packed_split(self._fp32_p)
        self._fp32_m_blocks, self._fp32_m_chunks = _packed_split(self._fp32_m)
        self._fp32_v_blocks, self._fp32_v_chunks = _packed_split(self._fp32_v)
        self._fp32_u_blocks, self._fp32_u_chunks = _packed_split(self._fp32_u)
        self._fp16_p_blocks, self._fp16_p_chunks = _packed_split(self._fp16_p)

        if self._full_ar:
            # for gradient all-reduce
            (
                self._flat_grads_blocks,
                self._flat_grads_chunks,
                self._flat_grads_shards,
            ) = _flat_split(self._flat_grads)
            # for weight update
            self._fp16_g_chunks = _split_assign(self._flat_grads_shards)
        else:
            (
                self._flat_grads_blocks,
                self._flat_grads_chunks,
                self._flat_grads_shards,
            ) = _flat_split(self._flat_grads)
            self._fp16_g_blocks, self._fp16_g_chunks = _packed_split(self._fp16_g)

        self._lazy_init_stage1_done = True

    def _lazy_init_stage2(self):
        if self._lazy_init_stage2_done:
            return
        if not self._set_flat_param_view:
            # reversing is needed for overlapping allreduce and backprop, but currently not supported for flat param view
            self._param_order.order.reverse()

            # re-order model_params, grad_accs, group_properties lists
        self._model_params = [self._model_params[i] for i in self._param_order.order]
        self._grad_accs = [self._grad_accs[i] for i in self._param_order.order]
        self._group_properties = [self._group_properties[i] for i in self._param_order.order]

        def _get_flat_view(param):
            if param.is_contiguous(memory_format=torch.channels_last):
                K, C, H, W = param.shape
                pv = param.as_strided(size=(K, H, W, C), stride=(H * W * C, W * C, C, 1))
            elif param.is_contiguous(memory_format=torch.channels_last_3d):
                K, C, D, H, W = param.shape
                pv = param.as_strided(
                    size=(K, D, H, W, C), stride=(D * H * W * C, H * W * C, W * C, C, 1)
                )
            else:
                pv = param
            return pv.view(-1)

        # re-collect grads info (size, offset) after ordering
        prev = None
        p_offset = 0
        self._grads_info = []
        self._individual_flat_grads = []
        for i, p in enumerate(self._model_params):
            p_grads_size = p.numel()
            self._grads_info.append({"param_grads_size": p_grads_size, "param_offset": p_offset})
            self._individual_flat_grads.append(
                self._flat_grads[p_offset : p_offset + p_grads_size].view_as(p)
            )
            # for the first iteration
            self._do_overlapped_reduction(i, p)
            p_offset += p_grads_size
            # Only enforce 128b alignment (64 * fp16) for non-consecutive parameters
            # RNN is one example of consecutive parameters:
            # (weight_ih, weight_hh, bias_ih, bias_hh)
            if prev is not None and (
                prev.data_ptr() + prev.numel() * prev.element_size() != p.data_ptr()
            ):
                p_offset = ((p_offset + 63) // 64) * 64
            prev = p

        self._low_param_i = [0] * self._num_blocks
        for block_id in range(self._num_blocks - 1, -1, -1):
            p_i = len(self._grads_info) - 1
            while p_i > 0 and self._grads_info[p_i]["param_offset"] > block_id * self._block_size:
                p_i -= 1
            self._low_param_i[block_id] = p_i
        # print("self._low_param_i", self._low_param_i)

        # This paragraph does two things:
        # 1) Copy model parameters into master buffer
        # 2) Create tensor lists for unpacking new parameter tensor after all-gather
        self._packed_flat_to_model_params_fp16 = []
        self._packed_flat_to_model_params_fp32 = []
        self._model_params_num = len(self._model_params)
        self._contrib_tensor_list = []
        self._contrib_min_param_i, self._contrib_max_param_i = -1, -1
        self._contrib_update_frag_for_norm = []
        self._contrib_model_param_for_norm_fp16 = []
        self._contrib_model_param_for_norm_fp32 = []
        self._contrib_model_param_for_norm_is_fp16 = []
        self._model_param_is_contrib = []
        self._contrib_group_properties = []
        for shard_id in range(self._group_size):
            for block_id in range(self._num_blocks):
                for chunk_id in range(self._num_chunks):
                    flat_shard_start = (
                        ((block_id * self._num_chunks + chunk_id) * self._group_size) + shard_id
                    ) * self._shard_size
                    flat_shard_end = flat_shard_start + self._shard_size
                    for param_i, (p, grads_info, group_props) in enumerate(
                        zip(self._model_params, self._grads_info, self._group_properties)
                    ):
                        flat_grad_start = grads_info["param_offset"]
                        flat_grad_end = flat_grad_start + grads_info["param_grads_size"]
                        clipped_start = (lambda a, b: a if a > b else b)(
                            flat_grad_start, flat_shard_start
                        )
                        clipped_end = (lambda a, b: a if a < b else b)(
                            flat_grad_end, flat_shard_end
                        )
                        if clipped_start < clipped_end:
                            grad_offset = clipped_start - flat_grad_start
                            grad_length = clipped_end - clipped_start
                            shard_offset = clipped_start - flat_shard_start
                            pf = _get_flat_view(p)
                            model_param_fragment = pf[grad_offset : grad_offset + grad_length]
                            new_param_packed_fragment = self._new_params_mega_chunks[shard_id][
                                block_id
                            ][chunk_id][shard_offset : shard_offset + grad_length]
                            if model_param_fragment.dtype == torch.float16:
                                self._packed_flat_to_model_params_fp16.append(
                                    (new_param_packed_fragment, model_param_fragment)
                                )
                            else:
                                self._packed_flat_to_model_params_fp32.append(
                                    (new_param_packed_fragment, model_param_fragment)
                                )
                            if shard_id == self._rank_in_group:
                                self._model_param_is_contrib.append(param_i)
                                # copy model parameters into master buffer
                                master_param_fragment = self._fp32_p_chunks[block_id][chunk_id][
                                    shard_offset : shard_offset + grad_length
                                ]
                                opti_state_m_fragment = self._fp32_m_chunks[block_id][chunk_id][
                                    shard_offset : shard_offset + grad_length
                                ]
                                opti_state_v_fragment = self._fp32_v_chunks[block_id][chunk_id][
                                    shard_offset : shard_offset + grad_length
                                ]
                                opti_state_u_fragment = self._fp32_u_chunks[block_id][chunk_id][
                                    shard_offset : shard_offset + grad_length
                                ]
                                opti_state_g_fragment = self._fp16_g_chunks[block_id][chunk_id][
                                    shard_offset : shard_offset + grad_length
                                ]
                                opti_state_p_fragment = self._fp16_p_chunks[block_id][chunk_id][
                                    shard_offset : shard_offset + grad_length
                                ]
                                # print("model_param_fragment.size()=%s, new_param_packed_fragment.size()=%s, master_param_fragment.size()=%s" % (str(model_param_fragment.size()), str(new_param_packed_fragment.size()), str(master_param_fragment.size())))
                                if not self._resume_from_checkpoint:
                                    master_param_fragment.copy_(model_param_fragment)
                                self._contrib_group_properties.append(group_props)
                                self._contrib_tensor_list.append(
                                    (
                                        master_param_fragment,
                                        opti_state_m_fragment,
                                        opti_state_v_fragment,
                                        opti_state_u_fragment,
                                        opti_state_g_fragment,
                                        opti_state_p_fragment,
                                    )
                                )  # p, m, v, u, g, p_copy
                                self._contrib_update_frag_for_norm.append(opti_state_u_fragment)
                                if p.dtype == torch.float16:
                                    self._contrib_model_param_for_norm_fp16.append(p)
                                else:
                                    self._contrib_model_param_for_norm_fp32.append(p)
                                self._contrib_model_param_for_norm_is_fp16.append(
                                    True if p.dtype == torch.float16 else False
                                )
                                if self._contrib_min_param_i < 0:
                                    self._contrib_min_param_i = param_i
                                self._contrib_max_param_i = param_i
        self._contrib_model_param_for_norm_num = len(self._contrib_model_param_for_norm_is_fp16)
        if len(self._contrib_model_param_for_norm_fp16) == 0:
            self._contrib_model_param_for_norm_fp16 = None
        if len(self._contrib_model_param_for_norm_fp32) == 0:
            self._contrib_model_param_for_norm_fp32 = None
        self._contrib_model_param_for_norm_is_fp32 = torch.tensor(
            [not is_fp16 for is_fp16 in self._contrib_model_param_for_norm_is_fp16],
            dtype=torch.bool,
            device="cuda",
        )
        self._contrib_model_param_for_norm_is_fp16 = torch.tensor(
            [is_fp16 for is_fp16 in self._contrib_model_param_for_norm_is_fp16],
            dtype=torch.bool,
            device="cuda",
        )
        self._offsets = torch.tensor(self._model_param_is_contrib, dtype=torch.int64, device="cuda")

        p, m, v, u, g, p_copy = list(zip(*self._contrib_tensor_list))
        self._contrib_compute_update_term_tensor_list = [g, p, m, v, u]
        self._contrib_update_weights_tensor_list = [u, p, p_copy]

        math_type = self._fp32_u.dtype
        decay, bias_correction, beta1, beta2, beta3, epsilon = list(
            zip(*self._contrib_group_properties)
        )
        self._contrib_beta1 = torch.tensor(beta1, dtype=math_type, device="cuda")
        self._contrib_beta2 = torch.tensor(beta2, dtype=math_type, device="cuda")
        self._contrib_beta3 = torch.tensor(beta3, dtype=math_type, device="cuda")
        self._contrib_bias_correction = torch.tensor(
            bias_correction, dtype=torch.int, device="cuda"
        )
        self._contrib_epsilon = torch.tensor(epsilon, dtype=math_type, device="cuda")
        self._contrib_weight_decay = torch.tensor(decay, dtype=math_type, device="cuda")

        self._packed_flat_to_model_params_fp16 = (
            list(zip(*self._packed_flat_to_model_params_fp16))
            if len(self._packed_flat_to_model_params_fp16) > 0
            else None
        )
        self._packed_flat_to_model_params_fp32 = (
            list(zip(*self._packed_flat_to_model_params_fp32))
            if len(self._packed_flat_to_model_params_fp32) > 0
            else None
        )

        self._lazy_init_stage2_done = True

        self.complete_reductions()
        self._first_step = False

    def set_is_accumulation_step(self, is_accumulation_step):
        self._is_accumulation_step = is_accumulation_step

    def set_last_step(self, last_step):
        self._last_step = last_step

    def _get_flush_block(self):
        flush_block = []
        if (
            self._current_block > 0
            and self._grads_generated[self._low_param_i[self._current_block - 1]]
        ):
            num_grads = len(self._grads_generated)
            contiguous_idx = num_grads
            while contiguous_idx > 0 and self._grads_generated[contiguous_idx - 1]:
                contiguous_idx -= 1

            if (
                contiguous_idx < num_grads
                and self._grads_info[contiguous_idx]["param_offset"]
                <= (self._current_block - 1) * self._block_size
            ):
                self._current_block -= 1
                start = self._current_block * self._block_size
                end = (self._current_block + 1) * self._block_size
                flush_block = [start, end]

        return flush_block

    def _full_all_reduce_scale(self, block_id, scale):
        works = [None] * self._num_chunks
        if self._clip_after_ar:
            for chunk_id in range(self._num_chunks):
                glob_chunk_id = block_id * self._num_chunks + chunk_id
                ar_stream = self._ar_st[glob_chunk_id % self._num_ar_pg]
                ar_stream.wait_stream(torch.cuda.current_stream())
                with torch.cuda.stream(ar_stream):
                    works[chunk_id] = torch.distributed.all_reduce(
                        self._flat_grads_chunks[block_id][chunk_id],
                        group=self._ar_pg[glob_chunk_id % self._num_ar_pg],
                        async_op=True,
                        op=_make_nccl_premul_sum(scale),
                    )
        else:
            glob_chunk_id = block_id
            ar_stream = self._ar_st[glob_chunk_id % self._num_ar_pg]
            ar_stream.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(ar_stream):
                works0 = torch.distributed.all_reduce(
                    self._flat_grads_blocks[block_id],
                    group=self._ar_pg[glob_chunk_id % self._num_ar_pg],
                    async_op=True,
                    op=_make_nccl_premul_sum(scale),
                )
            for i in range(self._num_chunks):
                works[i] = works0
        self._reductions_works[block_id] = works

    def _full_all_reduce(self, block_id):
        works = [None] * self._num_chunks

        for chunk_id in range(self._num_chunks):
            glob_chunk_id = block_id * self._num_chunks + chunk_id
            ar_stream = self._ar_st[glob_chunk_id % self._num_ar_pg]
            ar_stream.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(ar_stream):
                works[chunk_id] = torch.distributed.all_reduce(
                    self._flat_grads_chunks[block_id][chunk_id],
                    group=self._ar_pg[glob_chunk_id % self._num_ar_pg],
                    async_op=True,
                )
        self._reductions_works[block_id] = works

    def _reduce_scatter_and_all_reduce_scale(self, block_id, scale):
        # Reduction within each node
        # Changes gradient format from [block * chunk * shard] to [shard * block * chunk]
        # The output format is the same as the fp32 master parameters
        works = [None] * self._num_chunks
        for chunk_id in range(self._num_chunks):
            glob_chunk_id = block_id * self._num_chunks + chunk_id
            rs_stream = self._rs_st[glob_chunk_id % self._num_rs_pg]
            rs_stream.wait_stream(torch.cuda.current_stream())
            rs_stream.wait_stream(self._l2_grad_norm_st)
            with torch.cuda.stream(rs_stream):
                if self._reduce_scatter_no_copy:
                    works[chunk_id] = torch.distributed.reduce_scatter(
                        output=self._fp16_g_chunks[block_id][chunk_id],
                        input_list=self._flat_grads_shards[block_id][chunk_id],
                        group=self._rs_pg[glob_chunk_id % self._num_rs_pg],
                        async_op=True,
                        no_copy=True,
                        op=_make_nccl_premul_sum(scale),
                    )
                else:
                    works[chunk_id] = torch.distributed.reduce_scatter_tensor(
                        output=self._fp16_g_chunks[block_id][chunk_id],
                        input=self._flat_grads_chunks[block_id][chunk_id],
                        group=self._rs_pg[glob_chunk_id % self._num_rs_pg],
                        async_op=True,
                        op=_make_nccl_premul_sum(scale),
                    )

        # Reduction across nodes for each rank
        if self._num_groups > 1:
            for chunk_id in range(self._num_chunks):
                glob_chunk_id = block_id * self._num_chunks + chunk_id
                ar_stream = self._ar_st[glob_chunk_id % self._num_ar_pg]
                with torch.cuda.stream(ar_stream):
                    works[chunk_id].wait()
                    works[chunk_id] = torch.distributed.all_reduce(
                        self._fp16_g_chunks[block_id][chunk_id],
                        group=self._ar_pg[glob_chunk_id % self._num_ar_pg],
                        async_op=True,
                    )
        self._reductions_works[block_id] = works

    def _reduce_scatter_and_all_reduce(self, block_id):
        # Reduction within each node
        # Changes gradient format from [block * chunk * shard] to [shard * block * chunk]
        # The output format is the same as the fp32 master parameters
        works = [None] * self._num_chunks
        for chunk_id in range(self._num_chunks):
            glob_chunk_id = block_id * self._num_chunks + chunk_id
            rs_stream = self._rs_st[glob_chunk_id % self._num_rs_pg]
            rs_stream.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(rs_stream):
                if self._reduce_scatter_no_copy:
                    works[chunk_id] = torch.distributed.reduce_scatter(
                        output=self._fp16_g_chunks[block_id][chunk_id],
                        input_list=self._flat_grads_shards[block_id][chunk_id],
                        group=self._rs_pg[glob_chunk_id % self._num_rs_pg],
                        async_op=True,
                        no_copy=True,
                    )
                else:
                    works[chunk_id] = torch.distributed.reduce_scatter_tensor(
                        output=self._fp16_g_chunks[block_id][chunk_id],
                        input=self._flat_grads_chunks[block_id][chunk_id],
                        group=self._rs_pg[glob_chunk_id % self._num_rs_pg],
                        async_op=True,
                    )

        # Reduction across nodes for each rank
        if self._num_groups > 1:
            for chunk_id in range(self._num_chunks):
                glob_chunk_id = block_id * self._num_chunks + chunk_id
                ar_stream = self._ar_st[glob_chunk_id % self._num_ar_pg]
                with torch.cuda.stream(ar_stream):
                    works[chunk_id].wait()
                    works[chunk_id] = torch.distributed.all_reduce(
                        self._fp16_g_chunks[block_id][chunk_id],
                        group=self._ar_pg[glob_chunk_id % self._num_ar_pg],
                        async_op=True,
                    )
        self._reductions_works[block_id] = works

    def _pipeline_block_reductions(self, block_id):
        if self._clip_after_ar:
            self._flatten_grad_mt(1.0 / self._world_size)

            if self._full_ar:
                self._full_all_reduce(block_id)
            else:
                self._reduce_scatter_and_all_reduce(block_id)

            # Compute L2 grad norm
            if block_id == 0:
                with torch.cuda.stream(self._l2_grad_norm_st):
                    for block_id in range(self._num_blocks):
                        for chunk_id in range(self._num_chunks):
                            self._reductions_works[block_id][chunk_id].wait()
                    # Since the packed format is contiguous after reductions, only one norm is needed
                    l2_grad_norm_sq = torch.empty([1], device="cuda")
                    if self._full_ar:
                        # this flattening of lists is to keep multi_tensor_apply function happy, it wants depth=1 for l2 norm computation
                        flat_list = [item for sublist in self._fp16_g_chunks for item in sublist]
                        l2_grad_norm_sq = (
                            multi_tensor_applier(
                                self.multi_tensor_l2norm,
                                self._overflow_buf,
                                [flat_list],
                                False,
                            )[0]
                            ** 2
                        )
                    else:
                        l2_grad_norm_sq = self._fp16_g.norm(dtype=torch.float32, p=2) ** 2
                    torch.distributed.all_reduce(l2_grad_norm_sq, group=self._l2_grad_norm_pg)
                    self._L2_grad_norm = l2_grad_norm_sq.sqrt()
        else:
            # Copy model grads to flat grads buffer
            self._flatten_grad_mt(1.0)

            # Compute L2 grad norm
            self._l2_grad_norm_st.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(self._l2_grad_norm_st):
                if not self._fused_norm:
                    self._L2_grad_norm = self._flat_grads.norm(dtype=torch.float16, p=2).float()
            torch.cuda.current_stream().wait_stream(self._l2_grad_norm_st)

            # Apply clipping & pre-reduction scaling on grads
            loss_scale = self.global_scale
            max_grad_norm = loss_scale * self.defaults["max_grad_norm"]
            coeff = max_grad_norm / (1e-6 + self.L2_grad_norm)
            coeff = (coeff > 1) * self._one + (coeff <= 1) * coeff
            tmp = torch.cat(((self._one), (coeff)))
            index = (coeff + 1 > coeff).int()
            scale = tmp.index_select(0, index).half() / self._world_size
            if not self._fuse_scale:
                self._flat_grads.mul_(scale)

            if self._full_ar:
                if self._fuse_scale:
                    self._full_all_reduce_scale(block_id, scale)
                else:
                    self._full_all_reduce(block_id)
            else:
                if self._fuse_scale:
                    self._reduce_scatter_and_all_reduce_scale(block_id, scale)
                else:
                    self._reduce_scatter_and_all_reduce(block_id)

            if block_id == 0:
                for block_id in range(self._num_blocks):
                    for chunk_id in range(self._num_chunks):
                        self._reductions_works[block_id][chunk_id].wait()

    def __compute_contrib_param_norm(self):
        if (
            self._contrib_model_param_for_norm_fp16 is not None
            and self._contrib_model_param_for_norm_fp32 is not None
        ):
            gnorm_fp16 = multi_tensor_applier(
                self.multi_tensor_l2norm,
                self._overflow_buf,
                [self._contrib_model_param_for_norm_fp16],
                True,
            )[1]
            gnorm_fp32 = multi_tensor_applier(
                self.multi_tensor_l2norm,
                self._overflow_buf,
                [self._contrib_model_param_for_norm_fp32],
                True,
            )[1]
            gnorm = torch.empty(
                size=[self._contrib_model_param_for_norm_num],
                dtype=torch.bool,
                device="cuda",
            )
            gnorm.masked_scatter_(self._contrib_model_param_for_norm_is_fp16, gnorm_fp16)
            gnorm.masked_scatter_(self._contrib_model_param_for_norm_is_fp32, gnorm_fp32)
        elif self._contrib_model_param_for_norm_fp16 is not None:
            gnorm = multi_tensor_applier(
                self.multi_tensor_l2norm,
                self._overflow_buf,
                [self._contrib_model_param_for_norm_fp16],
                True,
            )[1]
        elif self._contrib_model_param_for_norm_fp32 is not None:
            gnorm = multi_tensor_applier(
                self.multi_tensor_l2norm,
                self._overflow_buf,
                [self._contrib_model_param_for_norm_fp32],
                True,
            )[1]
        return gnorm

    def __compute_contrib_update_norm(self):
        l2_norm = torch.zeros(size=[self._model_params_num], dtype=torch.float32, device="cuda")
        local_contrib_l2_norm = (
            multi_tensor_applier(
                self.multi_tensor_l2norm,
                self._overflow_buf,
                [self._contrib_update_frag_for_norm],
                True,
            )[1]
            ** 2
        )
        l2_norm.scatter_(dim=0, index=self._offsets, src=local_contrib_l2_norm)
        torch.distributed.all_reduce(l2_norm, group=self._ag_pg[0])
        l2_norm = torch.sqrt(l2_norm)
        return l2_norm

    def _pipeline_step(self):
        global_scale = self.global_scale
        # if clip before ar, set max_grad_norm to 0
        max_grad_norm = self.defaults["max_grad_norm"] * self._clip_after_ar
        self._completion_st.wait_stream(self._l2_grad_norm_st)
        global_grad_norm = self.L2_grad_norm

        # check global_grad_norm and fill overflow_buf
        is_finite = (global_grad_norm + 1 > global_grad_norm).int()
        self._overflow_buf = self._one * (is_finite ^ self._one)  # toggle between 0 and 1

        if not self._clip_after_ar:
            torch.distributed.all_reduce(
                is_finite,
                op=torch.distributed.ReduceOp.MIN,
                group=self._current_process_group,
            )
            torch.distributed.all_reduce(
                self._overflow_buf,
                op=torch.distributed.ReduceOp.MAX,
                group=self._current_process_group,
            )

        # increment step counter if no overflow
        self._step += is_finite
        self._completion_st.wait_stream(torch.cuda.current_stream())
        self._completion_st.wait_stream(self._l2_grad_norm_st)

        # Call step kernel once per step
        # Call all-gather once per step
        with torch.cuda.stream(self._completion_st):
            for block_id in range(self._num_blocks):
                for chunk_id in range(self._num_chunks):
                    self._reductions_works[block_id][chunk_id].wait()
            param_norm = self.__compute_contrib_param_norm()
            multi_tensor_applier(
                self.multi_tensor_lamb_compute_update_term,
                self._overflow_buf,
                self._contrib_compute_update_term_tensor_list,  # g, p, m, v, u
                self._contrib_beta1,
                self._contrib_beta2,
                self._contrib_beta3,
                self._contrib_bias_correction,
                self._step,
                self._contrib_epsilon,
                self._adam_w_mode,
                self._contrib_weight_decay,
                global_scale,
                global_grad_norm,
                max_grad_norm,
            )
            upd_norm = self.__compute_contrib_update_norm()
            multi_tensor_applier(
                self.multi_tensor_lamb_update_weights,
                self._overflow_buf,
                self._contrib_update_weights_tensor_list,  # u, p, p_copy
                param_norm,
                upd_norm,
                self._offsets,
                self._lr,
                self._contrib_weight_decay,
                global_grad_norm,
                self._use_nvlamb,
            )
            if not self._skip_ag:
                # allgather chunking is currently not supported for clip after allreduce
                if not self._clip_after_ar:
                    for block in range(self._num_blocks):
                        for chunk in range(self._num_chunks):
                            if self._all_gather_no_copy:
                                torch.distributed.all_gather(
                                    tensor_list=self._new_params2_shards[block][chunk],
                                    tensor=self._fp16_p_chunks[block][chunk],
                                    group=self._ag_pg[0],
                                    no_copy=True,
                                )
                            else:
                                torch.distributed.all_gather_into_tensor(
                                    output_tensor=self._new_params2_blocks[block],
                                    input_tensor=self._fp16_p_chunks[block][chunk],
                                    group=self._ag_pg[0],
                                )
                else:
                    if self._all_gather_no_copy:
                        torch.distributed.all_gather(
                            tensor_list=self._new_params_mega_shards,
                            tensor=self._fp16_p,
                            group=self._ag_pg[0],
                            no_copy=True,
                        )
                    else:
                        torch.distributed.all_gather_into_tensor(
                            output_tensor=self._new_params,
                            input_tensor=self._fp16_p,
                            group=self._ag_pg[0],
                        )

    def _flatten_grad_mt(self, scale):
        if len(self._grads_fp16) > 0:
            self._overflow_buf.zero_()
            if not self._fused_norm:
                multi_tensor_applier(
                    amp_C.multi_tensor_scale,
                    self._overflow_buf,
                    list(zip(*self._grads_fp16)),
                    scale,
                )
            else:
                self._L2_grad_norm = multi_tensor_applier(
                    amp_C.multi_tensor_l2norm_scale,
                    self._overflow_buf,
                    list(zip(*self._grads_fp16)),
                    scale,
                    False,
                )[0].float()

            self._grads_fp16 = []
        if len(self._grads_fp32) > 0:
            self._overflow_buf.zero_()
            if not self._fused_norm:
                multi_tensor_applier(
                    amp_C.multi_tensor_scale,
                    self._overflow_buf,
                    list(zip(*self._grads_fp32)),
                    scale,
                )
            else:
                self._L2_grad_norm = multi_tensor_applier(
                    amp_C.multi_tensor_l2norm_scale,
                    self._overflow_buf,
                    list(zip(*self._grads_fp32)),
                    scale,
                    False,
                )[0].float()
            self._grads_fp32 = []

    def _do_overlapped_reduction(self, param_i, param):
        if not self._is_accumulation_step:
            # handle overlapped reductions
            if param.dtype == torch.float16:
                self._grads_fp16.append((param.grad, self._individual_flat_grads[param_i]))
            else:
                self._grads_fp32.append((param.grad, self._individual_flat_grads[param_i]))
            self._grads_generated[param_i] = True
            if not self._first_step and not self._last_step:
                if self._overlap_reductions:
                    flush_block = self._get_flush_block()
                    while flush_block:
                        block_id = flush_block[0] // self._block_size
                        self._pipeline_block_reductions(block_id)
                        flush_block = self._get_flush_block()

    def set_global_scale(self, global_scale):
        """Set global scale."""
        self._global_scale = global_scale

    @property
    def global_scale(self):
        return self._global_scale

    @property
    def L2_grad_norm(self):
        torch.cuda.current_stream().wait_stream(self._l2_grad_norm_st)
        return self._L2_grad_norm

    def complete_reductions(self):
        """Complete reductions if full pipeline is not selected or overlap is not allowed."""
        if self._last_step:
            # zero out gradients that have not been completed yet
            for param_i, grad_generated in enumerate(self._grads_generated):
                if not grad_generated:
                    grad_info = self._grads_info[param_i]
                    param_offset = grad_info["param_offset"]
                    param_size = grad_info["param_grads_size"]
                    self._flat_grads[param_offset : param_offset + param_size].zero_()
                    self._grads_generated[param_i] = True

        if self._first_step or self._last_step or not self._overlap_reductions:
            # nothing done so far, run full pipeline after reductions
            for block_id in range(self._num_blocks - 1, -1, -1):
                self._pipeline_block_reductions(block_id)

        torch.cuda.current_stream().wait_stream(self._l2_grad_norm_st)

        self._current_block = self._num_blocks
        self._grads_generated = [False] * len(self._grads_info)

    def step(self, closure=None, grad_scaler=None):
        loss = None
        if closure is not None:
            loss = closure()

        self._pipeline_step()

        if grad_scaler is not None:
            found_inf = self._overflow_buf.float()
            optimizer_state = grad_scaler._per_optimizer_states[id(self)]
            current_device = torch.device("cuda", torch.cuda.current_device())
            optimizer_state["found_inf_per_device"][current_device] = found_inf

        self._completion_st.wait_stream(torch.cuda.current_stream())
        if not self._set_flat_param_view:
            with torch.cuda.stream(self._completion_st):
                # Copy self._new_params to model params
                with torch.no_grad():
                    if self._packed_flat_to_model_params_fp16 is not None:
                        multi_tensor_applier(
                            fused_adam_cuda.maybe_cast_mt,
                            self._overflow_buf,
                            self._packed_flat_to_model_params_fp16,
                        )
                    if self._packed_flat_to_model_params_fp32 is not None:
                        multi_tensor_applier(
                            fused_adam_cuda.maybe_cast_mt,
                            self._overflow_buf,
                            self._packed_flat_to_model_params_fp32,
                        )

        torch.cuda.current_stream().wait_stream(self._completion_st)

        self._reductions_works = [None] * self._num_blocks
        self._allgather_works = [None] * self._num_blocks

        return loss

    def state_dict(self):
        """
        Returns a dict containing the current state of this :class:`DistributedFusedAdam` instance.
        Example::
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            torch.save(checkpoint, "saved.pth")
        """
        # save step, master weights and first/second moments
        state_dict = {}
        state_dict["step"] = self._step
        state_dict["fp32_p"] = self._fp32_p
        state_dict["fp32_m"] = self._fp32_m
        state_dict["fp32_v"] = self._fp32_v
        return state_dict

    def load_state_dict(self, state_dict):
        """
        Loads a state_dict created by an earlier call to state_dict().
        If an DistributedFusedAdam instance was constructed from some ``init_optimizer``,
        whose parameters in turn came from ``model``, it is expected that the user
        will call ``model.load_state_dict()`` before
        ``optimizer.load_state_dict()`` is called.
        Example::
            model = torch.nn.Linear(D_in, D_out).cuda().half()
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
            ...
            checkpoint = torch.load("saved.pth")
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        """
        # restore step, master weights and first/second moments
        self._step = state_dict["step"]
        self._fp32_p = state_dict["fp32_p"].to(device="cuda")
        self._fp32_m = state_dict["fp32_m"].to(device="cuda")
        self._fp32_v = state_dict["fp32_v"].to(device="cuda")
        self._resume_from_checkpoint = True


================================================
FILE: apex/contrib/optimizers/fp16_optimizer.py
================================================
import torch
from apex.multi_tensor_apply import multi_tensor_applier


class FP16_Optimizer(object):
    """
    :class:`FP16_Optimizer` A cutdown version of apex.fp16_utils.FP16_Optimizer.
    Designed only to wrap apex.contrib.optimizers.FusedAdam, FusedSGD.
    Refer to apex.fp16_utils documents for more information.
    Example::
        model = torch.nn.Linear(D_in, D_out).cuda().half()
        optimizer = apex.contrib.optimizers.FusedSGD(model.parameters())
        optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
        ...
        # loss.backward() becomes:
        optimizer.backward(loss)
        ...
    Example with dynamic loss scaling::
        ...
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
                                   # optional arg to control dynamic loss scaling behavior
                                   # dynamic_loss_args={'scale_window' : 500})
                                   # Usually, dynamic_loss_args is not necessary.
    """

    def __init__(
        self,
        init_optimizer,
        static_loss_scale=1.0,
        dynamic_loss_scale=False,
        dynamic_loss_args=None,
        verbose=True,
    ):
        print("\nThis fp16_optimizer is designed to only work with apex.contrib.optimizers.*")
        print("To update, use updated optimizers with AMP.")
        # The fused optimizer does all the work. We need this layer for two reason:
        # 1. maintain same user API from apex.fp16_utils
        # 2. keep common stuff here in case we need to add new fused optimizer later

        if not torch.cuda.is_available:
            raise SystemError("Cannot use fp16 without CUDA.")
        self.optimizer = init_optimizer

        self.fp16_groups = []  # model params
        self.fp32_groups = []  # master weights

        # iterate over param_groups
        for param_group in self.optimizer.param_groups:
            fp16_group = []
            fp32_group = []
            for p in param_group["params"]:
                fp16_group.append(p)
                fp32_group.append(p.clone().float().detach())
            self.fp16_groups.append(fp16_group)
            self.fp32_groups.append(fp32_group)
            param_group["params"] = fp32_group

        if multi_tensor_applier.available:
            import amp_C

            self.overflow_buf = torch.cuda.IntTensor([0])
            self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
        else:
            raise RuntimeError("FP16_Optimizer requires cuda extensions")

        # we may have a way of fusing dynamic scale. Do not support for now
        if dynamic_loss_scale:
            if dynamic_loss_args is not None:
                raise SystemError("Do not support dynamic loss scale args for now.")
            self.dynamic_loss_scale = True
            self.cur_scale = 2**16
            self.cur_iter = 0
            self.last_overflow_iter = -1
            self.scale_factor = 2
            self.scale_window = 1000
        else:
            self.dynamic_loss_scale = False
            self.cur_iter = 0
            self.cur_scale = static_loss_scale
        self.verbose = verbose

    def zero_grad(self, set_grads_to_None=True):
        """
        Zero FP16 parameter grads.
        """
        # FP32 grad should never exist.
        # For speed, set model fp16 grad to None by default
        for group in self.fp16_groups:
            for p in group:
                if set_grads_to_None:
                    p.grad = None
                else:
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()

    def step(self, closure=None):
        """
        Not supporting closure.
        """
        fp16_grads = []
        norm_groups = []
        skip = False

        for group in self.fp16_groups:
            fp16_grad = []
            for i, p in enumerate(group):
                fp16_grad.append(p.grad)
            fp16_grads.append(fp16_grad)

        # nan check
        self.overflow_buf.zero_()
        for fp16_grad in fp16_grads:
            if len(fp16_grad) > 0:
                norm, norm_per_tensor = multi_tensor_applier(
                    self.multi_tensor_l2norm, self.overflow_buf, [fp16_grad], True
                )
                norm_groups.append(norm)
                if self.overflow_buf.item() != 0:
                    skip = True

        if skip:
            self._update_scale(skip)
            return

        # norm is in fact norm*cur_scale
        self.optimizer.step(
            grads=fp16_grads,
            output_params=self.fp16_groups,
            scale=self.cur_scale,
            grad_norms=norm_groups,
        )

        self._update_scale(False)
        return

    def backward(self, loss):
        """
        :attr:`backward` performs the following steps:
        1. fp32_loss = loss.float()
        2. scaled_loss = fp32_loss*loss_scale
        3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves
        """
        scaled_loss = (loss.float()) * self.cur_scale
        scaled_loss.backward()

    def _update_scale(self, skip):
        if self.dynamic_loss_scale:
            if skip:
                if self.verbose:
                    print("\nGrad overflow on iteration", self.cur_iter)
                    print("Using dynamic loss scale of", self.cur_scale)
                self.cur_scale = max(self.cur_scale / self.scale_factor, 1)
                self.last_overflow_iter = self.cur_iter
            else:
                if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
                    self.cur_scale *= self.scale_factor
        else:
            if skip:
                print("\nGrad overflow on iteration", self.cur_iter)
                print("Using static loss scale of", self.cur_scale)
        self.cur_iter += 1
        return

    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
    def _get_state(self):
        return self.optimizer.state

    def _set_state(self, value):
        self.optimizer.state = value

    state = property(_get_state, _set_state)

    # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
    # (for example, to adjust the learning rate)
    def _get_param_groups(self):
        return self.optimizer.param_groups

    def _set_param_groups(self, value):
        self.optimizer.param_groups = value

    param_groups = property(_get_param_groups, _set_param_groups)

    def state_dict(self):
        """
        Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
        This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
        of the contained Pytorch optimizer.
        Example::
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            torch.save(checkpoint, "saved.pth")
        """
        state_dict = {}
        state_dict["dynamic_loss_scale"] = self.dynamic_loss_scale
        state_dict["cur_scale"] = self.cur_scale
        state_dict["cur_iter"] = self.cur_iter
        if state_dict["dynamic_loss_scale"]:
            state_dict["last_overflow_iter"] = self.last_overflow_iter
            state_dict["scale_factor"] = self.scale_factor
            state_dict["scale_window"] = self.scale_window
        state_dict["optimizer_state_dict"] = self.optimizer.state_dict()
        state_dict["fp32_groups"] = self.fp32_groups
        return state_dict

    def load_state_dict(self, state_dict):
        """
        Loads a state_dict created by an earlier call to state_dict().
        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
        whose parameters in turn came from ``model``, it is expected that the user
        will call ``model.load_state_dict()`` before
        ``fp16_optimizer_instance.load_state_dict()`` is called.
        Example::
            model = torch.nn.Linear(D_in, D_out).cuda().half()
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
            ...
            checkpoint = torch.load("saved.pth")
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        """
        # I think it should actually be ok to reload the optimizer before the model.
        self.dynamic_loss_scale = state_dict["dynamic_loss_scale"]
        self.cur_scale = state_dict["cur_scale"]
        self.cur_iter = state_dict["cur_iter"]
        if state_dict["dynamic_loss_scale"]:
            self.last_overflow_iter = state_dict["last_overflow_iter"]
            self.scale_factor = state_dict["scale_factor"]
            self.scale_window = state_dict["scale_window"]
        self.optimizer.load_state_dict(state_dict["optimizer_state_dict"])
        # At this point, the optimizer's references to the model's fp32 parameters are up to date.
        # The optimizer's hyperparameters and internal buffers are also up to date.
        # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
        # out of date.  There are two options.
        # 1:  Refresh the master params from the model's fp16 params.
        # This requires less storage but incurs precision loss.
        # 2:  Save and restore the fp32 master copies separately.
        # We choose option 2.
        #
        # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
        # of their associated parameters, because it's possible those buffers might not exist yet in
        # the current optimizer instance.  In our case, as long as the current FP16_Optimizer has been
        # constructed in the same way as the one whose state_dict we are loading, the same master params
        # are guaranteed to exist, so we can just copy_() from the saved master params.
        for current, saved in zip(self.fp32_groups, state_dict["fp32_groups"]):
            for _current, _saved in zip(current, saved):
                _current.data.copy_(_saved.data)


================================================
FILE: apex/contrib/optimizers/fused_adam.py
================================================
import types
import torch
import importlib
from apex.multi_tensor_apply import multi_tensor_applier


class FusedAdam(torch.optim.Optimizer):
    """Implements Adam algorithm. Currently GPU-only.  Requires Apex to be installed via
    ``python setup.py install --cuda_ext --cpp_ext``.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False) NOT SUPPORTED in FusedAdam!
        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
            adds eps to the bias-corrected second moment estimate before
            evaluating square root instead of adding it to the square root of
            second moment estimate as in the original paper. (default: False)
        use_mt (boolean, optional): use multi tensor apply for lower launch
            latency. (default: False)

    .. _Adam - A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        eps_inside_sqrt=False,
        weight_decay=0.0,
        max_grad_norm=0.0,
        amsgrad=False,
        use_mt=False,
        amp_scale_adjustment=1.0,
    ):
        global fused_adam_cuda
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")

        self._use_multi_tensor = False
        if use_mt:
            if not multi_tensor_applier.available:
                print("Warning:  multi_tensor_applier is unavailable")
            else:
                self._use_multi_tensor = True
                self._overflow_buf = torch.cuda.IntTensor([0])

        self._amp_scale_adjustment = amp_scale_adjustment

        if amsgrad:
            raise RuntimeError("FusedAdam does not support the AMSGrad variant.")
        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            max_grad_norm=max_grad_norm,
        )
        super(FusedAdam, self).__init__(params, defaults)
        self.eps_mode = 0 if eps_inside_sqrt else 1

    def step(self, closure=None, grads=None, output_params=None, scale=1.0, grad_norms=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            grads (list of tensors, optional): weight gradient to use for the
                optimizer update. If gradients have type torch.half, parameters
                are expected to be in type torch.float. (default: None)
            output params (list of tensors, optional): A reduced precision copy
                of the updated weights written out in addition to the regular
                updated weights. Have to be of same type as gradients. (default: None)
            scale (float, optional): factor to divide gradient tensor values
                by before applying to weights. (default: 1)
        """
        loss = None
        if closure is not None:
            loss = closure()

        if hasattr(self, "_amp_stash"):
            grads = self._amp_stash.grads
            output_params = self._amp_stash.output_params
            scale = self._amp_stash.scale * self._amp_scale_adjustment
            grad_norms = self._amp_stash.grad_norms

        if grads is None:
            grads_group = [None] * len(self.param_groups)
        # backward compatibility
        # assuming a list/generator of parameter means single group
        elif isinstance(grads, types.GeneratorType):
            grads_group = [grads]
        elif type(grads[0]) != list:
            grads_group = [grads]
        else:
            grads_group = grads

        if output_params is None:
            output_params_group = [None] * len(self.param_groups)
        elif isinstance(output_params, types.GeneratorType):
            output_params_group = [output_params]
        elif type(output_params[0]) != list:
            output_params_group = [output_params]
        else:
            output_params_group = output_params

        if grad_norms is None:
            grad_norms = [None] * len(self.param_groups)

        for group, grads_this_group, output_params_this_group, grad_norm in zip(
            self.param_groups, grads_group, output_params_group, grad_norms
        ):
            if grads_this_group is None:
                grads_this_group = [None] * len(group["params"])
            if output_params_this_group is None:
                output_params_this_group = [None] * len(group["params"])

            # compute combined scale factor for this group
            combined_scale = scale
            if group["max_grad_norm"] > 0:
                # norm is in fact norm*scale
                clip = ((grad_norm / scale) + 1e-6) / group["max_grad_norm"]
                if clip > 1:
                    combined_scale = clip * scale

            bias_correction = 1 if group["bias_correction"] else 0

            if self._use_multi_tensor:
                if output_params:
                    tensorlists = [[], [], [], [], []]
                else:
                    tensorlists = [[], [], [], []]
                tensordevice = None

            for p, grad, output_param in zip(
                group["params"], grads_this_group, output_params_this_group
            ):
                # note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
                if p.grad is None and grad is None:
                    continue
                if grad is None:
                    grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        "FusedAdam does not support sparse gradients, please consider SparseAdam instead"
                    )

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1

                out_p = (
                    torch.tensor([], dtype=torch.float) if output_param is None else output_param
                )
                if self._use_multi_tensor:
                    pl = [p.data, exp_avg, exp_avg_sq, grad]
                    if output_param is not None:
                        pl.append(out_p)

                    for tl, t in zip(tensorlists, pl):
                        tl.append(t)

                    if tensordevice is None:
                        tensordevice = p.device
                    elif tensordevice != p.device:
                        raise RuntimeError(
                            "FusedAdam does not support use_mt with tensors on multiple device"
                        )

                else:
                    with torch.cuda.device(p.device):
                        fused_adam_cuda.adam(
                            p.data,
                            out_p,
                            exp_avg,
                            exp_avg_sq,
                            grad,
                            group["lr"],
                            beta1,
                            beta2,
                            group["eps"],
                            combined_scale,
                            state["step"],
                            self.eps_mode,
                            bias_correction,
                            group["weight_decay"],
                        )

            if self._use_multi_tensor:
                with torch.cuda.device(tensordevice):
                    multi_tensor_applier(
                        fused_adam_cuda.adam_mt,
                        self._overflow_buf,
                        tensorlists,
                        group["lr"],
                        beta1,
                        beta2,
                        group["eps"],
                        combined_scale,
                        state["step"],
                        self.eps_mode,
                        bias_correction,
                        group["weight_decay"],
                    )

        return loss


================================================
FILE: apex/contrib/optimizers/fused_lamb.py
================================================
import torch
import importlib
import math
from apex.multi_tensor_apply import multi_tensor_applier


class FusedLAMB(torch.optim.Optimizer):
    """Implements LAMB algorithm.

    Currently GPU-only.  Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_lamb" ./``.

    This version of fused LAMB implements 2 fusions.

      * Fusion of the LAMB update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.contrib.optimizers.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer::

        opt = apex.contrib.optimizers.FusedLAMB(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedLAMB` may be used with or without Amp.  If you wish to use :class:`FusedLAMB` with Amp,
    you may choose any ``opt_level``::

        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()

    In general, ``opt_level="O1"`` is recommended.

    LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            NOT SUPPORTED now! (default: False)
        adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay(also known as AdamW) (default: True)
        grad_averaging (bool, optional): whether apply (1-beta2) to grad when
            calculating running averages of gradient. (default: True)
        set_grad_none (bool, optional): whether set grad to None when zero_grad()
            method is called. (default: True)
        max_grad_norm (float, optional): value used to clip global grad norm
            (default: 1.0)

    .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-6,
        weight_decay=0.01,
        amsgrad=False,
        adam_w_mode=True,
        grad_averaging=True,
        set_grad_none=True,
        max_grad_norm=1.0,
    ):
        if amsgrad:
            raise RuntimeError("FusedLAMB does not support the AMSGrad variant.")
        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            grad_averaging=grad_averaging,
            max_grad_norm=max_grad_norm,
        )
        super(FusedLAMB, self).__init__(params, defaults)
        if multi_tensor_applier.available:
            import amp_C

            self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
            fused_lamb_cuda = importlib.import_module("fused_lamb_cuda")
            self.multi_tensor_lamb = fused_lamb_cuda.lamb
        else:
            raise RuntimeError("apex.contrib.optimizers.FusedLAMB requires cuda extensions")

        self.adam_w_mode = 1 if adam_w_mode else 0
        self.set_grad_none = set_grad_none

    def zero_grad(self):
        if self.set_grad_none:
            for group in self.param_groups:
                for p in group["params"]:
                    p.grad = None
        else:
            super(FusedLAMB, self).zero_grad()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # create separate grad lists for fp32 and fp16 params
        g_all_32, g_all_16 = [], []
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                if p.dtype == torch.float32:
                    g_all_32.append(p.grad.data)
                elif p.dtype == torch.float16:
                    g_all_16.append(p.grad.data)
                else:
                    raise RuntimeError("FusedLAMB only support fp16 and fp32.")

        g_norm_32, g_norm_16 = 0.0, 0.0
        # compute grad norm for two lists
        if len(g_all_32) > 0:
            g_norm_32 = multi_tensor_applier(
                self.multi_tensor_l2norm, self._dummy_overflow_buf, [g_all_32], False
            )[0].item()
        if len(g_all_16) > 0:
            g_norm_16 = multi_tensor_applier(
                self.multi_tensor_l2norm, self._dummy_overflow_buf, [g_all_16], False
            )[0].item()

        # blend two grad norms to get global grad norm
        global_grad_norm = math.sqrt(g_norm_32 * g_norm_32 + g_norm_16 * g_norm_16)
        max_grad_norm = self.defaults["max_grad_norm"]

        for group in self.param_groups:
            bias_correction = 1 if group["bias_correction"] else 0
            beta1, beta2 = group["betas"]
            grad_averaging = 1 if group["grad_averaging"] else 0

            # assume same step across group now to simplify things
            # per parameter step can be easily support by making it tensor, or pass list into kernel
            if "step" in group:
                group["step"] += 1
            else:
                group["step"] = 1

            # create lists for multi-tensor apply
            g_16, p_16, m_16, v_16 = [], [], [], []
            g_32, p_32, m_32, v_32 = [], [], [], []

            for p in group["params"]:
                if p.grad is None:
                    continue
                if p.grad.data.is_sparse:
                    raise RuntimeError(
                        "FusedLAMB does not support sparse gradients, please consider SparseAdam instead"
                    )

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data)
                    # Exponential moving average of gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p.data)

                if p.dtype == torch.float16:
                    g_16.append(p.grad.data)
                    p_16.append(p.data)
                    m_16.append(state["exp_avg"])
                    v_16.append(state["exp_avg_sq"])
                elif p.dtype == torch.float32:
                    g_32.append(p.grad.data)
                    p_32.append(p.data)
                    m_32.append(state["exp_avg"])
                    v_32.append(state["exp_avg_sq"])
                else:
                    raise RuntimeError("FusedLAMB only support fp16 and fp32.")

            if len(g_16) > 0:
                multi_tensor_applier(
                    self.multi_tensor_lamb,
                    self._dummy_overflow_buf,
                    [g_16, p_16, m_16, v_16],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["step"],
                    bias_correction,
                    group["weight_decay"],
                    grad_averaging,
                    self.adam_w_mode,
                    global_grad_norm,
                    max_grad_norm,
                )
            if len(g_32) > 0:
                multi_tensor_applier(
                    self.multi_tensor_lamb,
                    self._dummy_overflow_buf,
                    [g_32, p_32, m_32, v_32],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["step"],
                    bias_correction,
                    group["weight_decay"],
                    grad_averaging,
                    self.adam_w_mode,
                    global_grad_norm,
                    max_grad_norm,
                )

        return loss


================================================
FILE: apex/contrib/optimizers/fused_sgd.py
================================================
import types
import torch
from torch.optim.optimizer import Optimizer, required

from apex.multi_tensor_apply import multi_tensor_applier


class FusedSGD(Optimizer):
    r"""Implements stochastic gradient descent (optionally with momentum).

    This version of fused SGD implements 2 fusions.
      * Fusion of the SGD update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.contrib.optimizers.FusedSGD` should be used without AMP.
   
    :class:`apex.contrib.optimizers.FusedSGD` only works in the case where all parameters require grad. 

    Nesterov momentum is based on the formula from
    `On the importance of initialization and momentum in deep learning`__.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Example:
        model = ...
        model.half()
        optimizer = apex.contrib.optimizers.FusedSGD(model.parameters())
        # wrap with FP16_Optimizer
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        optimizer.zero_grad()
	...
        optimizer.backward(loss)
        optmizer.step()

    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf

    .. note::
        The implementation of SGD with Momentum/Nesterov subtly differs from
        Sutskever et. al. and implementations in some other frameworks.

        Considering the specific case of Momentum, the update can be written as

        .. math::
                  v = \rho * v + g \\
                  p = p - lr * v

        where p, g, v and :math:`\rho` denote the parameters, gradient,
        velocity, and momentum respectively.

        This is in contrast to Sutskever et. al. and
        other frameworks which employ an update of the form

        .. math::
             v = \rho * v + lr * g \\
             p = p - v

        The Nesterov version is analogously modified.
    """

    def __init__(
        self,
        params,
        lr=required,
        momentum=0,
        dampening=0,
        weight_decay=0,
        nesterov=False,
        wd_after_momentum=False,
        materialize_master_grads=True,
    ):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(
            lr=lr,
            momentum=momentum,
            dampening=dampening,
            weight_decay=weight_decay,
            nesterov=nesterov,
        )
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(FusedSGD, self).__init__(params, defaults)

        self.wd_after_momentum = wd_after_momentum

        if multi_tensor_applier.available:
            import amp_C

            # Skip buffer
            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
            self.multi_tensor_sgd = amp_C.multi_tensor_sgd
        else:
            raise RuntimeError("apex.contrib.optimizers.FusedSGD requires cuda extensions")

    def __setstate__(self, state):
        super(FusedSGD, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault("nesterov", False)

    def get_momentums(self, params):
        momentums = []
        first_run = True
        for p in params:
            param_state = self.state[p]
            # torch.optim.SGD initializes momentum in the main loop, we have
            # to do it here, and track whether or not we've done so, so that
            # momentum application can be skipped in the main kernel.
            if "momentum_buffer" not in param_state:
                first_run = True
                buf = param_state["momentum_buffer"] = torch.zeros_like(p.data)
                momentums.append(buf)
            else:
                first_run = False
                momentums.append(param_state["momentum_buffer"])
        return momentums, first_run

    def step(self, closure=None, grads=None, output_params=None, scale=1.0, grad_norms=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            grads (list of tensors, optional): weight gradient to use for the
                optimizer update. If gradients have type torch.half, parameters
                are expected to be in type torch.float. (default: None)
            output_params (list of tensors, optional): A reduced precision copy
                of the updated weights written out in addition to the regular
                updated weights. Have to be of same type as gradients. (default: None)
            scale (float, optional): factor to divide gradient tensor values
                by before applying to weights. (default: 1)
        """
        if hasattr(self, "_amp_stash"):
            raise RuntimeError("apex.contrib.optimizers.FusedSGD should not be used with AMP.")

        loss = None
        if closure is not None:
            loss = closure()

        if grads is None:
            raise RuntimeError(
                "apex.contrib.optimizers.FusedSGD must be wrapped \
	                       with apex.contrib.optimizers.FP16_Optimizer \
			       which provides grads."
            )
        # backward compatibility
        # assuming a list/generator of parameter means single group
        elif isinstance(grads, types.GeneratorType):
            grads_group = [grads]
        elif type(grads[0]) != list:
            grads_group = [grads]
        else:
            grads_group = grads

        if output_params is None:
            raise RuntimeError(
                "apex.contrib.optimizers.FusedSGD must be wrapped \
                               with apex.contrib.optimizers.FP16_Optimizer \
                               which provides output_params."
            )
        elif isinstance(output_params, types.GeneratorType):
            output_params_group = [output_params]
        elif type(output_params[0]) != list:
            output_params_group = [output_params]
        else:
            output_params_group = output_params

        for group, grads_this_group, output_params_this_group in zip(
            self.param_groups, grads_group, output_params_group
        ):
            if grads_this_group is None or output_params_this_group is None:
                raise RuntimeError(
                    "apex.contrib.optimizers.FusedSGD only works \
                                    when all parameters require grad."
                )

            weight_decay = group["weight_decay"]
            momentum = group["momentum"]
            dampening = group["dampening"]
            nesterov = group["nesterov"]
            lr = group["lr"]

            first_runs = [True, True]

            # output_params_this_group: original weights (either fp16 or fp32)
            # group['params']: master weights (fp32)

            # grad_type, param_to_update_type, momentum_type, requires_fp16_model_copy
            # fp32, fp32, fp32, No
            fp32_grads = [
                g
                for (p, g) in zip(output_params_this_group, grads_this_group)
                if p.dtype == torch.float32
            ]
            fp32_params = [
                p2
                for (p1, p2) in zip(output_params_this_group, group["params"])
                if p1.dtype == torch.float32
            ]
            fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)
            fp32_set = [fp32_grads, fp32_params, fp32_momentums]

            # fp16, fp32, fp32, Yes
            fp16_grads = [
                g
                for (p, g) in zip(output_params_this_group, grads_this_group)
                if p.dtype == torch.float16
            ]
            fp32_from_fp16_params = [
                p2
                for (p1, p2) in zip(output_params_this_group, group["params"])
                if p1.dtype == torch.float16
            ]
            fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(fp32_from_fp16_params)
            fp16_params = [
                p1
                for (p1, p2) in zip(output_params_this_group, group["params"])
                if p1.dtype == torch.float16
            ]
            fp16_set = [
                fp16_grads,
                fp32_from_fp16_params,
                fp32_from_fp16_momentums,
                fp16_params,
            ]

            launch_sets = [fp16_set, fp32_set]

            for launch_set, first_run in zip(launch_sets, first_runs):
                assert len(launch_set[0]) == len(launch_set[1])
                assert len(launch_set[0]) == len(launch_set[2])
                if len(launch_set[0]) > 0:
                    multi_tensor_applier(
                        self.multi_tensor_sgd,
                        self._dummy_overflow_buf,
                        launch_set,
                        weight_decay,
                        momentum,
                        dampening,
                        lr,
                        nesterov,
                        first_run,
                        self.wd_after_momentum,
                        1.0 / scale,
                    )

        return loss


================================================
FILE: apex/contrib/peer_memory/__init__.py
================================================
from .peer_memory import PeerMemoryPool
from .peer_halo_exchanger_1d import PeerHaloExchanger1d


================================================
FILE: apex/contrib/peer_memory/peer_halo_exchanger_1d.py
================================================
import torch
import peer_memory_cuda as pm


class PeerHaloExchanger1d:
    def __init__(self, ranks, rank_in_group, peer_pool, half_halo):
        self.peer_group_size = len(ranks)
        self.ranks = ranks
        self.peer_rank = rank_in_group
        self.low_neighbor = (self.peer_rank + self.peer_group_size - 1) % self.peer_group_size
        self.high_neighbor = (self.peer_rank + 1) % self.peer_group_size
        self.low_zero = True if self.peer_rank == 0 else False
        self.high_zero = True if self.peer_rank == self.peer_group_size - 1 else False

        self.peer_pool = peer_pool
        self.half_halo = half_halo

    def _allocate_peer_tensor(self, halo):
        # Compute size in bytes
        # Note: Pad buffer so each CUDA block gets required buffer size
        size = 4 * halo.numel() * halo.element_size()
        size_per_block = 128 * 2 * 16  # 128 threads each require two 128b buffers
        size = (size + size_per_block - 1) // size_per_block * size_per_block

        # Construct dtype peer buffer with desired size
        shape = [1, 1, 1, size // halo.element_size()]
        return self.peer_pool.allocate_peer_tensors(shape, halo.dtype, False, True)

    def __call__(self, y, H_split=True, explicit_nhwc=False, numSM=0, diagnostics=False):
        channels_last = y.is_contiguous(memory_format=torch.channels_last) and not explicit_nhwc
        if H_split:
            if explicit_nhwc:
                _, Hs, _, _ = list(y.shape)
                H = Hs - 2 * self.half_halo
                low_out_halo = y[:, self.half_halo : 2 * self.half_halo, :, :]
                low_tx = self._allocate_peer_tensor(low_out_halo)
                low_inp_halo = y[:, : self.half_halo, :, :]
                high_out_halo = y[:, H : H + self.half_halo, :, :]
                high_tx = self._allocate_peer_tensor(high_out_halo)
                high_inp_halo = y[:, H + self.half_halo : H + 2 * self.half_halo, :, :]
            else:
                _, _, Hs, _ = list(y.shape)
                H = Hs - 2 * self.half_halo
                low_out_halo = y[:, :, self.half_halo : 2 * self.half_halo, :]
                low_tx = self._allocate_peer_tensor(low_out_halo)
                low_inp_halo = y[:, :, : self.half_halo, :]
                high_out_halo = y[:, :, H : H + self.half_halo, :]
                high_tx = self._allocate_peer_tensor(high_out_halo)
                high_inp_halo = y[:, :, H + self.half_halo : H + 2 * self.half_halo, :]
        else:
            if explicit_nhwc:
                _, _, Ws, _ = list(y.shape)
                W = Ws - 2 * self.half_halo
                low_out_halo = y[:, :, self.half_halo : 2 * self.half_halo, :]
                low_tx = self._allocate_peer_tensor(low_out_halo)
                low_inp_halo = y[:, :, : self.half_halo, :]
                high_out_halo = y[:, :, W : W + self.half_halo, :]
                high_tx = self._allocate_peer_tensor(high_out_halo)
                high_inp_halo = y[:, :, W + self.half_halo : W + 2 * self.half_halo, :]
            else:
                _, _, _, Ws = list(y.shape)
                W = Ws - 2 * self.half_halo
                low_out_halo = y[:, :, :, self.half_halo : 2 * self.half_halo]
                low_tx = self._allocate_peer_tensor(low_out_halo)
                low_inp_halo = y[:, :, :, : self.half_halo]
                high_out_halo = y[:, :, :, W : W + self.half_halo]
                high_tx = self._allocate_peer_tensor(high_out_halo)
                high_inp_halo = y[:, :, :, W + self.half_halo : W + 2 * self.half_halo]
        pm.push_pull_halos_1d(
            diagnostics,
            explicit_nhwc,
            numSM,
            self.peer_rank,
            self.low_zero,
            low_out_halo,
            low_tx[self.peer_rank],
            high_tx[self.low_neighbor],
            low_inp_halo,
            self.high_zero,
            high_out_halo,
            high_tx[self.peer_rank],
            low_tx[self.high_neighbor],
            high_inp_halo,
        )


================================================
FILE: apex/contrib/peer_memory/peer_memory.py
================================================
import torch
import numpy as np
import peer_memory_cuda as pm


class PeerMemoryPool(object):
    def __init__(self, static_size, dynamic_size, peer_ranks=None):
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
        ngpus = min(torch.cuda.device_count(), world_size)
        peer_group_size = ngpus
        peer_group = rank // ngpus
        peer_rank_base = peer_group * ngpus
        peer_rank = rank - peer_rank_base
        if peer_ranks is None:
            peer_ranks = [i + peer_rank_base for i in range(peer_group_size)]
        peer_rank_start = peer_rank_base
        peer_rank_end = peer_rank_start + peer_group_size - 1
        for pr in peer_ranks:
            assert pr >= peer_rank_start and pr <= peer_rank_end, (
                "%d :: peer_rank %d not on same node (ranks=[%d,%d])"
                % (rank, pr, peer_rank_start, peer_rank_end)
            )

        self.alignment = 256
        self.static_size = ((static_size + self.alignment - 1) // self.alignment) * self.alignment
        self.dynamic_size = ((dynamic_size + self.alignment - 1) // self.alignment) * self.alignment

        # allocate giant pool of device memory
        self.raw = pm.allocate_raw(self.static_size + self.dynamic_size)

        # exchange peer pointers with nccl
        raw_ipc = pm.get_raw_ipc_address(self.raw).cuda()
        peer_raw_ipcs = [torch.empty_like(raw_ipc) for _ in range(world_size)]
        torch.distributed.all_gather(peer_raw_ipcs, raw_ipc)
        peer_raw_ipcs = torch.stack(peer_raw_ipcs).cpu()

        # extract IPC pointers for ranks on same node
        peer_raw = pm.get_raw_peers(
            peer_raw_ipcs[peer_rank_base : peer_rank_base + ngpus], peer_rank, self.raw
        )
        self.peer_raw = [peer_raw[peer_rank - peer_rank_base] for peer_rank in peer_ranks]
        self.static_offset = 0
        self.dynamic_offset = 0
        self.peer_ranks = peer_ranks

    def __del__(self):
        pm.free_raw(self.raw)

    def reset(self):
        self.dynamic_offset = 0

    def allocate_peer_tensors(self, shape, dtype, channels_last, dynamic):
        nels = np.prod(shape)
        if dtype == torch.float16:
            elem_size = 2
            if dynamic:
                start = (
                    (self.dynamic_offset + self.alignment - 1) // self.alignment
                ) * self.alignment
                self.dynamic_offset = start + nels * elem_size
                assert self.dynamic_offset < self.dynamic_size, "Dynamic peer memory pool exhausted"
                return [
                    pm.blob_view_half(pr + self.static_size + start, shape, channels_last)
                    for pr in self.peer_raw
                ]
            else:
                start = (
                    (self.static_offset + self.alignment - 1) // self.alignment
                ) * self.alignment
                self.static_offset = start + nels * elem_size
                assert self.static_offset < self.static_size, "Static peer memory pool exhausted"
                return [pm.blob_view_half(pr + start, shape, channels_last) for pr in self.peer_raw]
        if dtype == torch.float32:
            elem_size = 4
            if dynamic:
                start = (
                    (self.dynamic_offset + self.alignment - 1) // self.alignment
                ) * self.alignment
                self.dynamic_offset = start + nels * elem_size
                assert self.dynamic_offset < self.dynamic_size, "Dynamic peer memory pool exhausted"
                return [
                    pm.blob_view_float(pr + self.static_size + start, shape, channels_last)
                    for pr in self.peer_raw
                ]
            else:
                start = (
                    (self.static_offset + self.alignment - 1) // self.alignment
                ) * self.alignment
                self.static_offset = start + nels * elem_size
                assert self.static_offset < self.static_size, "Static peer memory pool exhausted"
                return [
                    pm.blob_view_float(pr + start, shape, channels_last) for pr in self.peer_raw
                ]
        if dtype == torch.int32:
            elem_size = 4
            if dynamic:
                start = (
                    (self.dynamic_offset + self.alignment - 1) // self.alignment
                ) * self.alignment
                self.dynamic_offset = start + nels * elem_size
                assert self.dynamic_offset < self.dynamic_size, "Dynamic peer memory pool exhausted"
                return [
                    pm.blob_view_int(pr + self.static_size + start, shape, channels_last)
                    for pr in self.peer_raw
                ]
            else:
                start = (
                    (self.static_offset + self.alignment - 1) // self.alignment
                ) * self.alignment
                self.static_offset = start + nels * elem_size
                assert self.static_offset < self.static_size, "Static peer memory pool exhausted"
                return [pm.blob_view_int(pr + start, shape, channels_last) for pr in self.peer_raw]
        else:
            assert False, "dtype %s not supported" % (str(dtype))


================================================
FILE: apex/contrib/sparsity/COPYRIGHT
================================================
Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.


================================================
FILE: apex/contrib/sparsity/README.md
================================================
# Introduction to ASP

This serves as a quick-start for ASP (Automatic SParsity), a tool that enables sparse training and inference for PyTorch models by adding 2 lines of Python.

For details on "[Channel Permutations for N:M Sparsity](https://proceedings.neurips.cc/paper/2021/hash/6e8404c3b93a9527c8db241a1846599a-Abstract.html)," please see the [permutation_tests](permutation_tests/README.md) directory.

## Importing ASP

```
from apex.contrib.sparsity import ASP
```

## Initializing ASP

Apart from the import statement, it is sufficient to add just the following line of code before the training phase to augment the model and the optimizer for sparse training/inference:

```
ASP.prune_trained_model(model, optimizer)
```

In the context of a typical PyTorch training loop, it might look like this:

```
ASP.prune_trained_model(model, optimizer)

x, y = DataLoader(args)
for epoch in range(epochs):
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    loss.backward()
    optimizer.step()

torch.save(...)
```

The `prune_trained_model` step calculates the sparse mask and applies it to the weights. This is done once, i.e., sparse locations in the weights matrix remain fixed after this step. 

## Generate a Sparse Network

The following approach serves as a guiding example on how to generate a pruned model that can use Sparse Tensor Cores in the NVIDIA Ampere Architecture. This approach generates a model for deployment, i.e. inference mode.

```
(1) Given a fully trained (dense) network, prune parameter values in a 2:4 sparse pattern.
(2) Fine-tune  the  pruned  model  with  optimization  method  and  hyper-parameters (learning-rate, schedule, number of epochs, etc.) exactly as those used to obtain the trained model.
(3) (If required) Quantize the model.
```

In code, below is a sketch on how to use ASP for this approach (steps 1 and 2 above).

```
model = define_model(..., pretrained=True) # define model architecture and load parameter tensors with trained values (by reading a trained checkpoint)
criterion = ... # compare ground truth with model predition; use the same criterion as used to generate the dense trained model
optimizer = ... # optimize model parameters; use the same optimizer as used to generate the dense trained model
lr_scheduler = ... # learning rate scheduler; use the same schedule as used to generate the dense trained model

from apex.contrib.sparsity import ASP     
ASP.prune_trained_model(model, optimizer) #pruned a trained model

x, y = DataLoader(args)
for epoch in range(epochs): # train the pruned model for the same number of epochs as used to generate the dense trained model
    y_pred = model(x)
    loss = criterion(y_pred, y)
    lr_scheduler.step()
    loss.backward()
    optimizer.step()

torch.save(...) # saves the pruned checkpoint with sparsity masks 
```

## Non-Standard Usage

If your goal is to easily perpare a network for accelerated inference, please follow the recipe above.  However, ASP can also be used to perform experiments in advanced techniques like training with sparsity from initialization. For example, in order to recompute the sparse mask in between training steps, use the following method:

```
ASP.compute_sparse_masks()
```

A more thorough example can be found in `./test/toy_problem.py`. 

## Advanced Usage: Channel Permutation

We introduce channel permutations as an advanced method to maximize the accuracy of structured sparse networks. By permuting weight matrices along their channel dimension and adjusting the surrounding layers appropriately, we demonstrate accuracy recovery for even small, parameter-efficient networks, without affecting inference run-time.

The final accuracy has a strong relationship with the quality of permutations. We provide the default algorithms to search for high-quality permutations. The permutation search process can be accelerated by the Apex CUDA extension: `apex.contrib.sparsity.permutation_search_kernels`

If you want to use the GPU to accelerate the permutation search process, we recommend installing Apex with permutation search CUDA extension via

```
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--permutation_search" ./
```

If you want to disable the permutation search process, please pass the `allow_permutation=False` to `init_model_for_pruning` function. For example:

```
ASP.init_model_for_pruning(model, mask_calculator="m4n2_1d", verbosity=2, whitelist=[torch.nn.Linear, torch.nn.Conv2d], allow_recompute_mask=False, allow_permutation=False)
```

Please notice, when using multi-GPUs we should set the identical random seed for all GPUs to make sure the same results generated in permutation search. The library has implemented the `set_identical_seed` function in `permutation_lib.py`, and be called in ASP library. We still suggest the users to set the identical random seed when using multi-GPUs in their code, the example code is as follows:

```
import torch
import numpy
import random

torch.manual_seed(identical_seed)
torch.cuda.manual_seed_all(identical_seed)
numpy.random.seed(identical_seed)
random.seed(identical_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
```

## Reference Papers

More details about sparsity support on the NVIDIA Ampere GPU with Sparse Tensor Cores can refer to our [white paper](https://arxiv.org/abs/2104.08378).

```
@article{mishra2021accelerating,
  title={Accelerating sparse deep neural networks},
  author={Mishra, Asit and Latorre, Jorge Albericio and Pool, Jeff and Stosic, Darko and Stosic, Dusan and Venkatesh, Ganesh and Yu, Chong and Micikevicius, Paulius},
  journal={arXiv preprint arXiv:2104.08378},
  year={2021}
}
```

The details about sparsity with permutation can refer to our [paper](https://proceedings.neurips.cc/paper/2021/hash/6e8404c3b93a9527c8db241a1846599a-Abstract.html) published in *Thirty-fourth Conference on Neural Information Processing Systems* (**NeurIPS 2021**):

```
@inproceedings{pool2021channel,
  author    = {Pool, Jeff and Yu, Chong},
  booktitle = {Advances in Neural Information Processing Systems ({NeurIPS})},
  title     = {Channel Permutations for {N:M} Sparsity},
  url       = {https://proceedings.neurips.cc/paper/2021/file/6e8404c3b93a9527c8db241a1846599a-Paper.pdf},
  volume    = {34},
  year      = {2021}
}

```


================================================
FILE: apex/contrib/sparsity/__init__.py
================================================
from .sparse_masklib import create_mask
from .asp import ASP


================================================
FILE: apex/contrib/sparsity/asp.py
================================================
import types
import torch
from .sparse_masklib import create_mask
from .permutation_lib import Permutation

torchvision_imported = True
try:
    import torchvision
except ImportError:
    print("[ASP][Warning] torchvision cannot be imported.")
    torchvision_imported = False

import os
import time


def eligible_modules(model, whitelist_layer_types, allowed_layer_names, disallowed_layer_names):
    eligible_modules_list = []
    for name, mod in model.named_modules():
        if isinstance(mod, whitelist_layer_types) and name not in disallowed_layer_names:
            if allowed_layer_names is not None and name not in allowed_layer_names:
                continue
            eligible_modules_list.append((name, mod))
    return eligible_modules_list


class ASP:
    __model = None
    __verbosity = 0
    __optimizer = None
    __sparse_parameters = []
    __calculate_mask = None
    __allow_permutation = True
    __all_parameters = []
    __save_permutation_graph = False
    __permutation_output_dir = ""

    @classmethod
    def init_model_for_pruning(
        cls,
        model,
        mask_calculator="m4n2_1d",
        verbosity=3,
        whitelist=[
            torch.nn.Linear,
            torch.nn.Conv1d,
            torch.nn.Conv2d,
            torch.nn.Conv3d,
            torch.nn.MultiheadAttention,
        ],
        allowed_layer_names=None,
        disallowed_layer_names=[],
        allow_recompute_mask=False,
        custom_layer_dict={},
        allow_permutation=True,
    ):
        """Call this method to modify your model to take advantage of sparse matrix multiplication.
        Note that this call alone only augments the model with additional buffers needed for sparse MMA,
        it does not enable use of sparse MMA.

        If you are starting with a fresh model:

        model = ...
        ASP.init_model_for_pruning(model, mask_calculator, ...)
        if (training) ASP.init_optimizer_for_pruning(optimizer)
        ASP.compute_sparse_masks() // sparsity is off by default, call when youy want to enable it.

        If you are starting from a checkpoint:

        model = ...
        ASP.init_model_for_pruning(model, mask_calculator, ...)
        torch.load(...)
        if (training) ASP.init_optimizer_for_pruning(optimizer)

        Arguments:
          model                    The model
          mask_calculator          Either callable that computes mask given a tensor OR pattern string for sparse mask lib.
          verbosity                Integer controling verbosity level.
                                   0 -> Only errors.
                                   1 -> Errors and warnings.
                                   2 -> Errors, warnings and info.
                                   3 -> Errors, warnings, info and debug.
          whitelist                Module types approved for sparsity.
          allowed_layer_names      If not None, only layer names that appear in this list are considered for sparsity.
          disallowed_layer_names   If not [], only layer names that do not appear in this list are considered for sparsity.
          allow_recompute_mask     If True, stores pruned values so that dense weights can be restored.
                                   Pruned weights are stored in CPU memory, hence this option does not increase GPU memory usage.
          custom_layer_dict        Dictionary of additional layer paremeters to sparsify. e.g. {CustomLinear: ['weight']}
          allow_permutation        If True, allow the input channel permutation to ease the influence of weight pruning.

          [Future] Support for allow_recompute_mask can be removed, it is not part of sparse inference recipe.
        """
        assert cls.__model is None, "ASP has been initialized already."
        cls.__model = model
        cls.__verbosity = verbosity
        cls.__allow_permutation = allow_permutation

        if isinstance(mask_calculator, str):

            def create_mask_from_pattern(param):
                return create_mask(param, mask_calculator).bool()

            cls.__calculate_mask = create_mask_from_pattern
        else:
            cls.__calculate_mask = mask_calculator  # user defined function

        # function to extract variables that will be sparsified.
        # idea is that you will add one of these functions for each module type that can be sparsified.
        if torchvision_imported:
            print(
                "[ASP] torchvision is imported, can work with the MaskRCNN/KeypointRCNN from torchvision."
            )
            torchvision_version = str(torchvision.__version__)
            torchvision_version_major = int(torchvision_version.split(".")[0])
            torchvision_version_minor = int(torchvision_version.split(".")[1])
            if torchvision_version_major == 0 and torchvision_version_minor < 12:
                sparse_parameter_list = {
                    torch.nn.Linear: ["weight"],
                    torch.nn.Conv1d: ["weight"],
                    torch.nn.Conv2d: ["weight"],
                    torch.nn.Conv3d: ["weight"],
                    torch.nn.modules.linear.NonDynamicallyQuantizableLinear: ["weight"],
                    torch.nn.MultiheadAttention: [
                        "q_proj_weight",
                        "k_proj_weight",
                        "v_proj_weight",
                        "in_proj_weight",
                    ],
                    torchvision.ops.misc.Conv2d: ["weight"],
                }
            else:  # Torchvision remove APIs that were deprecated before 0.8 (#5386) in 0.12.0, torchvision.ops.misc.Conv2d is removed
                sparse_parameter_list = {
                    torch.nn.Linear: ["weight"],
                    torch.nn.Conv1d: ["weight"],
                    torch.nn.Conv2d: ["weight"],
                    torch.nn.Conv3d: ["weight"],
                    torch.nn.modules.linear.NonDynamicallyQuantizableLinear: ["weight"],
                    torch.nn.MultiheadAttention: [
                        "q_proj_weight",
                        "k_proj_weight",
                        "v_proj_weight",
                        "in_proj_weight",
                    ],
                }
        else:
            sparse_parameter_list = {
                torch.nn.Linear: ["weight"],
                torch.nn.Conv1d: ["weight"],
                torch.nn.Conv2d: ["weight"],
                torch.nn.Conv3d: ["weight"],
                torch.nn.modules.linear.NonDynamicallyQuantizableLinear: ["weight"],
                torch.nn.MultiheadAttention: [
                    "q_proj_weight",
                    "k_proj_weight",
                    "v_proj_weight",
                    "in_proj_weight",
                ],
            }
        if custom_layer_dict:  # Update default list to include user supplied custom (layer type : parameter tensor), make sure this tensor type is something ASP knows how to prune
            sparse_parameter_list.update(custom_layer_dict)
            whitelist += list(custom_layer_dict.keys())

        for module_type in whitelist:
            assert module_type in sparse_parameter_list, (
                "Module %s :: Don't know how to sparsify module." % module.dtype()
            )

        # find all sparse modules, extract sparse parameters and decorate
        def add_sparse_attributes(module_name, module):
            sparse_parameters = sparse_parameter_list[type(module)]
            for p_name, p in module.named_parameters():
                if p_name in sparse_parameters and p.requires_grad:
                    # check for NVIDIA's TC compatibility: we check along the horizontal direction
                    if p.dtype == torch.float32 and (
                        (p.size()[0] % 8) != 0 or (p.size()[1] % 16) != 0
                    ):  # User defines FP32 and APEX internally uses FP16 math
                        print(
                            "[ASP] Auto skipping pruning %s::%s of size=%s and type=%s for sparsity"
                            % (module_name, p_name, str(p.size()), str(p.dtype))
                        )
                        continue
                    if p.dtype == torch.float16 and (
                        (p.size()[0] % 8) != 0 or (p.size()[1] % 16) != 0
                    ):  # For Conv2d dim= K x CRS; we prune along C
                        print(
                            "[ASP] Auto skipping pruning %s::%s of size=%s and type=%s for sparsity"
                            % (module_name, p_name, str(p.size()), str(p.dtype))
                        )
                        continue

                    if cls.__verbosity >= 3:
                        print(
                            "[ASP] Sparsifying %s::%s of size=%s and type=%s for sparsity"
                            % (module_name, p_name, str(p.size()), str(p.dtype))
                        )

                    mask = torch.ones_like(p).bool()
                    buffname = p_name.split(".")[-1]  # buffer names cannot contain "."
                    module.register_buffer("__%s_mma_mask" % buffname, mask)
                    if allow_recompute_mask:
                        pruned = torch.zeros_like(p).cpu()
                        module.register_buffer("__%s_mma_pruned_p" % buffname, pruned)
                    else:
                        pruned = None
                    cls.__sparse_parameters.append((module_name, module, p_name, p, mask, pruned))
                else:
                    if cls.__verbosity >= 3:
                        print(
                            "[ASP] Not sparsifying %s::%s of size=%s and type=%s"
                            % (module_name, p_name, str(p.size()), str(p.dtype))
                        )

        for name, sparse_module in eligible_modules(
            model, tuple(whitelist), allowed_layer_names, disallowed_layer_names
        ):
            add_sparse_attributes(name, sparse_module)

        if allow_permutation:  # find all named modules, extract parameters and decorate, used for offline permutation in K dim
            for module_name, module in model.named_modules():
                module_type_str = str(type(module)).split("'")[1]
                if (
                    module_type_str == "torch.nn.modules.container.Sequential"
                    or module_type_str.startswith("torchvision.models")
                ):
                    # filter out the 'torch.nn.modules.container.Sequential' type and the whole model, like 'torchvision.models.vgg.VGG'
                    continue
                for p_name, p in module.named_parameters():
                    cls.__all_parameters.append((module_name, module, p_name, p))
                if module_type_str == "torch.nn.modules.batchnorm.BatchNorm2d":
                    # need to get the running_mean and running_var from model.state_dict(), as they are not the learnable parameters
                    module_mean_name = module_name + ".running_mean"
                    module_var_name = module_name + ".running_var"
                    for param_key in model.state_dict():
                        if module_mean_name == param_key or module_var_name == param_key:
                            cls.__all_parameters.append(
                                (
                                    module_name,
                                    module,
                                    param_key.split(".")[-1],
                                    model.state_dict()[param_key],
                                )
                            )
            # add the __permutation_output_dir field to save the intermediate results for permutation
            cls.__permutation_output_dir = "."
            # Set the corresponding params from ASP class to the Permutation class
            permutation_verbosity = 5
            Permutation.set_permutation_params_from_asp(
                cls.__model,
                cls.__sparse_parameters,
                cls.__all_parameters,
                permutation_verbosity,
            )
            # Set the identical random seed for all GPUs to make sure the same results generated in permutation search
            Permutation.set_identical_seed()

    @classmethod
    def already_init_asp_model(cls):
        """Call this method to check whether ASP has been initialized already."""
        if cls.__model is None:
            if cls.__verbosity >= 3:
                print("[ASP] ASP has not been initialized.")
                return False
        else:
            if cls.__verbosity >= 3:
                print("[ASP] ASP has been initialized already.")
                return True

    @classmethod
    def init_optimizer_for_pruning(cls, optimizer):
        """Call this method to monkey patch optimizer step function so that masks can be applied to
        gradients and weights during training.
        You must call init_model_for_pruning(...) before calling init_optimizer_for_pruning(...)
        """
        assert cls.__optimizer is None, "ASP has initialized optimizer already."
        assert cls.__calculate_mask is not None, (
            "Called ASP.init_optimizer_for_pruning before ASP.init_model_for_pruning."
        )

        # store pointer to original optimizer step method
        cls.__optimizer = optimizer
        cls.__optimizer.__step = optimizer.step

        def __step(opt_self, *args, **kwargs):
            # prune gradients before step method
            with torch.no_grad():
                for (
                    module_name,
                    module,
                    p_name,
                    p,
                    mask,
                    pruned,
                ) in cls.__sparse_parameters:
                    if p.grad is not None:  # thx pjudd
                        p.grad.mul_(mask)
            # call original optimizer step method
            rval = opt_self.__step(*args, **kwargs)
            # prune parameters after step method
            with torch.no_grad():
                for (
                    module_name,
                    module,
                    p_name,
                    p,
                    mask,
                    pruned,
                ) in cls.__sparse_parameters:
                    p.mul_(mask)
            return rval

        cls.__optimizer.step = types.MethodType(__step, cls.__optimizer)

    @classmethod
    def compute_sparse_masks(cls):
        """Call this method to enable sparsity.
        If init(...) was called with allow_recompute_mask=False AND sparsity is disabled, pruned field can be None.
        """
        with torch.no_grad():
            if cls.__allow_permutation:
                # Step 1: use the Torch.FX library to build the graph
                # Step 2: permutation search with the customized kernel
                # The simplest without user intervention:
                # A. try to import with the distributed mode of the original model
                # B. if meet the error, import with the none-distributed mode of the original model
                start_time_permute = time.perf_counter()
                successful_permutation = False
                try:
                    successful_permutation = Permutation.permute_model(
                        cls.__model.module,
                        dump_fx_graph=cls.__save_permutation_graph,
                        save_dumped_fx_graph=os.path.join(
                            cls.__permutation_output_dir,
                            "model_offline_permutation_graph.json",
                        ),
                    )
                    if successful_permutation:
                        print("\n[compute_sparse_masks] permuted the (distributed) model.")
                except AttributeError:
                    successful_permutation = Permutation.permute_model(
                        cls.__model,
                        dump_fx_graph=cls.__save_permutation_graph,
                        save_dumped_fx_graph=os.path.join(
                            cls.__permutation_output_dir,
                            "model_offline_permutation_graph.json",
                        ),
                    )
                    if successful_permutation:
                        print("\n[compute_sparse_masks] permuted the model.")

                if successful_permutation:
                    duration_build_offline_permutation_graph = (
                        time.perf_counter() - start_time_permute
                    )
                    print(
                        "[compute_sparse_masks] Take {:.4f} seconds to find and apply permutations.".format(
                            duration_build_offline_permutation_graph
                        )
                    )

            for module_name, module, p_name, p, mask, pruned in cls.__sparse_parameters:
                if mask.sum() < mask.numel():  # when recalculating masks
                    # restore dense parameter if allow_recompute_mask is enabled
                    assert pruned is not None, (
                        "Unable to restore dense parameter because allow_recompute_mask == False"
                    )
                    p.add_(pruned.cuda())

                mask.set_(cls.__calculate_mask(p))

                if pruned is not None:  # stow away pruned weights to cpu
                    pruned.set_((p * (~mask)).cpu())

                p.mul_(
                    mask
                )  # in-place multiplication, so pruned weights are 0-values, hence checkpoint will have 0s for pruned weights
                if cls.__verbosity >= 2:
                    print(
                        "[ASP] Enabled %.2f%% sparsity for %s::%s of size=%s and type=%s with magnitude %s"
                        % (
                            100.0 - 100.0 * mask.sum() / mask.numel(),
                            module_name,
                            p_name,
                            str(p.size()),
                            str(p.dtype),
                            torch.sum(torch.abs(p)),
                        )
                    )

    @classmethod
    def restore_pruned_weights(cls):
        """Call this method to disable sparsity and restore all weights.
        This will only work if init(...) was called with allow_recompute=True.
        """
        with torch.no_grad():
            for module_name, module, p_name, p, mask, pruned in cls.__sparse_parameters:
                if mask.sum() < mask.numel():
                    assert pruned is not None, (
                        "Unable to restore dense parameter because allow_recompute_mask == False"
                    )
                    p.add_(pruned.cuda())
                    mask.fill_(1)
                    pruned.zero_()
                    if cls.__verbosity >= 2:
                        print(
                            "[ASP] Disabled sparsity for %s::%s (dense weights restored)"
                            % (module_name, p_name)
                        )

    @classmethod
    def is_sparsity_enabled(cls):
        """Call this method to determine if sparsity is enabled in the model.
        The typical use case is right after checkpoint has been loaded.
        """
        total, sp100, sp50 = 0, 0, 0
        for module_name, module, p_name, p, mask, pruned in cls.__sparse_parameters:
            total += 1
            mask_sum = mask.sum()
            mask_numel = mask.numel()
            if mask_sum == mask_numel:
                sp100 += 1
            elif mask_sum * 2 == mask_numel:
                sp50 += 1

        assert total == sp100 or total == sp50, "Inconsistent model sparsity"
        if total == sp100:
            return False
        elif total == sp50:
            return True

    @classmethod
    def prune_trained_model(cls, model, optimizer):
        # add mask buffers to model (init_model_for_pruning), augment optimizer (init_optimizer_for_pruning) and compute masks (compute_sparse_masks)
        cls.init_model_for_pruning(
            model,
            mask_calculator="m4n2_1d",
            verbosity=2,
            whitelist=[torch.nn.Linear, torch.nn.Conv2d, torch.nn.MultiheadAttention],
            allow_recompute_mask=False,
        )
        cls.init_optimizer_for_pruning(optimizer)
        cls.compute_sparse_masks()

    @classmethod
    def set_permutation_saving_params(
        cls,
        allow_permutation=True,
        save_permutation_graph=False,
        permutation_output_dir=".",
    ):
        """This function is used to set the permutation saving related parameters in ASP class and inside of the Permutation class."""
        print("\n[ASP][set_permutation_saving_param] Set permutation saving related parameters")
        print("\n[set_permutation_saving_param] Set permutation saving related parameters")
        cls.__allow_permutation = allow_permutation
        print(
            "[set_permutation_saving_param]\t Allow permutation: {}".format(cls.__allow_permutation)
        )
        cls.__save_permutation_graph = save_permutation_graph
        print(
            "[set_permutation_saving_param]\t Save permutation graphs: {}".format(
                cls.__save_permutation_graph
            )
        )
        cls.__permutation_output_dir = permutation_output_dir
        print(
            "[set_permutation_saving_param]\t Permutation graphs saving dir: {}".format(
                cls.__permutation_output_dir
            )
        )

        Permutation.set_permutation_saving_params(
            allow_permutation, save_permutation_graph, permutation_output_dir
        )


================================================
FILE: apex/contrib/sparsity/permutation_lib.py
================================================
import os
import torch
import json
import string
import time
import numpy as np
import builtins as __builtin__
import io

try:
    from .permutation_search_kernels import (
        accelerated_search_for_good_permutation,
        sum_after_2_to_4,
    )

    print("[ASP][Info] permutation_search_kernels can be imported.")
except ImportError:
    print("[ASP][Warning] permutation_search_kernels cannot be imported.")
    print(
        "[ASP][Warning] If you want to accelerate the permutation search process by GPU, please build APEX by following the instructions at https://github.com/NVIDIA/apex/blob/master/apex/contrib/sparsity/README.md"
    )


def convert_fx_node_name(fx_node_name):
    """Standardize punctuation of a node's name: replace all '_' with '.'"""
    return fx_node_name.replace("_", ".")


def get_node_parent_children(fx_node):
    """Populate lists of all direct parents and children of a node"""
    # get node parent list, and convert node name to module name
    node_parent_name_converted = []
    if len(fx_node.all_input_nodes) > 0:
        node_parent = fx_node.all_input_nodes
        for item in node_parent:
            converted_item = convert_fx_node_name(item.name)
            node_parent_name_converted.append(converted_item)
    else:
        node_parent = []

    # get node children list, and convert node name to module name
    node_children_name_converted = []
    if len(list(fx_node.users.keys())) > 0:
        node_children = list(fx_node.users.keys())
        for item in node_children:
            converted_item = convert_fx_node_name(item.name)
            node_children_name_converted.append(converted_item)
    else:
        node_children = []

    return node_parent_name_converted, node_children_name_converted


def node_name_matches(node_name, module_name):
    """Check for a match between graph node name and stored module name, accounting for formatting and DDP training differences"""

    # process: remove all punctuation, everything to lower case
    def process(name):
        return "".join(c for c in name if c not in string.punctuation).lower()

    processed_node_name = process(node_name)
    processed_module_name = process(module_name)

    # module names start with 'module.' in distributed data-parallel training, but fx graph node names don't; check for both
    distributed_node_name = "module." + node_name
    distributed_processed_node_name = "module" + processed_node_name

    return (
        (node_name == module_name)
        or (distributed_node_name == module_name)
        or (processed_node_name == processed_module_name)
        or (distributed_processed_node_name == processed_module_name)
    )


def replicate_sequence(sequence, replications):
    """Replicate a permutation to apply it to an even multiple of channel counts"""
    replicated_sequence = []

    for rep in range(replications):
        offset = len(sequence) * rep
        for c in sequence:
            replicated_sequence.append(c + offset)

    return replicated_sequence


class Permutation:
    __model = None
    __sparse_parameters = []
    __allow_permutation = False
    __all_parameters = []
    __verbosity = 0  ## 0: errors only, 1: also high-level details, warnings, 2: also intermediate steps, 3: everything
    __params_permuted_in_C = []
    __params_permuted_in_K = []
    __unpermuted_dims = []

    __save_permutation_graph = False
    __permutation_output_dir = ""
    __manual_seed = None
    __tcpstore_port = 2341

    # these module types may be the target of permutations (have potentially sparse weights or are attributes with no parents)
    __permutation_target_module_types = [
        "torch.nn.modules.conv.Conv1d",
        "torch.nn.modules.conv.Conv2d",
        "torch.nn.modules.linear.Linear",
        "torch.nn.modules.linear.LazyLinear",
        "torch.nn.modules.linear.NonDynamicallyQuantizableLinear",
        "torch.nn.modules.activation.MultiheadAttention",
        "get_attr",
    ]

    # these module types are not permuted, but must pass any permutation seen by a child's C or passed-thru K to the parents' K
    __simple_passthru_module_types = [
        "torch.nn.modules.activation.ReLU6",
        "torch.nn.modules.activation.ReLU",
        "torch.nn.modules.dropout.Dropout",
        "torch.nn.modules.dropout.Dropout1d",
        "torch.nn.modules.dropout.Dropout2d",
        "torch.nn.modules.dropout.Dropout3d",
        "torch.nn.modules.dropout.AlphaDropout",
        "torch.nn.modules.dropout.FeatureAlphaDropout",
        "torch.nn.modules.pooling.MaxPool2d",
        "torch.nn.modules.pooling.AdaptiveAvgPool2d",
        "torch.nn.modules.pooling.AvgPool2d",
        "torch.nn.modules.activation.Hardsigmoid",
        "torch.nn.modules.activation.Hardswish",
        "torch.nn.modules.activation.GELU",
        "torch.nn.modules.normalization.LocalResponseNorm",
        "torch.nn.modules.activation.Softmin",
        "torch.nn.modules.activation.Softmax",
        "torch.nn.modules.activation.Softmax2d",
        "torch.nn.modules.activation.LogSoftmax",
        "torch.nn.modules.activation.AdaptiveLogSoftmaxWithLoss",
        "torch.nn.modules.activation.SiLU",
        "torch.nn.modules.activation.Sigmoid",
        "concat",
        "torch.nn.modules.flatten.Flatten",  # if it's a problem, it'll be handled via dimension mismatch check
    ]

    # these module types have parameters that must be permuted along K as well as need to pass the permutation thru to parents' K
    __permute_K_and_passthru_module_types = [
        "torch.nn.modules.batchnorm.BatchNorm2d",
        "torch.nn.modules.normalization.LayerNorm",
        "torch.nn.modules.instancenorm.InstanceNorm2d",
        "torch.nn.modules.batchnorm.SyncBatchNorm",
    ]

    # these module types cannot be permuted safely (today), and cause neighboring layers to have permutations disabled
    __disallow_permutations_module_types = [
        "torch.nn.modules.normalization.GroupNorm",  # to handle: influence GCD of real children's sibling group
        "torch.nn.modules.linear.Bilinear",  # need to permute one input along in1_features and the other along in2_features
        "torch.nn.modules.activation.GLU",  # may work OOTB, but might need to explicitly handle dimsionality change
    ]

    @classmethod
    def set_identical_seed(cls, identical_seed=1):
        """Make all GPUs in DDP use the same seed to find identical permutations and not require syncing parameters later"""

        if cls.__verbosity > 0:
            print(
                "[set_identical_seed] Set the identical seed: {:} for all GPUs to make sure the same results generated in permutation search".format(
                    identical_seed
                )
            )

        cls.__manual_seed = identical_seed
        cls.reset_seed()

    @classmethod
    def reset_seed(cls):
        """To find the same permutations no matter how many GPUs are used, we reset the seed before every search"""

        identical_seed = cls.__manual_seed
        assert identical_seed is not None, "Must call set_identical_seed() before it can be reset"

        torch.manual_seed(identical_seed)
        torch.cuda.manual_seed(identical_seed)
        import random

        np.random.seed(identical_seed)
        random.seed(identical_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    @classmethod
    def set_tcpstore_port(cls, tcpstore_port):
        """Override the default port if it is in use in a distributed training session"""

        cls.__tcpstore_port = tcpstore_port
        if cls.__verbosity > 0:
            print(f"[set_tcpstore_port] TCPStore port set to {cls.__tcpstore_port} .")

    @classmethod
    def set_permutation_saving_params(
        cls,
        allow_permutation=False,
        save_permutation_graph=False,
        permutation_output_dir=".",
    ):
        """This function is used to set the permutation saving related parameters."""

        cls.__allow_permutation = allow_permutation
        cls.__save_permutation_graph = save_permutation_graph
        cls.__permutation_output_dir = permutation_output_dir

        if cls.__verbosity > 0:
            print(
                f"[permutation_lib][set_permutation_saving_param] Set permutation saving related parameters\n\tAllow permutation: {cls.__alow_permutation}\n\tSave permutation graphs: {cls.__save_permutation_graph}\n\tPermutation graphs saving dir: {cls.__permutation_output_dir}"
            )

    @classmethod
    def set_permutation_params_from_asp(cls, model, sparse_parameters, all_parameters, verbosity):
        """This function is used to set the permutation needed parameters from ASP class."""
        cls.__verbosity = verbosity

        if cls.__verbosity > 0:
            print("[set_permutation_params_from_asp] Set permutation needed parameters")
        cls.__model = model
        cls.__sparse_parameters = sparse_parameters
        cls.__all_parameters = all_parameters

        if cls.__verbosity > 1:
            sparse_param_names = [
                module_name + ":" + p_name
                for (
                    module_name,
                    module,
                    p_name,
                    p,
                    mask,
                    pruned,
                ) in cls.__sparse_parameters
            ]
            all_param_names = [
                module_name + ":" + p_name
                for (module_name, module, p_name, p) in cls.__all_parameters
            ]
            print(
                f"\tSparse parameter names: {sparse_param_names}\n\tAll parameter names: {all_param_names}"
            )

        cls.__params_permuted_in_C = []
        cls.__params_permuted_in_K = []
        cls.__unpermuted_dims = []

    @classmethod
    def permute_model(
        cls,
        model,
        dump_fx_graph=False,
        save_dumped_fx_graph="./model_permutation_graph.json",
    ):
        """Permute a model's weights in order to maintain more magnitude after enforcing the sparsity constraint."""

        if cls.__verbosity > 0:
            print("\n[permute_model] Permuting the model")

        # extract the output_dir, so all the intermediate fx_graph can be saved under that path
        extract_output_dir = os.path.split(save_dumped_fx_graph)[0]
        cls.__permutation_output_dir = extract_output_dir
        fx_graph, success_in_build_fx_graph = cls.build_fx_graph(
            model,
            dump_fx_graph=dump_fx_graph,
            save_dumped_fx_graph=save_dumped_fx_graph,
        )

        if success_in_build_fx_graph:
            fx_graph_after_init_flags = cls.init_permutation_flags(fx_graph)
            fx_graph_after_find_real_parents = cls.find_real_parents(fx_graph_after_init_flags)
            fx_graph_after_find_real_children = cls.find_real_children(
                fx_graph_after_find_real_parents
            )
            fx_graph_after_making_groups = cls.make_sibling_coparent_groups(
                fx_graph_after_find_real_children
            )
            fx_graph_after_fixup_concats = cls.fixup_concats(fx_graph_after_making_groups)
            fx_graph_after_enforce_dimension_agreement = cls.enforce_dimension_agreement(
                fx_graph_after_fixup_concats
            )
            fx_graph_after_propagate_flags = cls.propagate_permutation_flags(
                fx_graph_after_enforce_dimension_agreement
            )

            start_time_search_for_good_permutation = time.perf_counter()
            fx_graph_after_find_permutations = cls.find_permutations(fx_graph_after_propagate_flags)

            if torch.distributed.is_initialized():
                if cls.__verbosity > 0:
                    duration_search_for_good_permutation = (
                        time.perf_counter() - start_time_search_for_good_permutation
                    )
                    print(
                        f"[permute_model] Rank {torch.distributed.get_rank()} completed search in {duration_search_for_good_permutation:.2f}s, waiting for others.",
                        force=True,
                    )
                torch.distributed.barrier()

            duration_search_for_good_permutation = (
                time.perf_counter() - start_time_search_for_good_permutation
            )
            if cls.__verbosity > 0:
                print(
                    "\n[permute_model] Take {:.4f} seconds to finish search_for_good_permutation function.".format(
                        duration_search_for_good_permutation
                    )
                )

            fx_graph_after_sync_permutations = cls.sync_permutations(
                fx_graph_after_find_permutations
            )
            fx_graph_after_apply_permutations = cls.apply_permutations(
                fx_graph_after_sync_permutations
            )
            cls.check_graph_for_unpermuted_nodes(fx_graph_after_apply_permutations)

            fx_graph = fx_graph_after_apply_permutations

        if cls.__save_permutation_graph:
            cls.save_graph_to_json(
                fx_graph,
                save_dumped_graph_path_with_name=os.path.join(
                    cls.__permutation_output_dir, "./model_graph_permutation_graph.json"
                ),
            )  # save the intermediate graph as JSON file for debugging

        return success_in_build_fx_graph

    @classmethod
    def get_permutation_stats(cls):
        """Return statistics for how many permutations were applied in various dimensions, used for testing"""

        return (
            cls.__params_permuted_in_C,
            cls.__params_permuted_in_K,
            cls.__unpermuted_dims,
        )

    @classmethod
    def apply_permutation_in_C_dim(cls, node_name, permutation_sequence, dryrun):
        """This function is used to permutation for a node in C dim. (Only need to handle the weight of the node)"""

        if cls.__verbosity > 1 and dryrun:
            print(
                "[apply_permutation_in_C_dim] Permutation for node: '{:}' in C dim".format(
                    node_name
                )
            )

        if len(permutation_sequence) == 0:
            if cls.__verbosity >= 0:
                print(
                    f"ERROR: [apply_permutation_in_C_dim] the permutation sequence for node {node_name} is empty, fail to apply permutation in C dim."
                )
            return False

        is_node_in_sparse_parameters = False
        success_permutation = False
        for module_name, module, p_name, p, mask, pruned in cls.__sparse_parameters:
            if node_name_matches(node_name, module_name):
                if cls.__verbosity > 2 and dryrun:
                    print(
                        "[apply_permutation_in_C_dim] find the node: '{:}' '{:}' in cls.__sparse_parameters, succeed to apply permutation in C dim.".format(
                            node_name, p_name
                        )
                    )
                is_node_in_sparse_parameters = True
                permutation_to_apply = permutation_sequence
                if p.shape[1] != len(
                    permutation_sequence
                ):  # assumed to be grouped convolutions or concatenated weights
                    if p.shape[1] % len(permutation_sequence) != 0:
                        return False

                    permutation_to_apply = replicate_sequence(
                        permutation_sequence, p.shape[1] // len(permutation_sequence)
                    )

                if not dryrun:
                    p.data.copy_(p[:, permutation_to_apply, ...])
                    cls.__params_permuted_in_C.append(node_name + "." + p_name)

                success_permutation = True
        if not is_node_in_sparse_parameters:
            # A special case: if the node itself not in sparse_module_names but one of its real_siblings in sparse_module_names, then the node will not do the permutation search, but it may need to apply the offline permutation in C dim according to the searched permutation sequence from its real_siblings in sparse_module_names
            try:
                for (
                    module_name_from_all_parameters,
                    module_from_all_parameters,
                    p_name_from_all_parameters,
                    p_from_all_parameters,
                ) in cls.__all_parameters:
                    if (
                        node_name_matches(node_name, module_name_from_all_parameters)
                        and p_name_from_all_parameters == "weight"
                    ):
                        if cls.__verbosity > 3 and dryrun:
                            print(
                                "[apply_permutation_in_C_dim] cannot find the node: '{:}' '{:}' in cls.__sparse_parameters, but can find in cls.__all_parameters.".format(
                                    node_name, p_name_from_all_parameters
                                )
                            )
                        permutation_to_apply = permutation_sequence
                        if p_from_all_parameters.shape[1] != len(
                            permutation_sequence
                        ):  # assumed to be grouped convolutions
                            if p_from_all_parameters.shpae[1] % len(permutation_sequence) != 0:
                                return False

                            permutation_to_apply = replicate_sequence(
                                permutation_sequence,
                                p_from_all_parameters.shape[1] // len(permutation_sequence),
                            )

                        if not dryrun:
                            p_from_all_parameters.data.copy_(
                                p_from_all_parameters[:, permutation_to_apply, ...]
                            )
                            cls.__params_permuted_in_C.append(
                                node_name + "." + p_name_from_all_parameters
                            )

                        success_permutation = True
                        if cls.__verbosity > 2 and dryrun:
                            print(
                                "[apply_permutation_in_C_dim] cannot find the node: '{:}' in cls.__sparse_parameters, after trying with cls.__all_parameters, succeed to apply permutation in C dim.".format(
                                    node_name
                                )
                            )
            except:
                success_permutation = False
                if cls.__verbosity >= 0:
                    print(
                        "ERROR: [apply_permutation_in_C_dim] cannot find the node: '{:}' in cls.__sparse_parameters, after trying with cls.__all_parameters, still fail to apply permutation in C dim.".format(
                            node_name
                        )
                    )
        return success_permutation

    @classmethod
    def permute_attr(cls, node_name, permutation_sequence, fx_graph, dryrun):
        """Permute a node's attributes. Somewhat hacky, assumes that we'll find exactly one dimension with a length matching the permutation's"""

        assert "attr" in fx_graph[node_name].keys()
        attr = fx_graph[node_name]["attr"]
        if cls.__verbosity > 1:
            print(f"Found attribute {node_name} of shape {attr.shape}")
        found_perm = False
        for dim in range(len(attr.shape)):
            if attr.shape[dim] == len(permutation_sequence):
                if found_perm:
                    if cls.__verbosity > 0:
                        print(
                            f"\tWARNING: {node_name} has already been permuted, but it's trying to happen again along another dimension {dim}."
                        )

                    return False

                found_perm = True
                if cls.__verbosity > 1 and dryrun:
                    print(f"\tpermuting along dimension {dim}")

                if not dryrun:
                    # permute the dimension of interest to the front, permute within that dimension, then reset it
                    order = [c for c in range(len(attr.shape))]
                    order[0] = dim
                    order[dim] = 0
                    prmt = tuple(order)

                    temp_weight = torch.clone(attr)
                    temp_weight = torch.permute(temp_weight, prmt)
                    temp_weight.copy_(temp_weight[permutation_sequence, ...])
                    temp_weight = torch.permute(temp_weight, prmt)
                    attr.data.copy_(temp_weight)

                    cls.__params_permuted_in_K.append(node_name + "_" + str(dim))

        return found_perm

    @classmethod
    def apply_permutation_in_K_dim(cls, node_name, permutation_sequence, fx_graph, dryrun):
        """This function is used to permutation for a node in K dim. (Need to handle the weight/bias/running_mean/running_var of the node)"""

        if cls.__verbosity > 1:
            print(
                "[apply_permutation_in_K_dim] Permutation for node: '{:}' in K dim".format(
                    node_name
                )
            )

        if len(permutation_sequence) == 0:
            if cls.__verbosity >= 0:
                print(
                    "ERROR: [apply_permutation_in_K_dim] the permutation sequence is empty, fail to apply permutation in K dim."
                )
            return False

        # permute attribute nodes
        if "attr" in fx_graph[node_name].keys():
            return cls.permute_attr(node_name, permutation_sequence, fx_graph, dryrun)

        # if we didn't store the attribute already, look in the modules' parameters
        is_node_in_all_parameters = False
        success_permutation = False

        for module_name, module, p_name, p in cls.__all_parameters:
            if node_name_matches(node_name, module_name):
                if cls.__verbosity > 1 and dryrun:
                    print(
                        "[apply_permutation_in_K_dim] find the node: '{:}' with '{:}' in cls.__all_parameters, may succeed to apply permutation in K dim.".format(
                            node_name, p_name
                        )
                    )
                is_node_in_all_parameters = True
                permutation_to_apply = permutation_sequence

                if p.shape[0] != len(permutation_sequence):  # assumed to be grouped convolutions
                    if cls.__verbosity > 2 and dryrun:
                        print(
                            f"Mismatch in K dimension between found module {module_name} {p_name} for node {node_name}: permutation length {len(permutation_sequence)} but parameter shape in K {p.shape[0]}"
                        )

                    if p.shape[0] % len(permutation_sequence) != 0:
                        return False

                    permutation_to_apply = replicate_sequence(
                        permutation_sequence, p.shape[0] // len(permutation_sequence)
                    )

                    if cls.__verbosity > 1 and dryrun:
                        print(
                            "[apply_permutation_in_K_dim] the node: '{:}' with shape: '{:}' required replicating the permutation sequence with len '{:}' {:} times to succeed in applying the permutation in the K dimension.".format(
                                node_name,
                                p.shape,
                                len(permutation_sequence),
                                p.shape[0] // len(permutation_sequence),
                            )
                        )
                else:
                    if cls.__verbosity > 1 and dryrun:
                        print(
                            "[apply_permutation_in_K_dim] the node: '{:}' with shape: '{:}', can match the size of permutation sequence with len: '{:}', succeed to apply permutation in K dim.".format(
                                node_name, p.shape, len(permutation_sequence)
                            )
                        )

                if not dryrun:
                    p.data.copy_(p[permutation_to_apply, ...])
                    cls.__params_permuted_in_K.append(node_name + "." + p_name)

                success_permutation = True

        if not is_node_in_all_parameters:
            if cls.__verbosity >= 0:
                print(
                    "ERROR: [apply_permutation_in _K_dim] cannot find the node: '{:}' in cls.__all_parameters, fail to apply permutation in K dim.".format(
                        node_name
                    )
                )
            success_permutation = False

        return success_permutation

    @classmethod
    def check_graph_for_unpermuted_nodes(cls, fx_graph):
        """Make sure that all permutable nodes/parameters were actually permuted and all GPUs agree"""

        for node_name in fx_graph.keys():
            node = fx_graph[node_name]

            if "C_permutable" in node.keys() and node["C_permutable"] and not node["C_permuted"]:
                sibling_group_id = node["sibling_group_id"]
                if (
                    node["is_real"]
                    and cls.__group_data["skipped_sibling_groups"][sibling_group_id] is None
                ):
                    if cls.__verbosity >= 0:
                        print(
                            f"{node_name} was C_permutable in a not skipped sibling group but was not permuted along C! {node}"
                        )
                    cls.__unpermuted_dims.append(node_name + "_C")

            if "K_permutable" in node.keys() and node["K_permutable"] and not node["K_permuted"]:
                coparent_group_id = node["coparent_group_id"]
                if (
                    node["is_real"]
                    and cls.__group_data["skipped_coparent_groups"][coparent_group_id] is None
                ):
                    if cls.__verbosity >= 0:
                        print(
                            f"{node_name} was K_permutable in a not skipped coparent group but was not permuted along K! {node}"
                        )
                    cls.__unpermuted_dims.append(node_name + "_K")

        if cls.__verbosity > 0:
            print(
                f"[check_graph_for_unpermuted_nodes] found nodes that missed permutations along {len(cls.__unpermuted_dims)} dimensions."
            )

        # make sure all GPUs agree
        if torch.distributed.is_initialized():
            cls.__unpermuted_dims = sorted(cls.__unpermuted_dims)
            rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()
            dist_store = torch.distributed.TCPStore(
                "127.0.0.1", cls.__tcpstore_port, world_size, rank == 0
            )
            torch.distributed.barrier()

            dist_store.set(str(rank), ",".join(cls.__unpermuted_dims))
            torch.distributed.barrier()

            if rank == 0:
                my_list = dist_store.get("0").decode()

                for peer in range(1, world_size):
                    peer_list = dist_store.get(str(peer)).decode()
                    assert my_list == peer_list, (
                        f"peer {peer} disagreed with rank 0's list of unpermuted nodes: \n{my_list}\n{peer_list}"
                    )

    @classmethod
    def find_sparse_parameters_for_node(cls, node_name):
        """If the node has parameters that are in the trackd sparse parameter list, find them and reshape to a 2D tensor with channels last"""
        node_weight = None

        # check the sparse parameters
        for module_name, module, p_name, p, mask, pruned in cls.__sparse_parameters:
            if node_name_matches(node_name, module_name):
                node_weight = torch.zeros_like(p)
                node_weight.copy_(p)

        # if we found something, reshape to concatenate along the same dimension
        if node_weight is not None:
            # Need to handle the concat for layers with different R & S
            shape = node_weight.shape
            # 1d-tensor
            if len(shape) == 1:
                node_weight = node_weight.view(1, shape[0])
            # 2d-tensor (K, C)
            elif len(shape) == 2:
                node_weight = node_weight.view(shape[0], shape[1])
            # 3d-tensor (K, C, R)
            elif len(shape) == 3:
                node_weight = (
                    node_weight.permute(0, 2, 1).contiguous().view(shape[0] * shape[2], shape[1])
                )
            # 4d-tensor (K, C, R, S)
            elif len(shape) == 4:
                # convs
                node_weight = (
                    node_weight.permute(2, 3, 0, 1)
                    .contiguous()
                    .view(shape[2] * shape[3] * shape[0], shape[1])
                )

        return node_weight

    @classmethod
    def find_permutation_for_matrix_group(cls, matrix_group):
        """Find a good permutation for some matrix (which may be concatenated matrices that require the same permutation)"""

        if cls.__verbosity > 1:
            print(
                f"Searching for a good permutation for this sibling group of shape {matrix_group.shape}"
            )

        permutation_found = False
        num_channels = matrix_group.shape[1]
        group_permutation = [c for c in range(num_channels)]

        # automatic check for skipping the permutation search process
        original_magnitude = (torch.abs(matrix_group)).sum(dtype=torch.float64)
        pruned_magnitude = sum_after_2_to_4(matrix_group.cpu().detach().numpy())
        diff_ratio = abs(original_magnitude - pruned_magnitude) / original_magnitude
        epsilon = 1e-3

        if cls.__verbosity > 1:
            print(
                "\n[search_for_good_permutation] Original element abs sum: {:}, Pruned element abs sum: {:}, Diff ratio: {:}".format(
                    original_magnitude, pruned_magnitude, diff_ratio
                )
            )

        start_time_accelerated_search_for_good_permutation = time.perf_counter()
        if diff_ratio < epsilon:
            if cls.__verbosity > 2:
                print(
                    "[search_for_good_permutation] Original element abs sum is almost same as the pruned element abs sum, further permutation search will not help, skipping!"
                )

        else:
            if cls.__verbosity > 2:
                print(
                    "[search_for_good_permutation] Original element abs sum is different from the pruned element abs sum, further permutation search will help, continue with the permutation search!"
                )

            # call the permutation search CUDA kernels as ASP extension.
            # users can provide prefer search strategy by providing a valid 'search_options' as a dictionary,
            # or users can implement their customized 'accelerated_search_for_good_permutation' function.
            search_options = {}
            # No.1 Strategy: Exhaustive Search
            search_options["strategy"] = "exhaustive"
            search_options["stripe_group_size"] = 8
            search_options["escape_attempts"] = 100
            # No.2 Strategy: Progressive Channel Swap Search
            # search_options['strategy'] = 'progressive channel swap'
            # search_options['progressive_search_time_limit'] = 10
            # search_options['improvement_threshold'] = 1e-9

            # permutation search time is too long for matrix_group with large channel num
            # change from Exhaustive Search to Progressive Channel Swap Search based on input matrix_group size
            if num_channels > 2048:
                search_options = {}
                search_options["strategy"] = "progressive channel swap"
                search_options["progressive_search_time_limit"] = 120
                search_options["improvement_threshold"] = 1e-9

            if cls.__verbosity > 1:
                print(f"[search_for_good_permutation] search options: {search_options}")

            group_permutation = accelerated_search_for_good_permutation(
                matrix_group, options=search_options, verbosity=cls.__verbosity
            )
            permutation_found = True

        if cls.__verbosity > 1:
            duration_accelerated_search_for_good_permutation = (
                time.perf_counter() - start_time_accelerated_search_for_good_permutation
            )
            permuted_magnitude = sum_after_2_to_4(
                matrix_group.cpu().detach().numpy()[:, group_permutation]
            )
            print(
                "[search_for_good_permutation] Take {:.4f} seconds to finish accelerated_search_for_good_permutation function and with final magnitude {:}.".format(
                    duration_accelerated_search_for_good_permutation, permuted_magnitude
                )
            )

        return group_permutation, permutation_found

    @classmethod
    def skip_sibling_group(cls, fx_graph, sibling_group_id, reason):
        """Keep track of sibling groups that do not have permutations applied"""

        # grab a parent to get the coparent group id
        sibling_group = cls.__group_data["sibling_groups"][sibling_group_id]
        a_sibling = list(sibling_group)[0]
        a_parent = fx_graph[a_sibling]["real_parents"][0]
        coparent_group_id = fx_graph[a_parent]["coparent_group_id"]

        if cls.__verbosity > 1:
            print(
                f"Skipping permutations for Sibling Group {sibling_group_id} and Coparent Group {coparent_group_id}: {reason}"
            )

        cls.__group_data["skipped_sibling_groups"][sibling_group_id] = reason
        cls.__group_data["skipped_coparent_groups"][coparent_group_id] = reason

    @classmethod
    def collect_sparse_weights(cls, fx_graph, sibling_group, sibling_group_C_param):
        """Gather all sparse weights for a sibling group (to serve as input to the permutation search)"""

        matrix_group = None

        for sibling in sibling_group:
            node_weight = cls.find_sparse_parameters_for_node(sibling)

            if node_weight is not None:
                # reshape due to siblings with grouped convolutions of different sizes
                assert node_weight.shape[1] % sibling_group_C_param == 0, (
                    f"sibling {sibling}'s weights' C={node_weight.shape[1]} must be even multiple of the sibling group's C parameter {sibling_group_C_param}"
                )
                node_weight = torch.reshape(node_weight, (-1, sibling_group_C_param))

                if matrix_group is None:
                    matrix_group = node_weight
                else:
                    try:
                        matrix_group = torch.cat(
                            (matrix_group, node_weight), dim=0
                        )  # concat the weights in the K dimension, keep the same C dimension

                    except:
                        if cls.__verbosity >= 0:
                            print(
                                "ERROR: [search_for_good_permutation][warning] cannot merge the weight for node: '{:}', with its weight shape: '{:}', the matrix_group shape: '{:}'.".format(
                                    sibling, node_weight.size(), matrix_group.size()
                                )
                            )
                        continue
                if cls.__verbosity > 2:
                    print(
                        "[search_for_good_permutation] have merged the weight for node: '{:}', with its weight shape: '{:}', the matrix_group shape: '{:}'.".format(
                            sibling, node_weight.size(), matrix_group.size()
                        )
                    )
            else:
                if cls.__verbosity > 2:
                    print(
                        f"[search_for_good_permutation] not adding dense weights for node {sibling} to the group"
                    )

        return matrix_group

    @classmethod
    def find_sibling_group_permutation(cls, fx_graph, sibling_group_id):
        """ "Find a good permutation for some sibling group"""

        if cls.__verbosity > 1:
            print(f"Finding permutation for sibling group {sibling_group_id}")

        cls.reset_seed()

        sibling_group = cls.__group_data["sibling_groups"][sibling_group_id]
        sibling_group_C_param = int(cls.__group_data["sibling_group_C_params"][sibling_group_id])

        if sibling_group_C_param % 4 != 0 or sibling_group_C_param < 8:
            cls.skip_sibling_group(
                fx_graph, sibling_group_id, f"Useless C: {sibling_group_C_param}"
            )
            return

        # collect *sparse* weights from all siblings, get the coparent group
        matrix_group = cls.collect_sparse_weights(fx_graph, sibling_group, sibling_group_C_param)

        # early-out if no siblings are sparse
        if matrix_group is None:
            cls.skip_sibling_group(fx_graph, sibling_group_id, "Dense")
            return

        # find a good permutation
        group_permutation, found = cls.find_permutation_for_matrix_group(matrix_group)

        # if no permutation was found, we didn't need it (input already sparse)
        if not found:
            cls.skip_sibling_group(fx_graph, sibling_group_id, "Not needed")
            return

        if cls.__verbosity > 2:
            print(f"Permutation for sibling group {sibling_group_id}: {group_permutation}")

        cls.__group_data["sibling_group_permutations"][sibling_group_id] = group_permutation

    @classmethod
    def permute_sibling_group(cls, fx_graph, sibling_group_id, group_permutation):
        """Apply a permutation to some sibling group"""

        if cls.__verbosity > 1:
            print(f"Attempting to permute sibling group {sibling_group_id}")

        sibling_group = cls.__group_data["sibling_groups"][sibling_group_id]

        # apply the permutation in two steps: first, a dry run to find any issues.
        # if there were no issues, actually apply the permutation in the second step.
        success = True
        coparent_group_id = None
        for dryrun in [True, False]:
            # apply that permutation to the siblings' C dimension
            for sibling in sibling_group:
                assert fx_graph[sibling]["C_permutable"] and not fx_graph[sibling]["C_permuted"]
                sibling_permuted = cls.apply_permutation_in_C_dim(
                    sibling, group_permutation, dryrun
                )
                if dryrun:
                    success = success and sibling_permuted
                else:
                    assert sibling_permuted, "shouldn't fail permuting siblings after the dry run"
                    fx_graph[sibling]["C_permuted"] = sibling_permuted

                a_parent = fx_graph[sibling]["real_parents"][0]
                if coparent_group_id is None:
                    coparent_group_id = fx_graph[a_parent]["coparent_group_id"]
                else:
                    assert coparent_group_id == fx_graph[a_parent]["coparent_group_id"], (
                        f"parent {a_parent} must belong to the same coparent group {coparent_group_id}, not {fx_graph[a_parent]['coparent_group_id']}"
                    )

            # grab the parents (and co-parents) and apply to their K dimension
            coparents = cls.__group_data["coparent_groups"][coparent_group_id]
            for coparent in coparents:
                assert fx_graph[coparent]["K_permutable"] and not fx_graph[coparent]["K_permuted"]
                coparent_permuted = cls.apply_permutation_in_K_dim(
                    coparent, group_permutation, fx_graph, dryrun
                )
                if dryrun:
                    success = success and coparent_permuted
                else:
                    assert coparent_permuted, "shouldn't fail permuting coparents after the dry run"
                    fx_graph[coparent]["K_permuted"] = coparent_permuted

                children_permuted = cls.apply_permutation_in_K_dim_to_children(
                    fx_graph, coparent, group_permutation, dryrun
                )
                if dryrun:
                    success = success and children_permuted
                else:
                    assert children_permuted, (
                        "shouldn't fail permuting coparents' children after the dry run"
                    )

            if not success:
                cls.skip_sibling_group(fx_graph, sibling_group_id, "dryrun_failure")

                if cls.__verbosity > 0:
                    print(
                        f"There was an issue permuting sibling group {sibling_group_id}, skipping it to preserve network quality."
                    )

                break

    @classmethod
    def apply_permutation_in_K_dim_to_children(cls, fx_graph, node_name, permutation, dryrun):
        """Apply a permutation along K to the children of some node"""

        success = True
        children = fx_graph[node_name]["children"]
        if cls.__verbosity > 2 and dryrun:
            print(f"Applying a permutation in K to children of {node_name} : {children}")

        # apply the permutation along K to children as necessary
        for child in children:
            if "is_real" in fx_graph[child].keys() and fx_graph[child]["is_real"]:
                if cls.__verbosity > 3 and dryrun:
                    print(f"\tFound a real child {child}, not permuting it or its children along K")
            else:
                if (
                    "module_type" not in fx_graph[child].keys()
                    or fx_graph[child]["module_type"] == "None"
                ):
                    if cls.__verbosity > 3 and dryrun:
                        print(f"\tPermuting children of non-module {child} along K")
                    success = success and cls.apply_permutation_in_K_dim_to_children(
                        fx_graph, child, permutation, dryrun
                    )
                elif not fx_graph[child]["C_permutable"]:
                    if fx_graph[child]["K_permutable"] and not fx_graph[child]["K_permuted"]:
                        if cls.__verbosity > 2 and dryrun:
                            print(f"\tPermuting {child} along K")
                        child_permuted = cls.apply_permutation_in_K_dim(
                            child, permutation, fx_graph, dryrun
                        )
                        success = success and child_permuted
                        if not dryrun:
                            fx_graph[child]["K_permuted"] = child_permuted
                        assert fx_graph[child]["K_passthru"]

                    if fx_graph[child]["K_passthru"]:
                        success = success and cls.apply_permutation_in_K_dim_to_children(
                            fx_graph, child, permutation, dryrun
                        )
                    else:
                        if cls.__verbosity >= 0:
                            print(
                                f"\t!! ERROR {child} was a not real module that was not K_passthru"
                            )

        return success

    @classmethod
    def defer_prints(cls):
        """Collect prints from this rank in distributed mode to avoid interleaved output"""

        if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1:
            cls.__new_stdout = io.StringIO(str(torch.distributed.get_rank()))
            cls.__builtin_print = __builtin__.print

            def deferred_print(*args, **kwargs):
                try:  # see if torchvision examples has suppressed other ranks with the force argument
                    cls.__builtin_print(*args, file=cls.__new_stdout, force=True, **kwargs)
                except:
                    cls.__builtin_print(*args, file=cls.__new_stdout, **kwargs)

            __builtin__.print = deferred_print

    @classmethod
    def resume_prints(cls):
        """Emit the collected outputs from this rank, resume immediate printing"""

        if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1:
            output = cls.__new_stdout.getvalue()
            __builtin__.print = cls.__builtin_print

            try:
                print(output, force=True)
            except:
                print(output)

    @classmethod
    def find_permutations(cls, fx_graph):
        """Search for permutations for all sibling groups"""

        for sibling_group_id in cls.__group_data["sibling_groups"].keys():
            search_this_group = True
            if torch.distributed.is_initialized():
                rank = torch.distributed.get_rank()
                world_size = torch.distributed.get_world_size()

                if sibling_group_id % world_size != rank:
                    search_this_group = False

            cls.__group_data["sibling_group_permutations"][sibling_group_id] = None
            if search_this_group:
                cls.defer_prints()

                sibling_group = cls.__group_data["sibling_groups"][sibling_group_id]
                test_node_name = list(sibling_group)[0]
                if not fx_graph[test_node_name]["C_permutable"]:
                    if cls.__verbosity > 1:
                        print(
                            f"Skipping permutation for sibling group {sibling_group_id} since it does not allow permutations along C"
                        )

                else:
                    if cls.__verbosity > 1:
                        print(f"Sibling group {sibling_group_id} can permute along C, permuting it")

                    cls.find_sibling_group_permutation(fx_graph, sibling_group_id)

                cls.resume_prints()

        return fx_graph

    @classmethod
    def sync_permutations(cls, fx_graph):
        """If multiple GPUs were involved in finding permutations, make sure everyone's in sync"""

        if not torch.distributed.is_initialized():
            return fx_graph

        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
        dist_store = torch.distributed.TCPStore(
            "127.0.0.1", cls.__tcpstore_port, world_size, rank == 0
        )

        if cls.__verbosity > 0:
            print(f"Syncing permutations found among world size {world_size}")

        torch.distributed.barrier()
        for sibling_group_id in sorted(cls.__group_data["sibling_groups"].keys()):
            src_rank = sibling_group_id % world_size

            if src_rank == rank:
                to_send = cls.__group_data["sibling_group_permutations"].get(sibling_group_id, None)
                skip_reason = None
                if to_send is None:
                    skip_reason = cls.__group_data["skipped_sibling_groups"].get(
                        sibling_group_id, None
                    )
                    if skip_reason is None:
                        to_send = ""
                    else:
                        to_send = "skip"
                else:
                    to_send = ",".join(str(c) for c in to_send)

                dist_store.set(str(sibling_group_id), to_send)
                if skip_reason is not None:
                    dist_store.set(f"skip {sibling_group_id}", skip_reason)

                if cls.__verbosity > 1:
                    print(
                        f"{rank}: stored permutation for sibling group {sibling_group_id}",
                        force=True,
                    )

        torch.distributed.barrier()
        for sibling_group_id in sorted(cls.__group_data["sibling_groups"].keys()):
            permutation = dist_store.get(str(sibling_group_id)).decode()

            if permutation == "skip":
                permutation = None
                skip_reason = dist_store.get(f"skip {sibling_group_id}").decode()
                cls.skip_sibling_group(fx_graph, sibling_group_id, skip_reason)
            else:
                if len(permutation) == 0:
                    permutation = None
                else:
                    permutation = [int(c) for c in permutation.split(",")]

            cls.__group_data["sibling_group_permutations"][sibling_group_id] = permutation

            if cls.__verbosity > 1:
                print(f"Got permutation for sibling group {sibling_group_id}")

        torch.distributed.barrier()
        return fx_graph

    @classmethod
    def apply_permutations(cls, fx_graph):
        """Apply all the permutations that were found to the network appropriately"""

        for sibling_group_id in cls.__group_data["sibling_group_permutations"].keys():
            permutation = cls.__group_data["sibling_group_permutations"][sibling_group_id]

            if permutation is not None:
                cls.permute_sibling_group(fx_graph, sibling_group_id, permutation)

        return fx_graph

    @staticmethod
    def insert_MHA_out_proj(fx_graph, MHA_node, verbosity):
        """MHA nodes have a hidden out_proj node, so insert it and fix up neighboring nodes"""

        if verbosity > 1:
            print(f"Inserting MHA out_proj for node {MHA_node}")
        out_proj_node_name = MHA_node + ".out_proj"
        # insert the new node
        fx_graph[out_proj_node_name] = {}
        fx_graph[out_proj_node_name]["parents"] = [MHA_node]
        fx_graph[out_proj_node_name]["children"] = fx_graph[MHA_node]["children"]
        fx_graph[MHA_node]["children"] = [out_proj_node_name]

        # set the new node's properties
        fx_graph[out_proj_node_name]["fx_op"] = "call_module"
        fx_graph[out_proj_node_name]["module_type"] = "torch.nn.modules.linear.Linear"
        fx_graph[out_proj_node_name]["groups_param"] = "None"
        fx_graph[out_proj_node_name]["C_param"] = fx_graph[MHA_node]["C_param"]
        fx_graph[out_proj_node_name]["K_param"] = fx_graph[MHA_node]["K_param"]
        fx_graph[out_proj_node_name]["sibling_group_id"] = None
        fx_graph[out_proj_node_name]["coparent_group_id"] = None

        # set permutation flags
        fx_graph[out_proj_node_name]["C_permutable"] = False
        fx_graph[MHA_node]["K_permutable"] = False
        fx_graph[MHA_node]["C_permutable"] = True
        fx_graph[out_proj_node_name]["K_permutable"] = True
        fx_graph[out_proj_node_name]["K_passthru"] = False
        fx_graph[out_proj_node_name]["C_permuted"] = False
        fx_graph[out_proj_node_name]["K_permuted"] = False
        fx_graph[out_proj_node_name]["is_real"] = True

        if verbosity > 2:
            print(f"\tUpdated: {MHA_node}: {fx_graph[MHA_node]}")
            print(f"\tAdded: {out_proj_node_name}: {fx_graph[out_proj_node_name]}")

        # update any nodes that thought their parent was the MHA node
        for node in fx_graph.keys():
            parents = fx_graph[node]["parents"]
            if node != out_proj_node_name and MHA_node in parents:
                parents.remove(MHA_node)
                parents.append(out_proj_node_name)
                fx_graph[node]["parents"] = parents
                if verbosity > 2:
                    print(f"\tUpdated parents of {node}: {fx_graph[node]}")

        return fx_graph

    @staticmethod
    def init_grouped_conv_permutation_flags(fx_graph, node_name, node_groups, verbosity):
        """Handle grouped convolutions to make dimensions match"""

        node_C = int(fx_graph.get(node_name).get("C_param"))
        node_K = int(fx_graph.get(node_name).get("K_param"))
        node_groups = int(node_groups)

        if verbosity > 2:
            print(f"\t{node_name} pre-divide C: {node_C}, K: {node_K}, G: {node_groups}")
        assert node_C % node_groups == 0
        node_C = int(node_C / node_groups)
        fx_graph[node_name]["C_param"] = str(node_C)
        if verbosity > 2:
            print(f"\t{node_name} post-divide C: {node_C}, K: {node_K}, G: {node_groups}")

        if node_C == 1:  # G == C (C is pre-divided by G)
            if node_groups == node_K:  # true depthwise, G == C == K (C will be pre-divided by G)
                fx_graph[node_name]["K_permutable"] = True
                fx_graph[node_name]["K_permuted"] = False
                fx_graph[node_name]["K_passthru"] = True
                fx_graph[node_name]["is_real"] = False
            # else:                                          # G != K, handling a permutation along K would be very tricky and not likely useful

        else:  # G != C
            if (
                node_C > 4 and node_C % 4 == 0
            ):  # permutations only help if there's more than one 2:4 pruning group
                fx_graph[node_name]["C_permutable"] = True
                fx_graph[node_name]["C_permuted"] = False

    @classmethod
    def init_permutation_flags(cls, fx_graph):
        """Set the permutation flags for each node based only on that node's module type and parameters"""

        if cls.__verbosity > 0:
            print(
                "\n[init_permutation_flags] Initialize the permutation flags for each node according to module type and parameters"
            )

        # initialize some graph-wide trackers
        cls.__group_data = {}
        cls.__group_data["next_sibling_group_id"] = 0
        cls.__group_data["next_coparent_group_id"] = 0
        cls.__group_data["sibling_groups"] = {}
        cls.__group_data["sibling_group_permutations"] = {}
        cls.__group_data["sibling_group_C_params"] = {}
        cls.__group_data["skipped_sibling_groups"] = {}
        cls.__group_data["coparent_groups"] = {}
        cls.__group_data["skipped_coparent_groups"] = {}

        # track MHA nodes
        MHA_nodes = []

        # initialize each node's details
        for node_name in fx_graph.keys():
            fx_node = fx_graph.get(node_name)
            node_module_type = fx_node.get("module_type")
            if cls.__verbosity > 1:
                if node_module_type == "get_attr":
                    print(f"Initializing node {node_name} of type {node_module_type}")
                else:
                    print(f"Initializing node {node_name} of type {node_module_type}: {fx_node}")

            # default for all nodes: don't allow anything
            if node_module_type is not None:
                fx_graph[node_name]["C_permutable"] = (
                    False  # does this node have parameters that can be permuted in C
                )
                fx_graph[node_name]["K_permutable"] = (
                    False  # does this node have parameters that can be permuted in K
                )
                fx_graph[node_name]["K_passthru"] = (
                    False  # does this node need to pass a K permutation to its parents
                )
                fx_graph[node_name]["is_real"] = False
                fx_graph[node_name]["C_permuted"] = False
                fx_graph[node_name]["K_permuted"] = False

                # initialize sibling and coparent groups
                fx_graph[node_name]["sibling_group_id"] = None
                fx_graph[node_name]["coparent_group_id"] = None

                # update each node to be more permissive if supported
                if node_module_type in cls.__permutation_target_module_types:
                    fx_graph[node_name]["is_real"] = True
                    node_groups = fx_graph.get(node_name).get("groups_param")

                    if node_groups in ["None", "1"]:  # no groups, no constraints
                        fx_graph[node_name]["C_permutable"] = True
                        fx_graph[node_name]["K_permutable"] = True

                    else:  # handle groups
                        Permutation.init_grouped_conv_permutation_flags(
                            fx_graph, node_name, node_groups, cls.__verbosity
                        )

                elif node_module_type in cls.__permute_K_and_passthru_module_types:
                    fx_graph[node_name]["K_permutable"] = True
                    fx_graph[node_name]["K_passthru"] = True
                    fx_graph[node_name]["is_real"] = False

                elif node_module_type in cls.__simple_passthru_module_types:
                    fx_graph[node_name]["K_passthru"] = True
                    fx_graph[node_name]["is_real"] = False

                elif node_module_type in cls.__disallow_permutations_module_types:
                    fx_graph[node_name]["is_real"] = True
                    fx_graph[node_name]["C_param"] = 1
                    fx_graph[node_name]["K_param"] = 1
                    fx_graph[node_name]["groups_param"] = 1

                elif "activation" in node_module_type:
                    if cls.__verbosity > 0:
                        print(
                            f"WARNING: how should permutation flags be initialized for node {node_name} of module type {node_module_type}?  Found 'activation', assuming simple passthru behavior."
                        )
                    fx_graph[node_name]["K_passthru"] = True
                    fx_graph[node_name]["is_real"] = False

                else:
                    if cls.__verbosity > 0:
                        print(
                            f"WARNING: how should permutation flags be initialized for node {node_name} of module type {node_module_type}?  Defaulting to strict, disallowing permutations around it."
                        )
                    # is_real coupled with disallowed C and K permutations will poison real parents and real children
                    fx_graph[node_name]["is_real"] = True
                    # dummy entries:
                    fx_graph[node_name]["C_param"] = 1
                    fx_graph[node_name]["K_param"] = 1
                    fx_graph[node_name]["groups_param"] = 1

                # MHA nodes only handle the in_proj, need to add out_proj nodes explicitly
                # keep track here so we can iterate directly and change fx_graph keys
                if node_module_type == "torch.nn.modules.activation.MultiheadAttention":
                    MHA_nodes.append(node_name)

            if cls.__verbosity > 1:
                if node_module_type == "get_attr":
                    print(f"\tInitialized node {node_name} of type {node_module_type}")
                else:
                    print(
                        f"\tInitialized node {node_name} of type {node_module_type}: {fx_graph[node_name]}"
                    )

        for MHA_node in MHA_nodes:
            fx_graph = Permutation.insert_MHA_out_proj(fx_graph, MHA_node, cls.__verbosity)

        return fx_graph

    @staticmethod
    def collect_siblings(fx_graph, node_name, all_siblings):
        """Recursively build a set of some node's siblings in the graph"""

        # find all siblings of the requested node
        siblings = set()
        parents = fx_graph.get(node_name).get("real_parents")
        for parent in parents:
            children = fx_graph.get(parent).get("real_children")
            for child in children:
                siblings.add(child)

        # separate the new siblings, since we'll need to process them recursively
        new_siblings = siblings.difference(all_siblings)
        # update the final list with just the new elements
        all_siblings.update(new_siblings)

        for new_sibling in new_siblings:
            all_siblings = Permutation.collect_siblings(fx_graph, new_sibling, all_siblings)

        return all_siblings

    @staticmethod
    def propagate_sibling_group(fx_graph, all_siblings, verbosity):
        """Check a sibling group for ability to be permuted, disallow all siblings and coparents if there's an issue"""

        made_change = False
        allow_C = True
        for sibling in all_siblings:
            pre_check = allow_C
            allow_C = allow_C and fx_graph[sibling]["C_permutable"]
            if allow_C != pre_check:
                if verbosity > 2:
                    if fx_graph[sibling]["module_type"] == "get_attr":
                        print(f"\tnode {sibling} has poisoned the sibling group of {all_siblings}")
                    else:
                        print(
                            f"\tnode {sibling} has poisoned the sibling group of {all_siblings}: {fx_graph[sibling]}"
                        )
                break

        if not allow_C:
            for sibling in all_siblings:
                made_change = made_change or fx_graph[sibling]["C_permutable"]
                fx_graph[sibling]["C_permutable"] = False

                # only disable permutation along K for parents if this node cannot passthru, either
                if not fx_graph[sibling]["K_passthru"]:
                    sibling_parents = fx_graph[sibling]["real_parents"]
                    for sibling_parent in sibling_parents:
                        made_change = (
                            made_change
                            or fx_graph[sibling_parent]["K_permutable"]
                            or fx_graph[sibling_parent]["K_passthru"]
                        )
                        fx_graph[sibling_parent]["K_permutable"] = False
                        fx_graph[sibling_parent]["K_passthru"] = False

        return made_change

    @staticmethod
    def collect_coparents(fx_graph, node_name, all_coparents):
        """Recursively build a set of all coparents of a particular node in the graph"""

        # find all coparents of the requested node
        coparents = set()
        children = fx_graph.get(node_name).get("real_children")
        for child in children:
            parents = fx_graph.get(child).get("real_parents")
            for parent in parents:
                coparents.add(parent)

                # coparents are used to restrict what nodes can be permuted along C, so we need to track if the current parents also pass their K permutations up
                if fx_graph[parent]["K_passthru"]:
                    grandparents = fx_graph[parent]["real_parents"]
                    for grandparent in grandparents:
                        coparents = coparents.union(
                            Permutation.collect_coparents(fx_graph, grandparent, coparents)
                        )

        # separate the new coparents, since we'll need to process them recursively
        new_coparents = coparents.difference(all_coparents)
        # update the final list with just the new elements
        all_coparents.update(new_coparents)

        for new_coparent in new_coparents:
            all_coparents = Permutation.collect_coparents(fx_graph, new_coparent, all_coparents)

        return all_coparents

    @staticmethod
    def propagate_coparent_group(fx_graph, all_coparents, verbosity):
        """Check a coparent group for ability to be permuted, disallow all fellow coparents and children if there's an issue"""

        # see if all coparents agree that K can be permuted
        allow_K = True
        made_change = False
        for coparent in all_coparents:
            pre_check = allow_K
            allow_K = allow_K and (
                fx_graph[coparent]["K_permutable"] or fx_graph[coparent]["K_passthru"]
            )
            if allow_K != pre_check:
                if verbosity > 2:
                    if fx_graph[coparent]["module_type"] == "get_attr":
                        print(
                            f"\tnode {coparent} has poisoned the coparent group of {all_coparents}"
                        )
                    else:
                        print(
                            f"\tnode {coparent} has poisoned the coparent group of {all_coparents}: {fx_graph[coparent]}"
                        )
                break

        # if anyone says no, force everyone to 'no', keep track of updated state
        if not allow_K:
            for coparent in all_coparents:
                # all coparents can no longer be permuted along K
                if fx_graph[coparent]["K_permutable"] or fx_graph[coparent]["K_passthru"]:
                    made_change = True

                    fx_graph[coparent]["K_permutable"] = False
                    fx_graph[coparent]["K_passthru"] = False

                # children of coparents can't be permuted along C
                coparent_children = fx_graph[coparent]["real_children"]
                for coparent_child in coparent_children:
                    if fx_graph[coparent_child]["C_permutable"]:
                        fx_graph[coparent_child]["C_permutable"] = False
                        made_change = True

        return made_change

    @classmethod
    def fixup_concats(cls, fx_graph):
        """concat operations/modules may concatenate along the channel dimension, which requires special handling (like grouped convs)"""

        if cls.__verbosity > 0:
            print("[fixup_concats]")

        for node_name in fx_graph.keys():
            fx_node = fx_graph[node_name]
            if fx_node.get("module_type") == "concat":
                # get real parents, find GCD of their Ks
                node_real_parents = fx_node["real_parents"]

                # some concats are at the front of networks (googlenet)
                if len(node_real_parents) == 0:
                    continue

                parents_K_params = []
                for parent in node_real_parents:
                    parent_K_param = int(fx_graph[parent]["K_param"])
                    parents_K_params.append(parent_K_param)
                    fx_graph[parent]["allow_K_mismatch"] = "concat op"

                # if grouped convolutions make the input channels different among siblings different sizes,
                # restrict the permutation atom to the greatest common divisor so it can be tiled as needed for each sibling (and parent)
                if cls.__verbosity > 2:
                    print(
                        f"\tfixing up concat node {node_name}, found parents' {node_real_parents} Ks: {parents_K_params}"
                    )

                children_GCD_param = str(np.gcd.reduce(parents_K_params))

                # set this to GCD of children's sibling group
                sibling_group_id = -1
                node_real_children = fx_node["real_children"]
                for child in node_real_children:
                    sibling_group_id = fx_graph[child]["sibling_group_id"]
                    fx_graph[child]["C_param"] = children_GCD_param

                old_children_GCD = cls.__group_data["sibling_group_C_params"][sibling_group_id]
                cls.__group_data["sibling_group_C_params"][sibling_group_id] = children_GCD_param

                # fixup this node's dimensions
                # use the functionality of grouped convolutions
                fx_node["C_param"] = children_GCD_param
                fx_node["K_param"] = old_children_GCD
                fx_node["groups_param"] = str(int(old_children_GCD) // int(children_GCD_param))

                if cls.__verbosity > 2:
                    print(
                        f"\tfixed up concat node {node_name}, found GCD of parents' {node_real_parents} K to be {children_GCD_param}, updated children's {node_real_children} C_params and sibling group {sibling_group_id} GCD"
                    )
                    print(f"\tthis node now: {fx_node}")

        return fx_graph

    @classmethod
    def enforce_dimension_agreement(cls, fx_graph):
        """Check all nodes' channel dimensions against parents and children to make sure they agree; e.g. flatten ops may change these dimensions"""

        if cls.__verbosity > 0:
            print("[enforce_dimension_agreement]")

        for node_name in fx_graph.keys():
            fx_node = fx_graph[node_name]
            if "is_real" in fx_node.keys() and fx_node["is_real"]:
                # enforce this node's input dimension matches its parents' output dimensions
                node_C = int(fx_node["C_param"])
                node_K = int(fx_node["K_param"])

                if fx_graph[node_name]["groups_param"] not in ["1", "None"]:
                    node_C = node_C * int(fx_node["groups_param"])

                node_real_parents = fx_node["real_parents"]
                if len(node_real_parents) == 0:
                    if cls.__verbosity > 1:
                        print(f"\t{node_name} has no real parents, disabling permutations along C")
                    fx_graph[node_name]["C_permutable"] = False
                else:
                    for real_parent in node_real_parents:
                        parent_K = int(fx_graph[real_parent]["K_param"])
                        ignore_mismatch = fx_graph[real_parent].get("allow_K_mismatch")

                        if ignore_mismatch is not None:
                            if cls.__verbosity > 1:
                                print(
                                    f"\tIgnoring dimension mismatch between {node_name} (C={node_C}) and its parent {real_parent} (K={parent_K}) as requested: {ignore_mismatch}"
                                )

                        elif parent_K >= 0 and node_C != parent_K:
                            if cls.__verbosity > 1:
                                print(
                                    f"\tDimensions mismatch between {node_name} (C={node_C}) and its parent {real_parent} (K={parent_K}), disallowing the relevant permutations"
                                )

                            fx_graph[node_name]["C_permutable"] = False
                            fx_graph[real_parent]["K_permutable"] = False

                            if cls.__verbosity > 2:
                                print(f"\t{fx_graph[node_name]}\n\t{fx_graph[real_parent]}")

                if len(fx_graph[node_name]["real_children"]) == 0:
                    if cls.__verbosity > 1:
                        print(f"\t{node_name} has no real children, disabling permutations along K")
                    fx_graph[node_name]["K_permutable"] = False

        return fx_graph

    @classmethod
    def make_sibling_coparent_groups(cls, fx_graph):
        """Traverse all real nodes in the graph and collect their siblings and coparents"""

        if cls.__verbosity > 0:
            print("[make_sibling_coparent_groups]")

        for node_name in fx_graph.keys():
            fx_node = fx_graph[node_name]

            if "is_real" in fx_node.keys() and fx_node["is_real"]:
                sibling_group_id = fx_node["sibling_group_id"]
                if sibling_group_id is None:  # need to make a new sibling group for this node
                    all_siblings = cls.collect_siblings(fx_graph, node_name, set([node_name]))
                    all_siblings = sorted(all_siblings)  # deterministic order for DDP setups
                    sibling_group_id = cls.__group_data["next_sibling_group_id"]
                    cls.__group_data["sibling_groups"][sibling_group_id] = all_siblings
                    cls.__group_data["next_sibling_group_id"] = sibling_group_id + 1

                    sibling_group_C_params = []
                    for sibling in all_siblings:
                        fx_graph[sibling]["sibling_group_id"] = sibling_group_id
                        sibling_C_param = int(fx_graph[sibling]["C_param"])
                        sibling_group_C_params.append(sibling_C_param)

                    # if grouped convolutions make the input channels different among siblings different sizes,
                    # restrict the permutation atom to the greatest common divisor so it can be tiled as needed for each sibling (and parent)
                    sibling_group_C_param = str(np.gcd.reduce(sibling_group_C_params))
                    cls.__group_data["sibling_group_C_params"][sibling_group_id] = (
                        sibling_group_C_param
                    )
                    cls.__group_data["skipped_sibling_groups"][sibling_group_id] = None

                    if cls.__verbosity > 1:
                        print(
                            f"New sibling group {sibling_group_id} with GCD(C) of {sibling_group_C_param}: {all_siblings}"
                        )

                coparent_group_id = fx_node["coparent_group_id"]
                if coparent_group_id is None:
                    all_coparents = cls.collect_coparents(fx_graph, node_name, set([node_name]))
                    coparent_group_id = cls.__group_data["next_coparent_group_id"]
                    cls.__group_data["coparent_groups"][coparent_group_id] = all_coparents
                    cls.__group_data["next_coparent_group_id"] = coparent_group_id + 1
                    cls.__group_data["skipped_coparent_groups"][coparent_group_id] = None

                    for coparent in all_coparents:
                        fx_graph[coparent]["coparent_group_id"] = coparent_group_id

                    if cls.__verbosity > 1:
                        print(f"New coparent group {coparent_group_id}: {all_coparents}")
        return fx_graph

    @classmethod
    def propagate_permutation_flags(cls, fx_graph):
        """Disallow sibling groups from having different C_permutable flags and coparent groups from having different K_permutable flags within the groups"""

        made_change = True  # will we need to repeat this propagation?
        # TODO: just propagate to sibling groups and coparent groups directly, instead of iteratively to direct real_parents and siblings
        while made_change:
            made_change = False

            if cls.__verbosity > 0:
                print("Making a pass at propagating permutation flags")

            for node_name in fx_graph.keys():
                fx_node = fx_graph.get(node_name)

                node_parents = fx_graph.get(node_name).get("parents")
                node_real_parents = fx_graph.get(node_name).get("real_parents")
                node_children = fx_graph.get(node_name).get("children")
                node_real_children = fx_graph.get(node_name).get("real_children")

                # input layers can't be permuted along C without a runtime fixup, skip them
                if node_parents is None or (
                    "x" in node_parents
                    and "C_permutable" in fx_graph[node_name].keys()
                    and fx_graph[node_name]["C_permutable"]
                ):
                    if cls.__verbosity > 1:
                        print(
                            f"{node_name} has no parents, or only an input, disabling permutations in C"
                        )
                    made_change = True
                    fx_graph[node_name]["C_permutable"] = False

                # output layers can't be permuted along K without a runtime fixup, skip them
                if node_children is None or (
                    "output" in node_children
                    and "K_permutable" in fx_graph[node_name].keys()
                    and fx_graph[node_name]["K_permutable"]
                ):
                    if cls.__verbosity > 1:
                        print(
                            f"{node_name} has no children, or only an output, disabling permutations in K"
                        )
                    made_change = True
                    fx_graph[node_name]["K_permutable"] = False
                    fx_graph[node_name]["K_passthru"] = False

                if "is_real" in fx_node.keys() and fx_node["is_real"]:
                    # siblings must share C-flags; if one cannot be permuted along C, none can
                    sibling_group_id = fx_graph[node_name]["sibling_group_id"]
                    all_siblings = cls.__group_data["sibling_groups"][sibling_group_id]
                    made_change = (
                        cls.propagate_sibling_group(fx_graph, all_siblings, cls.__verbosity)
                        or made_change
                    )

                    # coparents must share K-flags; if one cannot be permuted along K, none can
                    coparent_group_id = fx_graph[node_name]["coparent_group_id"]
                    all_coparents = cls.__group_data["coparent_groups"][coparent_group_id]
                    made_change = (
                        cls.propagate_coparent_group(fx_graph, all_coparents, cls.__verbosity)
                        or made_change
                    )

        return fx_graph

    @classmethod
    def find_node_real_children(cls, fx_graph, node_name, found_children):
        """Collect the real children of some node"""

        if "real_children" in fx_graph[node_name].keys():
            return found_children.union(fx_graph[node_name]["real_children"])

        children = fx_graph[node_name]["children"]
        for child in children:
            if child in fx_graph.keys():  # not the output node
                if cls.__verbosity > 3:
                    print(f"\tchecking child {child} of node {node_name}")

                # if it's a real node, just add it
                if "is_real" in fx_graph[child].keys() and fx_graph[child]["is_real"]:
                    found_children.add(child)
                else:  # otherwise, search its children
                    found_children = cls.find_node_real_children(fx_graph, child, found_children)

        return found_children

    @classmethod
    def find_real_children(cls, fx_graph):
        """Collect the real children of all nodes in the graph"""

        if cls.__verbosity > 0:
            print(
                "\n[find_real_children] Find the real children for each node according to the whole network graph built with Torch.FX"
            )

        reversible_fx_graph_keys = list(fx_graph.keys())
        for node_name in reversed(
            reversible_fx_graph_keys
        ):  # as the optimization, we need to find the real children from back to front, to use the already saved 'real_children'
            node_children = fx_graph.get(node_name).get("children")

            if cls.__verbosity > 2:
                print(
                    "[find_real_children] node_name: '{:}', children: {:}".format(
                        node_name, node_children
                    )
                )

            real_children = cls.find_node_real_children(fx_graph, node_name, set())

            if cls.__verbosity > 1:
                print(
                    f"[find_real_children] {node_name} has {len(real_children)} real children: {real_children}"
                )

            fx_graph[node_name]["real_children"] = sorted(real_children)

        if cls.__save_permutation_graph:
            cls.save_graph_to_json(
                fx_graph,
                save_dumped_graph_path_with_name=os.path.join(
                    cls.__permutation_output_dir,
                    "./model_graph_find_real_children.json",
                ),
            )  # save the intermediate graph as JSON file for debugging
        return fx_graph

    @classmethod
    def find_node_real_parents(cls, fx_graph, node_name, found_parents):
        """Collect the real parents of some node"""

        if "real_parents" in fx_graph[node_name].keys():
            return found_parents.union(fx_graph[node_name]["real_parents"])

        parents = fx_graph[node_name]["parents"]
        for parent in parents:
            if parent in fx_graph.keys():  # not the input node
                if cls.__verbosity > 3:
                    print(f"\tchecking parent {parent} of node {node_name}")

                # if it's a real node, just add it
                if "is_real" in fx_graph[parent].keys() and fx_graph[parent]["is_real"]:
                    found_parents.add(parent)
                else:  # otherwise, search its parents
                    found_parents = cls.find_node_real_parents(fx_graph, parent, found_parents)

        return found_parents

    @classmethod
    def find_real_parents(cls, fx_graph):
        """Collect the real parents of all nodes in the graph"""

        if cls.__verbosity > 0:
            print(
                "\n[find_real_parents] Find the real parents for each node according to the whole network graph built with Torch.FX"
            )

        for node_name in fx_graph.keys():
            node_real_parents_name = []
            node_real_parents_module_type = []

            real_parents = cls.find_node_real_parents(fx_graph, node_name, set())

            if cls.__verbosity > 1:
                print(
                    f"[find_real_parents] {node_name} has {len(real_parents)} real parents: {real_parents}"
                )

            fx_graph[node_name]["real_parents"] = sorted(real_parents)

        if cls.__save_permutation_graph:
            cls.save_graph_to_json(
                fx_graph,
                save_dumped_graph_path_with_name=os.path.join(
                    cls.__permutation_output_dir, "./model_graph_find_real_parent.json"
                ),
            )  # save the intermediate graph as JSON file for debugging
        return fx_graph

    @classmethod
    def build_fx_graph(
        cls, model, dump_fx_graph=False, save_dumped_fx_graph="./model_fx_graph.json"
    ):
        """Build the whole network graph with Torch.FX."""

        network_fx_graph = {}
        success = True
        torch_version = str(torch.__version__)
        torch_version_major = int(torch_version.split(".")[0])
        torch_version_minor = int(torch_version.split(".")[1])
        try:
            torch_version_minimum = int(torch_version.split(".")[2])
        except ValueError:  # support the none standard version
            torch_version_minimum = torch_version.split(".")[2]
        if cls.__verbosity > 2:
            print(
                "[build_fx_graph] The torch version is: {}, version major is: {}, version minor is: {}, version minimum is: {}".format(
                    torch_version,
                    torch_version_major,
                    torch_version_minor,
                    torch_version_minimum,
                )
            )

        if torch_version_major >= 2 or (torch_version_major >= 1 and torch_version_minor >= 8):
            if cls.__verbosity > 1:
                print("[build_fx_graph] The Torch.FX is supported.")
        else:  # Torch.FX is introduced in torch 1.8.0
            if cls.__verbosity >= 0:
                print(
                    "[build_fx_graph] The Torch.FX is not supported. So cannot build the Torch.FX graph."
                )
            success = False
            return network_fx_graph, success

        if cls.__verbosity > 2:
            print("\n[build_fx_graph] Print the model structure with pure PyTorch function")
            print(model)

        graph_module = cls.trace_and_print_raw_fx_graph(
            model, print_tabular=cls.__verbosity > 1
        )  # needs "tabulate" library
        if graph_module is None:
            success = False
            return network_fx_graph, success

        if cls.__verbosity > 0:
            print("\n[build_fx_graph] Build the module name and type dictionary")

        module_name_type_dict = {}
        module_name_group_conv_dict = {}
        module_name_C_dict = {}
        module_name_K_dict = {}
        for name, mod in model.named_modules():
            if cls.__verbosity > 1:
                print("[build_fx_graph] module_name: {}, module type: {}".format(name, type(mod)))
            module_name_type_dict[name] = str(type(mod)).split("'")[1]
            try:
                module_name_C_dict[name] = str(mod.in_channels)
            except:
                try:
                    module_name_C_dict[name] = str(mod.in_features)
                except:
                    try:
                        module_name_C_dict[name] = str(mod.embed_dim)
                    except:
                        module_name_C_dict[name] = "None"

            try:
                module_name_K_dict[name] = str(mod.out_channels)
            except:
                try:
                    module_name_K_dict[name] = str(mod.out_features)
                except:
                    try:
                        module_name_K_dict[name] = str(mod.embed_dim)
                    except:
                        module_name_K_dict[name] = "None"

            try:
                module_name_group_conv_dict[name] = str(mod.groups)
                if cls.__verbosity > 1:
                    print(
                        "[build_fx_graph] this module has 'group' param with value: {}".format(
                            mod.groups
                        )
                    )
            except:
                module_name_group_conv_dict[name] = "None"
                continue

        # keep track of children and parents for each layer (could be call_module or call_function)
        if cls.__verbosity > 0:
            print("\n[build_fx_graph] Print the children and parents relationship for each layer")
        network_fx_graph = {}
        for node in graph_module.graph.nodes:
            if node.op == "placeholder":
                if cls.__verbosity > 2:
                    print("[build_fx_graph] This is the 'input' node: {:}".format(node.target))
                continue
            elif node.op == "get_attr":
                if cls.__verbosity > 2:
                    print("[build_fx_graph] This is the 'get_attr' node: {:}".format(node.target))
                node_parent, node_children = get_node_parent_children(node)
                converted_node_name = convert_fx_node_name(node.target)

                network_fx_graph[converted_node_name] = {}
                network_fx_graph[converted_node_name]["parents"] = node_parent
                network_fx_graph[converted_node_name]["children"] = node_children
                network_fx_graph[converted_node_name]["module_type"] = "get_attr"
                network_fx_graph[converted_node_name]["groups_param"] = "None"

                # inspired by https://pytorch.org/docs/stable/fx.html
                def fetch_attr(target: str, mod):
                    target_atoms = target.split(".")
                    attr_itr = mod
                    for i, atom in enumerate(target_atoms):
                        if not hasattr(attr_itr, atom):
                            raise RuntimeError(
                                f"Node referenced nonexistant target {'.'.join(target_atoms[:i])}"
                            )
                        attr_itr = getattr(attr_itr, atom)
                    return attr_itr

                attr = fetch_attr(node.target, graph_module)
                network_fx_graph[converted_node_name]["C_param"] = 1
                network_fx_graph[converted_node_name]["K_param"] = -1
                network_fx_graph[converted_node_name]["attr"] = attr

            elif (
                node.op == "call_function"
            ):  # e.g. 'adaptive.avg.pool2d', 'add', 'cat', 'flatten', 'floordiv', 'getattr', 'getitem', 'hardsigmoid', 'mean', 'mul', 'relu', 'transpose'
                node_parent, node_children = get_node_parent_children(node)
                converted_node_name = convert_fx_node_name(node.name)
                if cls.__verbosity > 2:
                    print(
                        "[build_fx_graph] This is the 'call_function' node: {:}, its parent list: {:}, its children list: {:}".format(
                            converted_node_name, node_parent, node_children
                        )
                    )
                network_fx_graph[converted_node_name] = {}
                network_fx_graph[converted_node_name]["parents"] = node_parent
                network_fx_graph[converted_node_name]["children"] = node_children
                network_fx_graph[converted_node_name]["fx_op"] = "call_function"

                ### "convert" some ops to modules

                # concatenating along K can be handled by reducing the size of the childrens' C appropriately
                # see fixup_concats, if no dim arg, default is 0 (handled automatically)
                if node.target == torch.cat and len(node.args) > 1 and node.args[1] == 1:
                    network_fx_graph[converted_node_name]["fx_op"] = "call_module"
                    network_fx_graph[converted_node_name]["module_type"] = "concat"
                    network_fx_graph[converted_node_name]["groups_param"] = (
                        "N/A"  # just need placeholders
                    )
                    network_fx_graph[converted_node_name]["C_param"] = "N/A"
                    network_fx_graph[converted_node_name]["K_param"] = "N/A"

            elif (
                node.op == "call_method"
            ):  # e.g. 'chunk', 'contiguous', 'mean', 'size', 'unsqueeze', 'view'
                node_parent, node_children = get_node_parent_children(node)
                converted_node_name = convert_fx_node_name(node.name)
                if cls.__verbosity > 2:
                    print(
                        "[build_fx_graph] This is the 'call_method' node: {:}, its parent list: {:}, its children list: {:}".format(
                            converted_node_name, node_parent, node_children
                        )
                    )
                network_fx_graph[converted_node_name] = {}
                network_fx_graph[converted_node_name]["parents"] = node_parent
                network_fx_graph[converted_node_name]["children"] = node_children
                network_fx_graph[converted_node_name]["fx_op"] = "call_method"
                continue

            elif node.op == "call_module":
                node_parent, node_children = get_node_parent_children(node)
                converted_node_name = convert_fx_node_name(node.name)
                # check whether the converted_node_name is same as node.target, especially for ReLU case
                if converted_node_name != node.target:
                    if cls.__verbosity > 2:
                        print(
                            "[build_fx_graph][warning] The target name from Torch.FX is '{:}', the manually converted node name is '{:}', not the same one, choose the converted node name".format(
                                node.target, converted_node_name
                            )
                        )

                # assume the modules share the same target name have the same type, because converted_node_name may not be obtained by model.named_modules(), like some ReLU (defined in forward function)
                node_type = module_name_type_dict[node.target]
                if cls.__verbosity > 2:
                    print(
                        "[build_fx_graph] This is the 'call_module' node: {:}, its parent list: {:}, its children list: {:}, its type: {:}".format(
                            converted_node_name, node_parent, node_children, node_type
                        )
                    )
                network_fx_graph[converted_node_name] = {}
                network_fx_graph[converted_node_name]["parents"] = node_parent
                network_fx_graph[converted_node_name]["children"] = node_children
                network_fx_graph[converted_node_name]["fx_op"] = "call_module"
                network_fx_graph[converted_node_name]["module_type"] = node_type
                network_fx_graph[converted_node_name]["groups_param"] = module_name_group_conv_dict[
                    node.target
                ]
                network_fx_graph[converted_node_name]["C_param"] = module_name_C_dict[node.target]
                network_fx_graph[converted_node_name]["K_param"] = module_name_K_dict[node.target]

            elif node.op == "output":
                if cls.__verbosity > 2:
                    print("[build_fx_graph] This is the 'output' node: {:}".format(node.target))
                continue

        if dump_fx_graph:
            if cls.__verbosity > 0:
                print(
                    "\n[build_fx_graph] Dump the overall dict for children and parents relationship into JSON file"
                )
            cls.save_graph_to_json(
                network_fx_graph, save_dumped_graph_path_with_name=save_dumped_fx_graph
            )

        return network_fx_graph, success

    @classmethod
    def trace_and_print_raw_fx_graph(cls, model, print_tabular=False, generate_python_code=False):
        """This function is used to find and print the intermediate representation (IR) - Graph representation with Torch.FX features."""

        from torch.fx import symbolic_trace
        import traceback

        # Symbolic tracing frontend - captures the semantics of the module
        try:
            symbolic_traced: torch.fx.GraphModule = symbolic_trace(model)
        except Exception as ex:
            if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
                if cls.__verbosity > 0:
                    print(ex)
                    print(
                        "".join(
                            traceback.format_exception(
                                etype=type(ex), value=ex, tb=ex.__traceback__
                            )
                        )
                    )
                    print(
                        "\n[print_raw_fx_graph] Meet the fatal fault when trying to symbolic trace the model with Torch.FX"
                    )
            return None

        # High-level intermediate representation (IR) - Graph representation
        if cls.__verbosity > 1:
            print("\n[print_raw_fx_graph] Print the intermediate representation (IR) with Torch.FX")
            print(symbolic_traced.graph)

        if print_tabular:
            print(
                "\n[print_raw_fx_graph] Print the intermediate representation (IR) with Torch.FX in a table format"
            )
            try:
                from tabulate import tabulate

                symbolic_traced.graph.print_tabular()
            except ImportError:
                if cls.__verbosity > 1:
                    print(
                        "[print_raw_fx_graph][Warning] 'print_tabular' relies on the library `tabulate`; run `pip install tabulate` to install it."
                    )
            except (
                AttributeError
            ):  # to avoid the AttributeError: 'Graph' object has no attribute 'print_tabular'
                if cls.__verbosity > 1:
                    print(
                        "[print_raw_fx_graph][Warning] 'print_tabular' function is not supported in current Torch version. Skip!"
                    )

        # Code generation - valid Python code
        if generate_python_code:
            print(
                "\n[print_raw_fx_graph] Create valid Python code matching the IR/Graph's semantics with Torch.FX"
            )
            print(symbolic_traced.code)

        return symbolic_traced

    @classmethod
    def save_graph_to_json(cls, graph, save_dumped_graph_path_with_name="./model_fx_graph.json"):
        """This function is used to save the graph into JSON file for inspection."""

        # use dumps to transfer the dict to JSON string
        json_graph_str = json.dumps(graph)
        with open(save_dumped_graph_path_with_name, "w", encoding="utf-8") as dumped_graph_file:
            dumped_graph_file.write(
                json_graph_str
            )  # write the transferred JSON string into JSON file


================================================
FILE: apex/contrib/sparsity/permutation_search_kernels/CUDA_kernels/permutation_search_kernels.cu
================================================
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <stdio.h>
namespace py = pybind11;

#define gpuErrchk(ans)                    \
  {                                       \
    gpuAssert((ans), __FILE__, __LINE__); \
  }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert %d: %s %s %d\n", (int)code, cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
}

// find the magnitude after enforcing the 2:4 sparsity constraint on a group of 4 values
__device__ float group_2_to_4(float4 vals) {
  vals.x = fabs(vals.x);
  vals.y = fabs(vals.y);
  vals.z = fabs(vals.z);
  vals.w = fabs(vals.w);

  float sum0 = vals.x + vals.y;
  float sum1 = vals.x + vals.z;
  float sum2 = vals.x + vals.w;
  float sum3 = vals.y + vals.z;
  float sum4 = vals.y + vals.w;
  float sum5 = vals.z + vals.w;

  float best_sum0 = fmax(sum0, sum1);
  float best_sum1 = fmax(sum2, sum3);
  float best_sum2 = fmax(sum4, sum5);
  float best_sum = fmax(fmax(best_sum0, best_sum1), best_sum2);

  return best_sum;
}

inline float* float_ptr_from_numpy(py::array_t<float>& py_float) { return (float*)py_float.data(); }

inline unsigned int* uint_ptr_from_numpy(py::array_t<unsigned int>& py_uint) { return (unsigned int*)py_uint.data(); }

/**********************************************************
 *  Check for the best permutation for an entire matrix
 **********************************************************/
__global__ void permute_and_sum_after_2_to_4(float* matrix, unsigned int rows, unsigned int cols, unsigned int* stripes,
                                             unsigned int total_stripes, unsigned int* permutations, float* output) {
  // vectorize
  float4* mat4 = (float4*)matrix;
  cols /= 4;

  // each thread in a block takes some number of rows
  size_t num_rows = max((int)ceilf((float)rows / (float)blockDim.x), 1);
  size_t row_offset = num_rows * threadIdx.x;
  size_t num_stripes = total_stripes;  // total_stripes / gridDim.x;
  size_t stripe_offset = 0;            // num_stripes * blockIdx.x;
  unsigned int localStart = stripe_offset;
  unsigned int localEnd = localStart + num_stripes;

  // each block takes care of one permutation
  unsigned int p = blockIdx.x;
  unsigned int* permutation = &permutations[p * total_stripes * 4];

  float sum = 0.0f;
  extern __shared__ float s[32][32];
  float4* local_stripes = (float4*)&s[threadIdx.x];
  float* local_columns = (float*)&s[threadIdx.x];
  float4* permuted_local_stripes = (float4*)&local_stripes[num_stripes];
  float* permuted_local_columns = (float*)&local_columns[num_stripes * 4];

  for (unsigned int r = row_offset; r < row_offset + num_rows; ++r) {
    if (r >= rows) break;

    // load into smem
    for (unsigned int s = localStart; s < localEnd; ++s) {
      unsigned int stripe = stripes[s];
      local_stripes[s] = mat4[r * cols + stripe];
    }

// now permute
#pragma unroll 4
    for (unsigned int c = 0; c < num_stripes * 4; ++c) {
      permuted_local_columns[c] = local_columns[permutation[c]];
    }

    // now sum 2:4
    for (unsigned int s = 0; s < num_stripes; ++s) {
      sum += group_2_to_4(permuted_local_stripes[s]);
    }
  }

  atomicAdd(&output[p], sum);
}

void free_permutation_memory(float** dmatrix, unsigned int** dstripe_groups, unsigned int** dpermutations,
                             float** dresults, float** hresults) {
  cudaFree(*dmatrix);
  cudaFree(*dresults);
  cudaFree(*dpermutations);
  cudaFree(*dstripe_groups);
  free(*hresults);
}

int set_up_check_permutation_memory(float** dmatrix, unsigned int rows, unsigned int cols,
                                    unsigned int** dstripe_groups, unsigned int group_width, unsigned int num_groups,
                                    unsigned int** dpermutations, unsigned int num_permutations, float** dresults,
                                    float** hresults) {
  static unsigned int setupRows = 0;
  static unsigned int setupCols = 0;
  static unsigned int setupGroupWidth = 0;
  static unsigned int setupNumGroups = 0;
  static unsigned int setupNumPermutations = 0;
  static bool allocated = false;
  int fresh_alloc = 0;
  if (!allocated || setupRows != rows || setupCols != cols || setupGroupWidth != group_width ||
      setupNumGroups != num_groups || setupNumPermutations != num_permutations) {
    if (allocated) {
      free_permutation_memory(dmatrix, dstripe_groups, dpermutations, dresults, hresults);
    }

    gpuErrchk(cudaMalloc((void**)dmatrix, rows * cols * sizeof(float)));
    gpuErrchk(cudaMalloc((void**)dstripe_groups, group_width * num_groups * sizeof(unsigned int)));
    gpuErrchk(cudaMalloc((void**)dpermutations, num_permutations * group_width * 4 * sizeof(unsigned int)));
    gpuErrchk(cudaMalloc((void**)dresults, num_permutations * sizeof(float)));
    *hresults = (float*)malloc(num_permutations * sizeof(float));

    allocated = true;
    setupRows = rows;
    setupCols = cols;
    setupGroupWidth = group_width;
    setupNumGroups = num_groups;
    setupNumPermutations = num_permutations;
    fresh_alloc = 1;
  }

  return fresh_alloc;
}

int run_check_permutations(
    py::array_t<float>& py_matrix, unsigned int rows, unsigned int cols,
    py::array_t<unsigned int>&
        py_stripe_groups,  // groups of stripes, group_width = stripes per group, num_groups = groups in the array
    unsigned int group_width, unsigned int num_groups,
    py::array_t<unsigned int>& py_permutations,  // array of permutations to try, group_width*4 values per each of
                                                 // num_permutations permutations
    unsigned int num_permutations,
    py::array_t<float>& py_improvement,        // improvment offered by the best permutation
    py::array_t<unsigned int>& py_permutation  // the best permutation
) {
  const unsigned int threads = 32;
  static float* d_matrix;
  static unsigned int* d_permutations;
  static unsigned int* d_stripes;
  static float* d_results;
  static float* results;

  float* matrix = float_ptr_from_numpy(py_matrix);
  unsigned int* stripe_groups = uint_ptr_from_numpy(py_stripe_groups);
  unsigned int* permutations = uint_ptr_from_numpy(py_permutations);
  float* improvement = float_ptr_from_numpy(py_improvement);
  unsigned int* permutation = uint_ptr_from_numpy(py_permutation);

  int fresh_alloc = set_up_check_permutation_memory(&d_matrix, rows, cols, &d_stripes, group_width, num_groups,
                                                    &d_permutations, num_permutations, &d_results, &results);
  if (fresh_alloc == 1) {
    gpuErrchk(cudaMemcpy(d_permutations, permutations, num_permutations * group_width * 4 * sizeof(unsigned int),
                         cudaMemcpyHostToDevice));
    gpuErrchk(
        cudaMemcpy(d_stripes, stripe_groups, group_width * num_groups * sizeof(unsigned int), cudaMemcpyHostToDevice));
  }

  // initialize results, new matrix
  gpuErrchk(cudaMemset(d_results, 0, num_permutations * sizeof(float)));
  gpuErrchk(cudaMemcpy(d_matrix, matrix, rows * cols * sizeof(float), cudaMemcpyHostToDevice));

  // get results for all permutations
  permute_and_sum_after_2_to_4<<<num_permutations, threads, threads * group_width * 4 * 2 * sizeof(float)>>>(
      d_matrix, rows, cols, d_stripes, group_width, d_permutations, d_results);
  gpuErrchk(cudaDeviceSynchronize());

  gpuErrchk(cudaMemcpy(results, d_results, num_permutations * sizeof(float), cudaMemcpyDeviceToHost));

  // find the best permutation - could reduce on GPU
  unsigned int best_permutation = 0;
  float best_improvement = 0.0f;
  for (unsigned int p = 1; p < num_permutations; ++p) {
    float cur_improvement = results[p] - results[0];
    if (best_improvement < cur_improvement) {
      best_permutation = p;
      best_improvement = cur_improvement;
    }
  }

  *improvement = best_improvement;
  *permutation = best_permutation;

  return 0;
}

///////////////////////////////////////////////////////////

/**********************************************************
 * Get the magnitude of a matrix after applying 2:4
 **********************************************************/
// find the magnitude after enforcing the 2:4 sparsity constraint on a subset of the columns of an input matrix
__global__ void subset_sum_after_2_to_4(float* matrix, unsigned int rows, unsigned int cols, unsigned int start_col,
                                        unsigned int end_col, float* output) {
  // vectorize
  float4* mat4 = (float4*)matrix;
  cols /= 4;
  start_col /= 4;
  end_col /= 4;

  // each thread in a block takes some number of rows
  size_t num_rows = max((int)ceilf((float)rows / (float)blockDim.x), 1);
  size_t row_offset = num_rows * threadIdx.x;
  // each block takes some number of columns
  size_t num_cols = (end_col - start_col) / gridDim.x;
  size_t col_offset = num_cols * blockIdx.x;
  start_col += col_offset;
  end_col = start_col + num_cols;

  float sum = 0.0f;
  for (unsigned int r = row_offset; r < row_offset + num_rows; ++r) {
    if (r < rows) {
      for (unsigned int c = start_col; c < end_col; c++) {
        sum += group_2_to_4(mat4[r * cols + c]);
      }
    }
  }

  atomicAdd(output, sum);
}

// build the entire permute map at once
// each block handles one group of stripes
// each threads in the block handle all handle the same permutation at the same time on different rows before moving to
// the next permutation
__global__ void build_permute_map(float* matrix, unsigned int rows, unsigned int cols, unsigned int* stripes,
                                  unsigned int group_width, unsigned int* permutations, unsigned int num_permutations,
                                  unsigned int perm_length, float* output, unsigned int* best_indices) {
  // vectorize
  float4* mat4 = (float4*)matrix;
  cols /= 4;

  // each block handles a group of stripes
  unsigned int* stripe_group = (unsigned int*)&stripes[blockIdx.x * group_width];

  // shared memory: 32 threads each need 16*2
  extern __shared__ float pm_shared[32][32];
  float4* local_stripes = (float4*)&pm_shared[threadIdx.x];
  float* local_columns = (float*)&pm_shared[threadIdx.x];
  float4* permuted_stripes = (float4*)&local_stripes[4];
  float* permuted_columns = (float*)&local_columns[16];

  // each thread handles all permutations in the row before moving on to the next row
  size_t num_rows = max((int)ceilf((float)rows / (float)blockDim.x), 1);
  size_t row_offset = num_rows * threadIdx.x;

  for (unsigned int r = row_offset; r < row_offset + num_rows; ++r) {
    if (r >= rows) break;

    // load a row into smem
    for (unsigned int s = 0; s < group_width; ++s) {
      unsigned int const stripe = stripe_group[s];
      local_stripes[s] = mat4[r * cols + stripe];
    }

    for (unsigned int p = 0; p < num_permutations; ++p) {
      unsigned int* permutation = &permutations[p * perm_length];
      float sum = 0.0f;

// permute
#pragma unroll 4
      for (unsigned int c = 0; c < group_width * 4; ++c) {
        permuted_columns[c] = local_columns[permutation[c]];
      }

      // sum 2:4
      for (unsigned int s = 0; s < group_width; ++s) {
        sum += group_2_to_4(permuted_stripes[s]);
      }

      // update the running sum for this stripe group's permutation
      atomicAdd(&output[blockIdx.x * num_permutations + p], sum);
    }
  }

  // at this point, each permutation's sum in this stripe group has been calculated
  // now, find the best option
  __syncthreads();

  if (threadIdx.x == 0) {
    unsigned int best_permutation = 0;
    float best_magnitude = output[blockIdx.x * num_permutations];
    float base_magnitude = best_magnitude;

    // #pragma unroll 32
    for (unsigned int p = 1; p < num_permutations; ++p) {
      float magnitude = output[blockIdx.x * num_permutations + p];
      if (magnitude > best_magnitude) {
        best_permutation = p;
        best_magnitude = magnitude;
      }
    }

    output[blockIdx.x * num_permutations] = best_magnitude - base_magnitude;
    best_indices[blockIdx.x] = best_permutation;
  }
}

void free_sum_after_2_to_4_memory(float** dmatrix, float** dresult) {
  cudaFree(*dmatrix);
  cudaFree(*dresult);
}

int set_up_sum_after_2_to_4_memory(float** dmatrix, unsigned int rows, unsigned int cols, float** dresult) {
  static unsigned int setupRows = 0;
  static unsigned int setupCols = 0;
  static bool allocated = false;

  int fresh_allocation = 0;
  if (!allocated || setupRows != rows || setupCols != cols) {
    if (allocated) free_sum_after_2_to_4_memory(dmatrix, dresult);

    gpuErrchk(cudaMalloc((void**)dmatrix, rows * cols * sizeof(float)));
    gpuErrchk(cudaMalloc((void**)dresult, sizeof(float)));

    setupRows = rows;
    setupCols = cols;

    fresh_allocation = 1;
  }

  allocated = true;

  return fresh_allocation;
}

int run_subset_sum_after_2_to_4(py::array_t<float>& py_matrix, unsigned int rows, unsigned int cols,
                                unsigned int start_col, unsigned int end_col, unsigned int blocks, unsigned int threads,
                                py::array_t<float>& py_output) {
  static float* d_matrix;
  static float* d_result;

  int fresh_allocation = set_up_sum_after_2_to_4_memory(&d_matrix, rows, cols, &d_result);

  float* matrix = float_ptr_from_numpy(py_matrix);
  float* output = float_ptr_from_numpy(py_output);

  gpuErrchk(cudaMemcpy(d_matrix, matrix, rows * cols * sizeof(float), cudaMemcpyHostToDevice));
  gpuErrchk(cudaMemset(d_result, 0, sizeof(float)));

  subset_sum_after_2_to_4<<<blocks, threads>>>(d_matrix, rows, cols, start_col, end_col, d_result);
  gpuErrchk(cudaDeviceSynchronize());

  gpuErrchk(cudaMemcpy(output, d_result, sizeof(float), cudaMemcpyDeviceToHost));

  return 0;
}

void set_up_permute_map_memory(float** dmatrix, unsigned int rows, unsigned int cols, unsigned int** dstripes,
                               unsigned int num_groups, unsigned int group_width, unsigned int** dpermutations,
                               unsigned int num_permutations, unsigned int perm_length, float** doutput,
                               unsigned int** dindices, float** hresult, unsigned int** hindices) {
  static unsigned int setUpRows = 0;
  static unsigned int setUpCols = 0;
  static unsigned int setUpGroupWidth = 0;
  static unsigned int setUpNumGroups = 0;
  static unsigned int setUpNumPerms = 0;
  static unsigned int setUpPermLength = 0;

  if (setUpRows != rows || setUpCols != cols) {
    if (*dmatrix != NULL) {
      gpuErrchk(cudaFree(*dmatrix));
      *dmatrix = NULL;
    }
    gpuErrchk(cudaMalloc((void**)dmatrix, rows * cols * sizeof(float)));
  }

  if (setUpGroupWidth < group_width || setUpNumGroups < num_groups) {
    if (*dstripes != NULL) {
      gpuErrchk(cudaFree(*dstripes));
      *dstripes = NULL;
    }
    gpuErrchk(cudaMalloc((void**)dstripes, num_groups * group_width * sizeof(unsigned int)));

    if (setUpNumGroups < num_groups) {
      if (*dindices != NULL) {
        gpuErrchk(cudaFree(*dindices));
        *dindices = NULL;
      }
      gpuErrchk(cudaMalloc((void**)dindices, num_groups * sizeof(unsigned int)));
      if (*hindices != NULL) {
        free(*hindices);
        *hindices = NULL;
      }
      *hindices = (unsigned int*)malloc(num_groups * sizeof(unsigned int));
    }
  }

  if (setUpNumPerms < num_permutations || setUpPermLength < perm_length) {
    if (*dpermutations != NULL) {
      gpuErrchk(cudaFree(*dpermutations));
      *dpermutations = NULL;
    }
    gpuErrchk(cudaMalloc((void**)dpermutations, perm_length * num_permutations * sizeof(unsigned int)));
  }

  if (setUpNumPerms < num_permutations || setUpNumGroups < num_groups) {
    if (*doutput != NULL) {
      gpuErrchk(cudaFree(*doutput));
      *doutput = NULL;
    }
    gpuErrchk(cudaMalloc((void**)doutput, num_permutations * num_groups * sizeof(float)));
    if (*hresult != NULL) {
      free(*hresult);
      *hresult = NULL;
    }
    *hresult = (float*)malloc(num_permutations * num_groups * sizeof(float));
  }

  setUpRows = rows;
  setUpCols = cols;
  setUpGroupWidth = group_width;
  setUpNumGroups = num_groups;
  setUpNumPerms = num_permutations;
  setUpPermLength = perm_length;
}

int run_build_permute_map(py::array_t<float>& py_matrix, unsigned int rows, unsigned int cols,
                          py::array_t<unsigned int>& py_stripes, unsigned int num_groups, unsigned int group_width,
                          py::array_t<unsigned int>& py_permutations, unsigned int perm_length,
                          py::array_t<float>& py_improvements, py::array_t<unsigned int>& py_best_indices) {
  static float* d_matrix = NULL;
  static unsigned int* d_stripes = NULL;
  static unsigned int* d_permutations = NULL;
  static float* d_output = NULL;
  static unsigned int* d_indices = NULL;
  static float* hresult = NULL;
  static unsigned int* hindices = NULL;

  const unsigned int num_permutations = py_permutations.size() / perm_length;

  const unsigned int MAX_GROUPS_PER_LAUNCH = num_permutations <= 5775 ? 1820 : 40;
  const unsigned int full_launches = num_groups / MAX_GROUPS_PER_LAUNCH;
  const unsigned int final_launch = num_groups % MAX_GROUPS_PER_LAUNCH;
  const unsigned int launches = full_launches + (final_launch != 0 ? 1 : 0);

  set_up_permute_map_memory(&d_matrix, rows, cols, &d_stripes, min(num_groups, MAX_GROUPS_PER_LAUNCH), group_width,
                            &d_permutations, num_permutations, perm_length, &d_output, &d_indices, &hresult, &hindices);

  float* matrix = float_ptr_from_numpy(py_matrix);
  unsigned int* stripes = uint_ptr_from_numpy(py_stripes);
  unsigned int* permutations = uint_ptr_from_numpy(py_permutations);
  float* improvements = float_ptr_from_numpy(py_improvements);
  unsigned int* best_indices = uint_ptr_from_numpy(py_best_indices);

  gpuErrchk(cudaMemcpy(d_matrix, matrix, rows * cols * sizeof(float), cudaMemcpyHostToDevice));
  gpuErrchk(cudaMemcpy(d_permutations, permutations, num_permutations * perm_length * sizeof(unsigned int),
                       cudaMemcpyHostToDevice));

  unsigned int group_offset = 0;
  for (unsigned int l = 0; l < launches; ++l) {
    unsigned int groups_this_launch = (l < full_launches) ? MAX_GROUPS_PER_LAUNCH : final_launch;

    gpuErrchk(cudaMemcpy(d_stripes, &stripes[group_offset * group_width],
                         groups_this_launch * group_width * sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemset(d_output, 0, groups_this_launch * num_permutations * sizeof(float)));
    gpuErrchk(cudaMemset(d_indices, 0, groups_this_launch * sizeof(unsigned int)));

    unsigned int shmem = 32 * (32) * sizeof(float);
    build_permute_map<<<groups_this_launch, 32, shmem>>>(d_matrix, rows, cols, d_stripes, group_width, d_permutations,
                                                         num_permutations, perm_length, d_output, d_indices);
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(
        cudaMemcpy(hresult, d_output, num_permutations * groups_this_launch * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(hindices, d_indices, groups_this_launch * sizeof(unsigned int), cudaMemcpyDeviceToHost));

    // thread0 stuck the minimum in the first slot of each group
    for (unsigned int g = 0; g < groups_this_launch; ++g) {
      improvements[group_offset + g] = hresult[g * num_permutations];
      best_indices[group_offset + g] = hindices[g];
    }

    group_offset += groups_this_launch;
  }

  return 0;
}

/**********************************************************
 * Build the swap map for channel_swaps
 **********************************************************/
// find the magnitude improvement if some columns were swapped (check all pairs of columns in all the stripe_pairs)
__global__ void swap_columns_sum_after_2_to_4(float* matrix, unsigned int rows, unsigned int cols,
                                              unsigned int* stripe_pairs, float* output) {
  // vectorize
  float4* mat4 = (float4*)matrix;
  cols /= 4;

  // each thread takes some number of rows
  size_t const num_rows = max((int)ceilf((float)rows / (float)blockDim.x), 1);
  size_t const row_offset = num_rows * threadIdx.x;

  // each block is repsonsible for a pair of stripes
  unsigned int const stripe0 = stripe_pairs[2 * blockIdx.x];
  unsigned int const stripe1 = stripe_pairs[2 * blockIdx.x + 1];
  // space for 32 threads, 8 values (2 stripes) in use at a time, plus 16 partial sums and one base sum
  extern __shared__ float cs[32][32];
  float4* local_stripe0 = (float4*)&cs[threadIdx.x][0];
  float* local_cols0 = (float*)&cs[threadIdx.x][0];
  float4* local_stripe1 = (float4*)&cs[threadIdx.x][4];
  float* local_cols1 = (float*)&cs[threadIdx.x][4];
  float* local_psum = (float*)&cs[threadIdx.x][8];
  float* base_psum = (float*)&cs[threadIdx.x][24];

  *base_psum = 0.0f;
  for (unsigned int s = 0; s < 16; ++s) {
    local_psum[s] = 0.0f;
  }

  for (unsigned int r = row_offset; r < row_offset + num_rows; ++r) {
    if (r >= rows) break;
    *local_stripe0 = mat4[r * cols + stripe0];
    *local_stripe1 = mat4[r * cols + stripe1];
    *base_psum += group_2_to_4(*local_stripe0) + group_2_to_4(*local_stripe1);
    unsigned int swap_idx = 0;
    for (unsigned int c0 = 0; c0 < 4; ++c0) {
      for (unsigned int c1 = 0; c1 < 4; ++c1) {
        // swap c0 and c1
        float tmp = local_cols0[c0];
        local_cols0[c0] = local_cols1[c1];
        local_cols1[c1] = tmp;

        // grab the sum
        local_psum[swap_idx] += group_2_to_4(*local_stripe0) + group_2_to_4(*local_stripe1);

        // swap back
        local_cols1[c1] = local_cols0[c0];
        local_cols0[c0] = tmp;

        swap_idx++;
      }
    }
  }

  // reduce partial sums, store local diffs in the output
  __syncthreads();
  if (threadIdx.x == 0) {
    for (unsigned int t = 1; t < blockDim.x; ++t) {
      for (unsigned int swap = 0; swap < 16; ++swap) {
        local_psum[swap] += cs[t][8 + swap];
      }
      *base_psum += cs[t][24];
    }

    for (unsigned int swap = 0; swap < 16; ++swap) {
      atomicAdd(&output[blockIdx.x * 16 + swap], local_psum[swap] - (*base_psum));
    }
  }
}

void set_up_swap_map_memory(float** dmatrix, unsigned int rows, unsigned int cols, unsigned int** dstripe_pairs,
                            unsigned int num_pairs, float** dresult) {
  static unsigned int setupRows = 0;
  static unsigned int setupCols = 0;
  static unsigned int setupPairs = 0;

  if (*dmatrix == NULL || setupRows != rows || setupCols != cols) {
    if (*dmatrix != NULL) {
      gpuErrchk(cudaFree(*dmatrix));
      *dmatrix = NULL;
    }
    gpuErrchk(cudaMalloc((void**)dmatrix, rows * cols * sizeof(float)));
    setupRows = rows;
    setupCols = cols;
  }

  if (*dstripe_pairs == NULL || *dresult == NULL || setupPairs < num_pairs) {
    if (*dstripe_pairs != NULL) {
      gpuErrchk(cudaFree(*dstripe_pairs));
      *dstripe_pairs = NULL;
    }
    if (*dresult != NULL) {
      gpuErrchk(cudaFree(*dresult));
      *dresult = NULL;
    }
    gpuErrchk(cudaMalloc((void**)dstripe_pairs, num_pairs * 2 * sizeof(unsigned int)));
    gpuErrchk(cudaMalloc((void**)dresult, num_pairs * 16 * sizeof(float)));

    setupPairs = num_pairs;
  }
}

int run_build_swap_map(py::array_t<float>& py_matrix, unsigned int rows, unsigned int cols,
                       py::array_t<uint32_t>& py_stripe_pairs, py::array_t<float>& py_output) {
  static float* d_matrix = NULL;
  static float* d_result = NULL;
  static unsigned int* d_stripe_pairs = NULL;

  float* matrix = float_ptr_from_numpy(py_matrix);                    //(float*)py_matrix.data();
  unsigned int* stripe_pairs = uint_ptr_from_numpy(py_stripe_pairs);  //(unsigned int*)py_stripe_pairs.data();
  float* output = float_ptr_from_numpy(py_output);                    //(float*)py_output.data();

  unsigned int num_pairs = py_stripe_pairs.size() / 2;

  set_up_swap_map_memory(&d_matrix, rows, cols, &d_stripe_pairs, num_pairs, &d_result);
  gpuErrchk(cudaMemcpy(d_matrix, matrix, rows * cols * sizeof(float), cudaMemcpyHostToDevice));
  gpuErrchk(cudaMemcpy(d_stripe_pairs, stripe_pairs, 2 * num_pairs * sizeof(unsigned int), cudaMemcpyHostToDevice));
  gpuErrchk(cudaMemset(d_result, 0, num_pairs * 16 * sizeof(float)));

  unsigned int shmem = 32 * (32) * sizeof(float);
  swap_columns_sum_after_2_to_4<<<num_pairs, 32, shmem>>>(d_matrix, rows, cols, d_stripe_pairs, d_result);
  gpuErrchk(cudaDeviceSynchronize());

  gpuErrchk(cudaMemcpy(output, d_result, num_pairs * 16 * sizeof(float), cudaMemcpyDeviceToHost));

  return 0;
}
///////////////////////////////////////////////////////////

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("sum_after_2_to_4", &run_subset_sum_after_2_to_4, "matrix sum after applying 2:4 (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("build_permute_map", &run_build_permute_map, "optimize stripe groups (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("check_permutations", &run_check_permutations, "exhaustively check all permutations (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("build_swap_map", &run_build_swap_map, "channel swaps (CUDA)", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: apex/contrib/sparsity/permutation_search_kernels/__init__.py
================================================
from .call_permutation_search_kernels import accelerated_search_for_good_permutation
from .permutation_utilities import sum_after_2_to_4


================================================
FILE: apex/contrib/sparsity/permutation_search_kernels/call_permutation_search_kernels.py
================================================
import numpy as np
from .permutation_utilities import *
from .exhaustive_search import Exhaustive_Search


def accelerated_search_for_good_permutation(matrix_group, options=None, verbosity=0):
    """This function is used to call the permutation search CUDA kernels.
    users can provide prefer search strategy by providing a valid 'options' as a dictionary,
    or users can implement their customized 'accelerated_search_for_good_permutation' function.
    """
    input_matrix = matrix_group.cpu().detach().numpy()
    if verbosity > 1:
        print(
            "\n[accelerated_search_for_good_permutation] input matrix shape: '{:}'.".format(
                input_matrix.shape
            )
        )

    result = np.copy(input_matrix)
    # init a sequential permutation search sequence
    input_channel_num = matrix_group.size(1)
    permutation_sequence = [n for n in range(input_channel_num)]
    duration = 0.0

    if options == None:
        options = {}
    if (
        "strategy" not in options
    ):  # right now, the default permutation search strategy is: 'exhaustive' search
        options["strategy"] = "exhaustive"

    if verbosity > 1:
        print(
            "[accelerated_search_for_good_permutation] the permutation strategy is: '{:} search'.".format(
                options["strategy"]
            )
        )

    # define sub options for each search strategy
    if options["strategy"] == "exhaustive":
        # right now, the default options for 'exhaustive' search is: 'exhaustive,8,100'
        if "stripe_group_size" not in options:
            options["stripe_group_size"] = 8
        if "escape_attempts" not in options:
            options["escape_attempts"] = 100
    elif options["strategy"] == "progressive channel swap":
        # just swaps meaningful channels, keeping the good swaps, until the search time limit expires.
        if "progressive_search_time_limit" not in options:
            options["progressive_search_time_limit"] = 60
        if "improvement_threshold" not in options:
            options["improvement_threshold"] = 1e-9

    # execute the requested strategy
    if options["strategy"] == "exhaustive":
        result, duration, permutation_sequence = Exhaustive_Search(
            result,
            stripe_group_size=options["stripe_group_size"],
            escape_attempts=options["escape_attempts"],
        )
    elif options["strategy"] == "progressive channel swap":
        real_swap_num = 0
        start_time = time.perf_counter()
        while time.perf_counter() - start_time < options["progressive_search_time_limit"]:
            src = np.random.randint(result.shape[1])
            dst = np.random.randint(result.shape[1])
            src_group = int(src / 4)
            dst_group = int(dst / 4)
            if src_group == dst_group:  # channel swapping within a stripe does nothing
                continue
            new_sum, improvement = try_swap(result, dst, src)
            if improvement > options["improvement_threshold"]:
                result[..., [src, dst]] = result[..., [dst, src]]
                permutation_sequence[src], permutation_sequence[dst] = (
                    permutation_sequence[dst],
                    permutation_sequence[src],
                )
                real_swap_num += 1
        duration = time.perf_counter() - start_time
        if verbosity > 1:
            print(
                "\tFinally swap {} channel pairs until the search time limit expires.".format(
                    real_swap_num
                )
            )
    elif (
        options["strategy"] == "user defined"
    ):  # need to get the permutated matrix (result) by applying customized permutation search function
        if verbosity > 1:
            print(
                "[accelerated_search_for_good_permutation] Use the user customized permutation search function!"
            )
    else:
        if verbosity >= 0:
            print(
                "[accelerated_search_for_good_permutation] Cannot find the implementation of the required strategy!"
            )

    if verbosity > 1:
        print(
            "[accelerated_search_for_good_permutation] Take {:.4f} seconds to search the permutation sequence.".format(
                duration
            )
        )

    return permutation_sequence


================================================
FILE: apex/contrib/sparsity/permutation_search_kernels/channel_swap.py
================================================
from .permutation_utilities import *

################################################################################################################
# Greedy Channel Swaps - iterative, deterministic, can be parallelized
#   1. Build a map of the magnitude improvement of involved stripes for all pairs of channel swaps
#   2. Sort the map, march through by decreasing improvement, skipping entries whose stripes have been modified
#   3. Repeat until there's no entry with positive improvement (convergence)
################################################################################################################


## try swapping columns and tracking magnitude after pruning
def try_swap(matrix, dst, src):
    src_base = sum_after_2_to_4(matrix[..., int(src / 4) * 4 : int(src / 4) * 4 + 4])
    dst_base = sum_after_2_to_4(matrix[..., int(dst / 4) * 4 : int(dst / 4) * 4 + 4])

    # swap
    matrix[..., [src, dst]] = matrix[..., [dst, src]]

    # check the Nx4 slices of the swapped columns
    src_sum = sum_after_2_to_4(matrix[..., int(src / 4) * 4 : int(src / 4) * 4 + 4])
    dst_sum = sum_after_2_to_4(matrix[..., int(dst / 4) * 4 : int(dst / 4) * 4 + 4])

    # swap back
    matrix[..., [src, dst]] = matrix[..., [dst, src]]

    return src_sum + dst_sum, (src_sum + dst_sum) - (src_base + dst_base)


## convert stripe and a swap indices to columns
def stripes_and_swap_idx_to_columns(stripe0, stripe1, idx):
    i = 0
    for c0 in range(4):
        for c1 in range(4):
            if i == idx:
                return stripe0 * 4 + c0, stripe1 * 4 + c1
            i += 1
    return None


## convert columns to stripe and swap indices
def columns_to_stripes_and_swap_idx(col0, col1):
    stripe0 = int(col0 / 4)
    col0 %= 4
    stripe1 = int(col1 / 4)
    col1 %= 4

    idx = 0
    for c0 in range(4):
        for c1 in range(4):
            if c0 == col0 and c1 == col1:
                return stripe0, stripe1, idx
            idx += 1
    return None


## build a list of stripe pairs that need their benefits recomputed because one stripe was modified
def build_stripe_pairs(matrix, used_stripes):
    stripe_pairs = []
    total_stripes = int(matrix.shape[1] / 4)

    used_stripes = np.sort(used_stripes)
    for stripe0 in range(total_stripes - 1):
        for stripe1 in range(stripe0, total_stripes):
            if stripe0 in used_stripes or stripe1 in used_stripes:
                stripe_pairs.append([stripe0, stripe1])

    return np.asarray(stripe_pairs)


## compute the benefit of swapping each pair of columns in the matrix using the GPU
## only update stripes' columns that appear in used_stripes to avoid unnecessary computations
def compute_swap_map(matrix, used_stripes):
    do_gpu = use_gpu()
    assert do_gpu

    stripe_pairs = build_stripe_pairs(matrix, used_stripes).astype(np.uint32)
    matrix_view = matrix.astype(np.float32).flatten()
    stripe_pairs_view = stripe_pairs.flatten()
    output = np.zeros((len(stripe_pairs) * 16), dtype=np.float32).flatten()
    result = permutation_search_cuda_kernels.build_swap_map(
        matrix_view, matrix.shape[0], matrix.shape[1], stripe_pairs_view, output
    )

    # translate the flat array from the GPU to a map
    pair_improvement_map = {}
    for i, pair in enumerate(stripe_pairs):
        for swap_idx in range(16):
            col0, col1 = stripes_and_swap_idx_to_columns(pair[0], pair[1], swap_idx)
            pair_improvement_map[(col0, col1)] = output[i * 16 + swap_idx]
    return pair_improvement_map


## build the full swap map
def build_swap_map(matrix, swap_map, swap_ids, used_stripes, verbosity):
    improvements = None

    # if we have a GPU and built kernels, pre-compute the needed values
    do_gpu = use_gpu()
    if do_gpu:
        if len(swap_map) == 0:
            used_stripes = [s for s in range(int(matrix.shape[1] / 4))]
        improvements = compute_swap_map(matrix, used_stripes)

    idx = 0
    updates = 0
    for src in range(matrix.shape[1] - 1):  # parallelize these loops
        for dst in range(src + 1, matrix.shape[1]):
            # swapping within a stripe does nothing
            if int(src / 4) == int(dst / 4):
                continue

            # if we touched this stripe last time, update it
            if (
                (int(src / 4) in used_stripes)
                or (int(dst / 4) in used_stripes)
                or len(swap_map) <= idx
            ):
                tmp_improvement = 0.0

                # use the pre-computed values from the GPU if possible, otherwise compute on the CPU
                if do_gpu:
                    tmp_improvement = improvements[(src, dst)]
                else:
                    tmp_mag, tmp_improvement = try_swap(matrix, src, dst)
                updates += 1

                if len(swap_map) <= idx:
                    swap_map.append(tmp_improvement)
                    swap_ids.append((src, dst))
                else:
                    swap_map[idx] = tmp_improvement
                    swap_ids[idx] = (src, dst)

            idx += 1

    if verbosity > 15:
        print(f"\tupdated {updates} map entries")
    return swap_map, swap_ids


def use_swap_map(
    matrix,
    swap_map,
    swap_ids,
    threshold,
    used_escape_attempts,
    escape_attempts,
    permutation,
    verbosity,
):
    used_stripes = []
    swaps = 0
    improvement = 0.0

    # set the traversal order and threshold
    ix = np.flip(np.argsort(swap_map))  # small to large -> large to small
    threshold = min(max(swap_map[ix[0]] * threshold, 0.0001), 1.0)

    # iterate through the potential swaps in benefit order
    for swap in range(len(ix)):
        swap_id = ix[swap]
        src = swap_ids[swap_id][0]
        dst = swap_ids[swap_id][1]

        # early-out of swaps that are below the threshold (don't be so greedy)
        if swap_map[ix[swap]] < threshold:
            # see if an arbitrary swap helps things if we've converged
            if len(used_stripes) == 0 and used_escape_attempts < escape_attempts:
                swap_id = np.random.randint(len(swap_ids))
                if verbosity > 15:
                    print(
                        f"converged, attempt #{used_escape_attempts + 1} to jiggle out, using index {swap_id} into the sorted list={ix[swap_id]}"
                    )
                swap_id = ix[swap_id]
                src = swap_ids[swap_id][0]
                dst = swap_ids[swap_id][1]
                used_escape_attempts += 1
            else:
                break

        # skip swaps that include a stripe we've already modified
        if int(src / 4) in used_stripes or int(dst / 4) in used_stripes:
            continue

        # we'll need to update these stripes later
        used_stripes.append(int(src / 4))
        used_stripes.append(int(dst / 4))

        # make the swap
        if verbosity > 20:
            print(f"\t{swap}\t{src},{dst}  {swap_map[swap_id]:.4f}")
        matrix[..., [src, dst]] = matrix[..., [dst, src]]
        permutation[src], permutation[dst] = permutation[dst], permutation[src]
        improvement += swap_map[swap_id]
        swaps += 1

    return (
        matrix,
        swaps,
        swap_map,
        swap_ids,
        used_stripes,
        improvement,
        used_escape_attempts,
        permutation,
    )


def Channel_Swap(matrix, escape_attempts=0, verbosity=0, permutation=None):
    threshold = 0.00001
    used_escape_attempts = 0

    # initialize
    if permutation is None:
        permutation = [c for c in range(matrix.shape[1])]
    swap_map = []
    swap_ids = []
    used_stripes = []
    swap_count = 0
    iterations = 0
    agg_improvement = 0.0
    cur_total_sum = sum_after_2_to_4(matrix)
    start_time = time.perf_counter()

    # do the work
    swapped = 1  # just start with nonzero value to fall into the loop
    while swapped > 0:
        swap_map, swap_ids = build_swap_map(matrix, swap_map, swap_ids, used_stripes, verbosity)
        (
            matrix,
            swapped,
            swap_map,
            swap_ids,
            used_stripes,
            improvement,
            used_escape_attempts,
            permutation,
        ) = use_swap_map(
            matrix,
            swap_map,
            swap_ids,
            threshold,
            used_escape_attempts,
            escape_attempts,
            permutation,
            verbosity,
        )
        agg_improvement += improvement

        # keep track of statistics, print occasionally
        swap_count += swapped
        if verbosity > 10:
            iterations += 1
            cur_total_sum += agg_improvement
            duration = time.perf_counter() - start_time
            print(
                f"\t{iterations:8} {cur_total_sum:7.2f} {agg_improvement:7.2f} {swap_count:4} {agg_improvement / max(swap_count, 1):5.2f} {duration:7.2f}"
            )
            agg_improvement = 0.0
            swap_count = 0

    # final status
    seconds = time.perf_counter() - start_time

    return matrix, seconds, permutation


================================================
FILE: apex/contrib/sparsity/permutation_search_kernels/exhaustive_search.py
================================================
from .permutation_utilities import *

ASP_CACHE_DIR_ENV_VAR = "APEX_ASP_CACHE_DIR"
ASP_CACHE_DIR_DEFAULT = ".cache"

################################################################################################################
# Exhaustive
#   Try them all
#   - order of columns within a group doesn't matter
#   - order of groups doesn't matter
#   - we can eliminate effective duplicates by defining aunique combination to be a sorted list of sorted groups
################################################################################################################

####################################################################
# generate unique permutations
####################################################################


# check if adding a column index to a current permutation would keep it in canonical form
# assumes that perm is in canonical form already!
def is_canonical(perm, col):
    # if it's a new group
    if len(perm) % 4 == 0:
        # every column ID < col needs to be in the permutation already
        for val in range(col):
            if val not in perm:
                return False
        # this new group needs to be sorted w.r.t. the previous group
        return col > perm[-4]

    # not a new group, just check to see if it will still be sorted
    return col > perm[-1]


# recursive: build a unique permutation one column index at a time
def generate_unique_combinations(
    built_permutation, remaining_columns, full_permutation_list, group_width
):
    # base case: nothing else to add
    if len(remaining_columns) == 0:
        full_permutation_list.append(np.copy(built_permutation))
        if len(full_permutation_list) % 1000000 == 0:
            print(f"{len(full_permutation_list)} unique permutations found so far")

    # still more choices to make, so add each remaining column in turn column if it keeps everything sorted
    else:
        for c in range(len(remaining_columns)):
            # to satisfy our immutables (values within groups are sorted, groups are globally sorted),
            # only add this column if either:
            #   it's starting a new group and is larger than the previous group's first entry
            #   OR
            #   it's larger than the last value in the built_permutation
            col_to_add = remaining_columns[c]

            if is_canonical(built_permutation, col_to_add):
                # add the column to the running permutation, remove it from remaining columns
                built_permutation.append(col_to_add)
                remaining_columns.pop(c)
                # recurse
                generate_unique_combinations(
                    built_permutation,
                    remaining_columns,
                    full_permutation_list,
                    group_width,
                )
                # remove the most recent column and put it back on the remaining column list where we found it (sorted)
                remaining_columns.insert(c, built_permutation.pop(-1))


import os
from os import path

unique_permutation_list = {}


def generate_all_unique_combinations(C, M, must_use_all_groups=False):
    cache_dir_path = os.getenv(ASP_CACHE_DIR_ENV_VAR, ASP_CACHE_DIR_DEFAULT)
    cache_file_path = path.join(cache_dir_path, f"permutations_{C}_{M}.npy")

    global unique_permutation_list
    if (C, M) not in unique_permutation_list:
        if path.exists(cache_file_path):
            unique_permutation_list[(C, M)] = np.load(cache_file_path, allow_pickle=False)

        else:
            full_permutation_list = []
            generate_unique_combinations([0], [c for c in range(1, C)], full_permutation_list, M)
            unique_permutation_list[(C, M)] = full_permutation_list
            if not path.exists(cache_dir_path):
                os.makedirs(cache_dir_path)
            np.save(cache_file_path, full_permutation_list, allow_pickle=False)

    unique_permutations = unique_permutation_list[(C, M)]

    return unique_permutations


# analytical solution
import math


def predict_unique_combinations(C, M):
    assert C % M == 0
    G = int(C / M)
    return int(int(math.factorial(C)) / (int(math.pow(math.factorial(M), G)) * math.factorial(G)))


#################################################################
# exhaustively try all unique permutations
#################################################################


# exhaustively search the entire matrix
def search_matrix(matrix, group_width):
    # give up quickly if we'd go on forever
    prediction = predict_unique_combinations(matrix.shape[1], group_width)
    best_permutation = [c for c in range(matrix.shape[1])]
    if prediction > 1e10:
        print(
            f"There are {prediction} unique combinations with {matrix.shape[1]} columns and a group width of {group_width}, not searching."
        )
        return matrix, prediction, best_permutation

    start_time = time.perf_counter()
    full_permutation_list = generate_all_unique_combinations(matrix.shape[1], group_width)

    # found them, now try them
    best_improvement = 0.0
    use_cuda = use_gpu()
    if (
        use_cuda and matrix.shape[1] >= 8 and group_width == 4
    ):  # CUDA path only works for a group width of 4
        best_improvement, best_permutation = try_permutations_on_matrix(
            matrix, full_permutation_list
        )
    else:
        base_sum = sum_after_2_to_4(matrix)
        for i in range(1, len(full_permutation_list)):
            permutation = full_permutation_list[i]
            permuted = matrix[:, permutation]
            cur_improvement = sum_after_2_to_4(permuted) - base_sum

            if cur_improvement > best_improvement:
                best_improvement = cur_improvement
                best_permutation = permutation
    seconds = time.perf_counter() - start_time
    return matrix[:, best_permutation], seconds, best_permutation, best_improvement


#############
# Stripe group handling
#############


# gather stripes from a larger matrix into a single matrix
def collect_stripes(matrix, stripes, group_width):
    subset = np.zeros((matrix.shape[0], len(stripes) * group_width))
    for s, stripe in enumerate(stripes):
        subset[..., s * group_width : s * group_width + group_width] = matrix[
            ..., stripe * group_width : stripe * group_width + group_width
        ]
    return subset


# apply the stripe group permutation to the entire permutation
def apply_stripe_group_permutation(sgp, stripes, group_width, permutation):
    new_permutation = permutation.copy()
    for subset_idx in range(len(sgp)):
        dst_stripe_idx = stripes[int(subset_idx / group_width)]
        dst_col_idx = subset_idx % group_width

        subset_val = sgp[subset_idx]
        src_stripe_idx = stripes[int(subset_val / group_width)]
        src_col_idx = subset_val % group_width

        new_permutation[dst_stripe_idx * group_width + dst_col_idx] = permutation[
            src_stripe_idx * group_width + src_col_idx
        ]

    return new_permutation


# generate all possible stripe groups
def generate_stripe_groups(num_stripes, window_size):
    stripe_array = [[c] for c in range(num_stripes)]

    next_stripe_array = []
    for w in range(1, window_size):
        for g in range(len(stripe_array)):
            start_c = stripe_array[g][w - 1] + 1
            group = stripe_array[g]
            for c in range(start_c, num_stripes):
                new_group = group.copy()
                new_group.append(c)
                next_stripe_array.append(new_group)
        stripe_array = next_stripe_array
        next_stripe_array = []

    return set(tuple(stripe_array[g]) for g in range(len(stripe_array)))


# It is not safe to just reset the stripe_set as None here.
# When calling the Exhaustive_Search in E2E search, the stripe_set will not be reset as None.
stripe_set = None
stripe_set_config = None


# build the stripe map
def build_stripe_map(
    matrix, group_width, window_size, stripe_map, stripe_ids, perm_map, used_stripes
):
    global stripe_set, stripe_set_config

    window_size = int(window_size / group_width)

    if (
        stripe_set is None
        or stripe_set_config is None
        or stripe_set_config != (group_width, window_size)
    ):
        num_stripes = int(matrix.shape[1] / group_width)
        assert group_width * num_stripes == matrix.shape[1]
        stripe_set = generate_stripe_groups(num_stripes, window_size)
        stripe_set_config = (group_width, window_size)

    # step through each, update the stripe_map/stripe_ids if necessary
    updates = 0
    use_cuda = use_gpu()
    gpu_list = []
    gpu_groups = []
    for i, s in enumerate(stripe_set):
        sg = []  # build the group of stripes, check if any members changed
        need_update = i >= len(stripe_map)
        for stripe in s:
            sg.append(stripe)
            if stripe in used_stripes:
                need_update = True

        # pre-populate if we're building fresh
        if i >= len(stripe_map):
            stripe_ids.append(sg)
            stripe_map.append(0.0)
            perm_map.append([c for c in range(group_width * window_size)])

        # update entries if needed (only stripe_map and perm_map)
        if need_update:
            updates += 1

            if not use_cuda:  # do the work here if using the CPU
                subset = collect_stripes(matrix, sg, group_width)
                sub_result, sub_duration, permutation, improvement = search_matrix(
                    subset, group_width
                )
                stripe_map[i] = improvement
                perm_map[i] = permutation
            else:  # otherwise, just track the work needed to farm off to the GPU
                gpu_groups.append(sg)
                gpu_list.append(i)

    if use_cuda:  # if using the GPU, perform the work
        matrix_view = np.copy(matrix).astype(np.float32).flatten()
        all_permutations = generate_all_unique_combinations(window_size * group_width, group_width)
        num_permutations = len(all_permutations)
        permutation_view = np.copy(np.asarray(all_permutations)).astype(np.uint32).flatten()
        stripe_groups_view = np.asarray(gpu_groups).astype(np.uint32).flatten()
        num_gpu_groups = len(gpu_list)
        gpu_improvement = np.zeros((num_gpu_groups), dtype=np.float32).flatten()
        gpu_permutation = np.zeros((num_gpu_groups), dtype=np.uint32).flatten()

        result = permutation_search_cuda_kernels.build_permute_map(
            matrix_view,
            matrix.shape[0],
            matrix.shape[1],
            stripe_groups_view,
            num_gpu_groups,
            window_size,
            permutation_view,
            window_size * group_width,
            gpu_improvement,
            gpu_permutation,
        )

        # put the data where python expects it
        for i in range(len(gpu_list)):
            stripe_map[gpu_list[i]] = gpu_improvement[i]
            perm_map[gpu_list[i]] = all_permutations[gpu_permutation[i]]

    return stripe_map, stripe_ids, perm_map


# start performing stripe checks
sm_perturbations = 0
sm_perturbation_limit = 0


def use_stripe_map(matrix, group_width, stripe_map, stripe_ids, perm_map, permutation):
    global sm_perturbations, sm_perturbation_limit
    used_stripes = []
    stripe_groups_optimized = 0
    improvement = 0.0

    # set the traversal order
    ix = np.flip(np.argsort(stripe_map))  # small to large --> large to small

    for i in range(len(ix)):
        stripe_group_id = ix[i]
        perm = perm_map[stripe_group_id].copy()

        if stripe_map[stripe_group_id] <= np.finfo(np.float16).tiny * 5.0:
            # perturbations
            if len(used_stripes) == 0 and sm_perturbations < sm_perturbation_limit:
                sm_perturbations += 1
                # use this permutation, but swap two channels from left/right halves to include two stripes, no matter the group size
                stripe_group_id = ix[np.random.randint(len(ix))]
                perm = perm_map[stripe_group_id].copy()
                # a little easier to escape from
                src = np.random.randint(int(len(perm) / 2))
                dst = int(len(perm) / 2) + np.random.randint(int(len(perm) / 2))
                perm[src], perm[dst] = perm[dst], perm[src]
            else:
                break

        stripe_group = stripe_ids[stripe_group_id]

        # don't work on stripes we've already touched
        touched_stripe = False
        for stripe in stripe_group:
            if stripe in used_stripes:
                touched_stripe = True
        if touched_stripe:
            continue

        # apply the permutation we've already found to this stripe group
        subset = collect_stripes(matrix, stripe_group, group_width)
        sub_result = subset[..., perm]
        permutation = apply_stripe_group_permutation(perm, stripe_group, group_width, permutation)

        # scatter the results, track what changed
        for s, stripe in enumerate(stripe_group):
            # see if this group is in canonical form (entry 0 a multiple of 4, contiguous values))
            group = perm[
                s * group_width : s * group_width + group_width
            ]  # columns in this group of the used permutation
            changed = False
            if group[0] % 4 != 0:
                changed = True
            for c in range(1, group_width):
                if group[c] != group[c - 1] + 1:
                    changed = True
                    break
            # if it's not, then it changed
            if changed:
                used_stripes.append(stripe_group[s])

            matrix[..., stripe * group_width : stripe * group_width + group_width] = sub_result[
                ..., s * group_width : s * group_width + group_width
            ]

        improvement += stripe_map[stripe_group_id]
        stripe_groups_optimized += 1

    return (
        matrix,
        stripe_groups_optimized,
        stripe_map,
        stripe_ids,
        used_stripes,
        improvement,
        permutation,
    )


# entry point for exhaustive searches - both the entire matrix, as well as stripe groups
def Exhaustive_Search(matrix, stripe_group_size=-1, escape_attempts=0, permutation=None):
    global sm_perturbation_limit, sm_perturbations
    sm_perturbations = 0
    sm_perturbation_limit = escape_attempts
    if permutation is None:
        permutation = [c for c in range(matrix.shape[1])]

    # It is much safer to reset the stripe_set as None in the entry point of Exhaustive_Search
    global stripe_set, stripe_set_config
    stripe_set = None
    stripe_set_config = None

    # only support N:4 for now
    group_width = 4

    result = np.copy(matrix)

    # if the matrix is too large for a window size of 12, subdivide, then fix up with a global optimization with a window size of 8
    if group_width == 4 and stripe_group_size == 12 and matrix.shape[1] > 512:
        stripe_split = int(matrix.shape[1] / 2 / group_width)
        col_split = stripe_split * group_width
        result[:, :col_split], durationL, permutation[:col_split] = Exhaustive_Search(
            result[:, :col_split],
            stripe_group_size=stripe_group_size,
            escape_attempts=escape_attempts,
            permutation=permutation[:col_split],
        )
        result[:, col_split:], durationR, permutation[col_split:] = Exhaustive_Search(
            result[:, col_split:],
            stripe_group_size=stripe_group_size,
            escape_attempts=escape_attempts,
            permutation=permutation[col_split:],
        )
        escape_attempts = max(escape_attempts, 100) * 10
        result, duration, permutation = Exhaustive_Search(
            result,
            stripe_group_size=8,
            escape_attempts=escape_attempts,
            permutation=permutation,
        )
        return result, durationL + durationR + duration, permutation

    # small enough to optimize the entire matrix at once
    if stripe_group_size != -1 and stripe_group_size < matrix.shape[1]:
        stripe_map = []
        stripe_ids = []
        perm_map = []
        used_stripes = []

        # in practice, this work will be cached ahead of time; doing it now.
        # (Reading the cached list from disk can take several seconds, which shouldn't be counted against the search, but amortized over every layer in a network)
        generate_all_unique_combinations(stripe_group_size, group_width)

        start_time = time.perf_counter()

        while True:
            # print("[Debug][Exhaustive_Search] Before entering the build_stripe_map function.")
            # print("[Debug][Exhaustive_Search] Now the stripe_set value is: {}".format(stripe_set))
            stripe_map, stripe_ids, perm_map = build_stripe_map(
                result,
                group_width,
                stripe_group_size,
                stripe_map,
                stripe_ids,
                perm_map,
                used_stripes,
            )
            (
                result,
                stripe_groups_optimized,
                stripe_map,
                stripe_ids,
                used_stripes,
                improvement,
                permutation,
            ) = use_stripe_map(result, group_width, stripe_map, stripe_ids, perm_map, permutation)

            # converged?
            if len(used_stripes) == 0:
                break

        duration = time.perf_counter() - start_time

    else:  # no sliding window, single iteration
        print(
            f"Matrix has {matrix.shape[1]} columns and the search window is only {stripe_group_size}: searching exhaustively"
        )
        result, duration, permutation, improvement = search_matrix(matrix, group_width)

    return result, duration, permutation


================================================
FILE: apex/contrib/sparsity/permutation_search_kernels/permutation_utilities.py
================================================
import numpy as np
import subprocess
import math

gpus_tested = False
gpus_found = 0
kernels_found = True
try:
    import permutation_search_cuda as permutation_search_cuda_kernels

    print("Found permutation search CUDA kernels")
except ImportError:
    try:
        from . import permutation_search_cuda as permutation_search_cuda_kernels

        print("Found permutation search CUDA kernels for standalone testing")

    except ImportError:
        print("Could not find permutation search CUDA kernels, falling back to CPU path")
        kernels_found = False


def use_gpu(initial_override=True):
    global gpus_tested, gpus_found, kernels_found
    if not gpus_tested:
        if not initial_override:
            gpus_tested = True
            return False

        try:
            gpus_found = str(subprocess.check_output(["nvidia-smi", "-L"])).count("UUID")
            print(f"Found {gpus_found} gpus")
        except:
            gpus_found = 0
            print("Could not find nvidia-smi, please check your cuda installation")

        gpus_tested = True

    return gpus_found > 0 and kernels_found


##############################################################################################
# pruning utilities
##############################################################################################
## apply 2:4 to some matrix
def apply_2_to_4(matrix):
    for row in range(matrix.shape[0]):
        for col in range(0, matrix.shape[1], 4):
            ix = np.argsort(np.abs(matrix[row, col : col + 4]))
            matrix[row, col + ix[0]] = 0.0
            matrix[row, col + ix[1]] = 0.0
    return matrix


## find the sum of magnitudes if 2:4 were applied to a matrix
def sum_after_2_to_4(matrix):
    cur_sum = 0.0
    use_cuda = use_gpu()
    if not use_cuda:
        for row in range(matrix.shape[0]):
            for col in range(0, matrix.shape[1], 4):
                ix = np.argsort(np.abs(matrix[row, col : col + 4]))
                cur_sum += abs(matrix[row, col + ix[2]])
                cur_sum += abs(matrix[row, col + ix[3]])
    else:
        matrix = matrix.astype(np.float32)
        cuda_sum = np.zeros((1), dtype=np.float32)
        matrix_view = np.copy(matrix).flatten()
        sum_view = cuda_sum.flatten()
        blocks = max(int(matrix.shape[1] / 4 / 2), 1)
        threads = min(max(math.ceil(matrix.shape[0] / 4), 1), 1024)
        result = permutation_search_cuda_kernels.sum_after_2_to_4(
            matrix_view,
            matrix.shape[0],
            matrix.shape[1],
            0,
            matrix.shape[1],
            blocks,
            threads,
            sum_view,
        )
        cur_sum = sum_view[0]
    return cur_sum


# perform unstructured pruning on some matrix
def unstructured_prune(matrix, sparsity):
    shp = matrix.shape
    matrix = matrix.flatten()
    ix = np.argsort(matrix)
    ix = ix[: int(len(ix) * sparsity)]
    matrix[ix] = 0.0
    matrix = np.reshape(matrix, shp)
    return matrix


## try swapping columns and tracking magnitude after pruning
def try_swap(matrix, dst, src):
    src_base = sum_after_2_to_4(matrix[..., int(src / 4) * 4 : int(src / 4) * 4 + 4])
    dst_base = sum_after_2_to_4(matrix[..., int(dst / 4) * 4 : int(dst / 4) * 4 + 4])

    # swap
    matrix[..., [src, dst]] = matrix[..., [dst, src]]

    # check the Nx4 slices of the swapped columns
    src_sum = sum_after_2_to_4(matrix[..., int(src / 4) * 4 : int(src / 4) * 4 + 4])
    dst_sum = sum_after_2_to_4(matrix[..., int(dst / 4) * 4 : int(dst / 4) * 4 + 4])

    # swap back
    matrix[..., [src, dst]] = matrix[..., [dst, src]]

    return src_sum + dst_sum, (src_sum + dst_sum) - (src_base + dst_base)


## magnitude improvement from the naive 2:4 matrix / how much was lost by naive 2:4 compared to the optimal
def efficacy(optimal_lost_magnitude, base_lost_magnitude, cur_lost_magnitude):
    if base_lost_magnitude == optimal_lost_magnitude:
        eff = 1.0
    else:
        eff = (base_lost_magnitude - cur_lost_magnitude) / (
            base_lost_magnitude - optimal_lost_magnitude
        )
    return eff


## find the magnitude if the rows of a matrix were pruned independently, without structure
def magnitude_after_pruning_rows(matrix, rate=0.5):
    magnitude = 0.0
    cols = matrix.shape[1]
    for r in range(matrix.shape[0]):
        rowVals = matrix[r]
        rowVals = np.sort(np.abs(rowVals))
        magnitude += np.sum(rowVals[int(cols * rate) :])

    return magnitude


##############################################################################################
# permutation utilities
##############################################################################################


## exhaustively search an entire matrix on the GPU
def try_permutations_on_matrix(matrix, permutations):
    use_cuda = use_gpu()
    assert use_cuda  # caller should have checked
    matrix = np.copy(matrix)
    matrix = matrix.astype(np.float32)
    matrix_view = np.copy(matrix).flatten()
    permutations_view = np.copy(np.asarray(permutations)).astype(np.uint32).flatten()

    stripe_groups = np.asarray([[s for s in range(int(matrix.shape[1] / 4))]]).astype(np.uint32)
    stripe_groups_view = stripe_groups.flatten()

    improvement = np.zeros((1), dtype=np.float32).flatten()
    permutation = np.zeros((1), dtype=np.uint32).flatten()

    result = permutation_search_cuda_kernels.check_permutations(
        matrix_view,
        matrix.shape[0],
        matrix.shape[1],
        stripe_groups_view,
        len(stripe_groups[0]),
        len(stripe_groups),
        permutations_view,
        len(permutations),
        improvement,
        permutation,
    )
    return improvement[0], permutations[permutation[0]]


## find the permutation needed to make matrix A look like matrix B
def find_permutation(A, B):
    permutation = []
    for col in range(A.shape[1]):
        Avals = A[..., col]
        for bcol in range(B.shape[1]):
            if np.all(Avals - B[..., bcol] == np.zeros(Avals.shape)):
                permutation.append(bcol)
                break
    return permutation


########################################
# reasonable method to find distance between permutations
# this is used to generate permutations "between" two other permutations to divide efficacy space
#######################################


## separate a flat permutation array into its groups, sort each group and the overall order to
## put the output into a canonical order: if two permutations have the same groups, they should appear identical
def make_grouped(A):
    groups = []
    for x in range(0, len(A), 4):
        group = []
        for c in range(4):
            group.append(A[x + c])
        group = np.sort(group)

        groups.append(group)
    return groups


## given two permutations, find the groups they have in common
def common_groups(A, B):
    Ag = make_grouped(A)
    Bg = make_grouped(B)

    # convert to sets to take the intersection
    As = set(tuple(Ag[g]) for g in range(len(Ag)))
    Bs = set(tuple(Bg[g]) for g in range(len(Bg)))
    common = As.intersection(Bs)

    # flatten
    C = []
    for s in common:
        for v in s:
            C.append(v)

    # group
    return make_grouped(C)


## given two permutations, remove the groups that are common between them
def remove_common_groups(A, B):
    Ag = make_grouped(A)
    Bg = make_grouped(B)

    # convert to sets to take set difference
    As = set(tuple(Ag[g]) for g in range(len(Ag)))
    Bs = set(tuple(Bg[g]) for g in range(len(Bg)))
    Ad = As - Bs
    Bd = Bs - As

    # turn the differences back into flat arrays
    A = []
    for s in Ad:
        for v in s:
            A.append(v)
    B = []
    for s in Bd:
        for v in s:
            B.append(v)

    # group to put into canonical order, re-flatten
    A = make_grouped(A)
    B = make_grouped(B)
    A = [item for sublist in A for item in sublist]
    B = [item for sublist in B for item in sublist]

    return A, B


## given two permutations, find which elements in B need to go where to look like A
def group_differences(A, B):
    Ag = make_grouped(A)
    Bg = make_grouped(B)

    wrong_entries = []
    # for g,group in enumerate(Bg):
    for g in range(len(Bg)):
        group = Bg[g]
        for i in range(len(group)):
            val = group[i]
            if val not in Ag[g]:
                group_in_a = int(np.where(A == val)[0][0] / 4)
                wrong_entries.append((val, g, group_in_a))

    return wrong_entries


## (val, cur_group, desired_group) ==> dict[(cur_group, desired_group)] = [vals]
def dictify(wrong_entries):
    result = {}
    for entry in wrong_entries:
        key = (entry[1], entry[2])
        if key in result:
            result[key].append(entry[0])
        else:
            result[key] = [entry[0]]
    return result


## move groups of B to where they best match A's groups
def move_groups_to_match(B, A, debug=False):
    Ag = make_grouped(A)
    Bg = make_grouped(B)

    new_Bg = [[] for g in range(len(Ag))]
    wrong_entry_dict = dictify(group_differences(A, B))

    if debug:
        print(f"MGTM:\n\tAg: {Ag}\n\tBg: {Bg}\n\tWED: {wrong_entry_dict}")

    moved_groups = []

    keys_to_del = []
    # move triples to the right spot
    for k in wrong_entry_dict.keys():
        if k[0] in moved_groups:
            keys_to_del.append(k)
            continue

        if len(wrong_entry_dict[k]) == 3:
            new_Bg[k[1]] = Bg[k[0]]
            moved_groups.append(k[0])
            keys_to_del.append(k)
            if debug:
                print(f"MGTM: moved triple {wrong_entry_dict[k]} from group {k[0]} to group {k[1]}")

    for k in keys_to_del:
        del wrong_entry_dict[k]
    keys_to_del = []

    # move doubles
    for k in wrong_entry_dict.keys():
        # if we've already moved the group to which this key belongs, remove it
        if k[0] in moved_groups:
            keys_to_del.append(k)
            continue

        if len(wrong_entry_dict[k]) == 2:
            if len(new_Bg[k[1]]) == 0:  # move it to its requested destination if possible
                new_Bg[k[1]] = Bg[k[0]]
                keys_to_del.append(k)
                assert k[0] not in moved_groups
                moved_groups.append(k[0])
                if debug:
                    print(
                        f"MGTM: moved double {wrong_entry_dict[k]} from group {k[0]} to its preferred group {k[1]}"
                    )
            elif len(new_Bg[k[0]]) == 0:  # otherwise leave it where it is (if possible)
                new_Bg[k[0]] = Bg[k[0]]
                keys_to_del.append(k)
                assert k[0] not in moved_groups
                moved_groups.append(k[0])
                if debug:
                    print(f"MGTM: left double {wrong_entry_dict[k]} where it was in group {k[0]}")
    for k in keys_to_del:
        del wrong_entry_dict[k]
    keys_to_del = []

    # move singles
    # try to leave things where they are to prevent oscillating
    for k in wrong_entry_dict.keys():
        if k[0] in moved_groups:
            keys_to_del.append(k)
            continue

        if len(new_Bg[k[1]]) == 0:  # requested destination
            new_Bg[k[1]] = Bg[k[0]]
            keys_to_del.append(k)
            assert k[0] not in moved_groups
            moved_groups.append(k[0])
            if debug:
                print(
                    f"MGTM: moved single {wrong_entry_dict[k]} from group {k[0]} to its preferred group {k[1]}"
                )

        elif len(new_Bg[k[0]]) == 0:
            new_Bg[k[0]] = Bg[k[0]]
            keys_to_del.append(k)
            assert k[0] not in moved_groups
            moved_groups.append(k[0])
            if debug:
                print(f"MGTM: left group {wrong_entry_dict[k]} where it was in group {k[0]}")

    for k in keys_to_del:
        del wrong_entry_dict[k]
    keys_to_del = []

    # put what's left where it'll fit
    for k in wrong_entry_dict.keys():
        if k[0] in moved_groups:
            keys_to_del.append(k)
            continue

        for dst in range(len(new_Bg)):
            if len(new_Bg[dst]) == 0:
                new_Bg[dst] = Bg[k[0]]
                keys_to_del.append(k)
                assert k[0] not in moved_groups
                moved_groups.append(k[0])
                if debug:
                    print(
                        f"MGTM: put group {wrong_entry_dict[k]} where it found a spot in group {dst}"
                    )
                break

    for k in keys_to_del:
        del wrong_entry_dict[k]
    keys_to_del = []

    assert len(wrong_entry_dict) == 0
    Agsize = sum([len(group) for group in Ag])
    Bgsize = sum([len(group) for group in new_Bg])
    assert Agsize == Bgsize
    new_B = [item for sublist in new_Bg for item in sublist]
    return new_B


## swap two permutation entries and put the permutation into unique order
def swap_and_correct(permutation, src, tgt):
    permutation[src], permutation[tgt] = permutation[tgt], permutation[src]
    grouped = make_grouped(permutation)
    grouped = [item for sublist in grouped for item in sublist]
    return grouped


## make a swap that will move B in the direction of A
num_diffs = 0


def move_permutation_towards(B, A, debug=False):
    global num_diffs
    B = move_groups_to_match(B, A, debug)
    wrong_entries = group_differences(A, B)
    num_diffs = len(wrong_entries)

    # nothing to do, early out
    if len(wrong_entries) == 0:
        if debug:
            print("MPT: early out")
        return B

    if debug:
        print(f"MPT: checking {len(wrong_entries)} diffs: {wrong_entries}")

    # look for a group of three wrong entries that want to do the same thing
    entry_dict = dictify(wrong_entries)
    for k in entry_dict.keys():
        entry = entry_dict[k]
        if len(entry) == 3:
            if debug:
                print(f"MPT: found a triple swap at {k}: {entry_dict[k]}")
            (src, dst) = k
            # find the index of the one needed to complete the group
            # the value is the value in A[dst] that's not in B[src]
            # it's already in the destination group and may or may not need to move
            group_id = dst
            Ag = make_grouped(np.copy(A))
            Bg = make_grouped(np.copy(B))
            value = -1
            for c in range(4):
                if Ag[dst][c] not in Bg[src]:
                    value = Ag[dst][c]
                    if debug:
                        print(f"\tMPT: found the missing value {value} in A group {dst} offset {c}")
                    break
            assert value != -1

            # now find that value in B
            idx0 = np.where(B == value)[0][0]
            # find the index of the one this group doesn't need
            # it's a member of the group but not in the dict entry
            group_id = src
            for c in range(4):
                if B[group_id * 4 + c] not in entry_dict[k]:
                    if debug:
                        print(f"\tMPT: swapping {idx0} and {group_id * 4 + c}")
                    return swap_and_correct(B, idx0, group_id * 4 + c)

    # look for a group of two entries that are heading to the same place as another wrong entry
    victim_loner_pair = None
    for k in entry_dict.keys():
        entry = entry_dict[k]
        if len(entry) == 2:
            if debug:
                print(f"MPT: found a double swap at {k}: {entry_dict[k]}")
            (src, dst) = k
            # find a wrong entry whose dst is the same
            for k2 in entry_dict.keys():
                if k2 == k:
                    continue

                # k2 is a key whose value also belongs in stripe k2[1] (dst2)
                if dst == k2[1]:
                    if debug:
                        print(
                            f"\tMPT: found a loner going in the same direction at {k2}: {entry_dict[k2][0]}"
                        )
                    # instead of moving these three to where they're headed, start merging them by moving the loner into the double

                    # look for a complement: something moving from src to src2
                    (src2, dst2) = k2
                    complement_key = (src, src2)
                    if complement_key in entry_dict:
                        complement = entry_dict[complement_key][0]
                        if debug:
                            print(f"\t\tMPT: found a complement to the loner:{complement}")
                        return swap_and_correct(
                            B,
                            np.where(B == entry_dict[k2][0])[0][0],
                            np.where(B == complement)[0][0],
                        )
                    # didn't find a complement, choose one of the two in the src group that don't belong
                    elif victim_loner_pair is None:
                        for k3 in entry_dict.keys():
                            if k3 == k:
                                continue

                            if k3[0] == src:  # found the victim
                                victim = entry_dict[k3][0]
                                if debug:
                                    print(
                                        f"\t\tMPT: found a victim for the double swap:{k3} -> {victim}"
                                    )
                                victim_loner_pair = (victim, entry_dict[k2][0])
                                # return swap_and_correct(B, np.where(B == entry_dict[k2][0])[0][0], np.where(B == victim)[0][0])

    if victim_loner_pair is not None:
        if debug:
            print(
                f"\t\tMPT: couldn't find any complements for double swaps, so going with a loner to make a triple: {victim_loner_pair}"
            )
        return swap_and_correct(
            B,
            np.where(B == victim_loner_pair[0])[0][0],
            np.where(B == victim_loner_pair[1])[0][0],
        )

    # look for one swap that will correct two entries
    candidate_second = None
    for we in range(len(wrong_entries)):
        cur_entry = wrong_entries[we]
        # if debug:
        #    print(f"\tMPT: checking {cur_entry} for complement")
        for we2 in range(0, len(wrong_entries)):
            pos_swap = wrong_entries[we2]
            # if debug:
            #    print(f"\t\tMPT: is {pos_swap}?")
            if cur_entry[1] == pos_swap[2] and cur_entry[2] == pos_swap[1]:
                if debug:
                    print(f"\t\tfound complements: swapping {cur_entry} and {pos_swap}")
                return swap_and_correct(
                    B,
                    np.where(B == cur_entry[0])[0][0],
                    np.where(B == pos_swap[0])[0][0],
                )
            elif (
                wrong_entries[0][2] == pos_swap[1]
            ):  # if pos_swap is currently where we[0] wants to go, keep it in mind
                candidate_second = pos_swap

    # fall back on picking the first one we come across
    assert candidate_second is not None
    if debug:
        print(f"No complement, swapping two entries: {wrong_entries[0]} {candidate_second}")
    return swap_and_correct(
        B,
        np.where(B == wrong_entries[0][0])[0][0],
        np.where(B == candidate_second[0])[0][0],
    )


## find a shortest path from permutation A to B
def permutation_distance(A, B, matrix=None, magnitude_targets=None, debug=False, verbosity=0):
    global num_diffs
    swaps = 0
    debug = False

    swap_limit = int(math.pow(2, int(len(A) / 4) - 1))
    num_diffs = swap_limit
    common = []
    target_results = None
    if magnitude_targets is not None:
        assert matrix is not None
        cur_mag = sum_after_2_to_4(matrix[:, A])
        target_results = [(cur_mag, A) for i in range(len(magnitude_targets))]

    if verbosity > 0 and matrix is not None:
        print(f"swap {'0':>4} {sum_after_2_to_4(matrix[:, B]):>15.3f}")
        if verbosity > 5:
            print(f"swap {0:>4}, {make_grouped(A)} {make_grouped(B)}")

    while not np.all(np.array(A) - np.array(B) == np.zeros(np.array(A).shape)):
        cGroups = common_groups(A, B)
        for g in cGroups:
            common.append(g)
        A, B = remove_common_groups(A, B)
        if len(A) == 0:
            break

        B = move_permutation_towards(np.array(B), np.array(A), debug=debug)
        swaps += 1

        if matrix is not None:
            total_cur_permute = [c for c in B]

            for c in [item for sublist in common for item in sublist]:
                total_cur_permute.append(c)

            if verbosity > 0 or magnitude_targets is not None:
                cur_mag = sum_after_2_to_4(matrix[:, total_cur_permute])
                for i in range(len(target_results)):
                    result = target_results[i]
                    if abs(magnitude_targets[i] - result[0]) > abs(magnitude_targets[i] - cur_mag):
                        target_results[i] = (cur_mag, total_cur_permute)
                if verbosity > 0:
                    print(f"swap {swaps:>4} {cur_mag:>15.3f}")

        if verbosity > 5 or swaps > swap_limit:
            print(f"swap {swaps:>4}, {A} {B}, {num_diffs} diffs remain")

        # safety net
        if swaps > swap_limit + 3:
            return swaps, target_results

    return swaps, target_results


================================================
FILE: apex/contrib/sparsity/permutation_tests/README.md
================================================
# ChannelPermutations

Standalone code to reproduce results in "[Channel Permutations for N:M Sparsity](https://proceedings.neurips.cc/paper/2021/hash/6e8404c3b93a9527c8db241a1846599a-Abstract.html)," Jeff Pool and Chong Yu, NeurIPS 2021.

Three search strategies are supported: randomly generating permutations and checking quality, greedily swapping columns until convergence (i.e. TETRIS adapted for 2:4 sparsity), and the technique presented in the above paper, optimizing stripe groups.  This tool will apply these strategies, as configured below, to either a randomly-generated matrix or an .npy file (typically from a real network) and report the efficacy and runtime of the strategy.

## Quick Start

### Installation

#### GPU path

Requirements:
- CUDA
- pybind11

A container such as `nvcr.io/nvidia/pytorch:21.12-py3` satisfies these requirements.

Installation (from this directory):
```
pushd ../permutation_search_kernels/CUDA_kernels
nvcc -O3 -shared -Xcompiler -fPIC -Xcompiler -DTORCH_EXTENSION_NAME=permutation_search_cuda -std=c++11 $(python3 -m pybind11 --includes) permutation_search_kernels.cu -o ../permutation_search_cuda$(python3-config --extension-suffix)
popd
```

#### CPU path

Only NumPy is required for CPU-only execution.

### Important arguments

`python3 permutation_test.py` will tell you all the available arguments and alert you about required arguments:
```
    usage: permutation_test.py [-h] [--infile INFILE] [--channels CHANNELS] [--filters FILTERS] 
                               [--verbosity VERBOSITY] [--seed SEED] [--pretty_print PRETTY_PRINT] 
                               [--unstructured UNSTRUCTURED] [--gpu GPU] [--check_permutation CHECK_PERMUTATION] 
                               [--intermediate_steps INTERMEDIATE_STEPS] [--print_permutation PRINT_PERMUTATION]
                               strategy [strategy ...]
    permutation_test.py: error: the following arguments are required: strategy
```

Detailed information about each argument:

- `--infile` (string) accepts .npy files with weights dumped from some model checkpoint.  By default, the input file is `'random'`, which will generate a random 2D matrix with `CHANNELS` columns and `FILTERS` rows.
- `--channels` and `--filters` (unsigned integers) specify the size of the randomly-generated matrix if there is no input file specified.
- `--verbosity` (unsigned integer) controls the amount of debug and status information printed.  `0` is just the important data, `11` can give periodic status details, and higher integers provide increasingly more detail.
- `--seed` (unsigned integer) allows for changing the random seed, which will affect the random matrix generation, random permutations generated, and columns swapped for bounded regressions.
- `--pretty_print` (bool) prints a pretty graph by default (below), but disabling will generate output friendly for redirecting to a .csv file.
- `--unstructured` (float) will apply unstructured pruning to the matrix before searching for permutations.  A negative value will find the minimum unstructured sparsity for which a search strategy can find a perfect permutation and not create any extra zeros.
- `--gpu` (bool) uses CUDA kernels by default (if they are built and there is a GPU available), but you can override this to run on the CPU.
- `--check_permutation` (bool) makes sure the permutation tracked during the search process matches the one that's recovered directly from the permuted matrix.
- `--intermediate_steps` (unsigned integer) will emit permutations with efficacies equally dividing the distance between the default order and the best permutation found.
- `--print_permutation` (bool) prints the permutation found for each strategy.

Finally, after these optional arguments, provide the search strategies desired.  There are three strategies offered:
- `random,<num_seeds=10>`
- `channel_swaps,<bounded_regressions=100>`
- `optimize_stripe_groups,<stripe_group_size_in_columns=8>,<bounded_regressions=100>`

### Launch a test with interesting search strategies

Now that kernels are built, you can use them to accelerate the search, which can be quite time-consuming without using the GPU.  Below, we report results on a number of interesting strategies for a 64-column, 128-row random matrix using a V100 accelerator.

    $ python3 permutation_test.py --channels 64 --filters 128 channel_swap,0 channel_swap,100 channel_swap,1000 optimize_stripe_groups,8,0 optimize_stripe_groups,8,100 optimize_stripe_groups,8,1000 optimize_stripe_groups,12,0 random,1000 random,10000 random,100000
    Found permutation search CUDA kernels for standalone testing
    Found 2 gpus
    strategy                           ,      magnitude,       efficacy,       duration
    unpruned                           ,       4083.169,       -       ,       -       
    unstructured                       ,       3060.238,       -       ,       -       
    50% rows                           ,       3042.332,          100.0,       -       
    default 2:4                        ,       2852.376,            0.0,          0.000
    channel_swap,0                     ,       2913.352,           32.1,          0.214               
    channel_swap,100                   ,       2914.174,           32.5,          2.249               
    channel_swap,1000                  ,       2920.694,           36.0,         20.248               
    optimize_stripe_groups,8,0         ,       2919.757,           35.5,          0.013               
    optimize_stripe_groups,8,100       ,       2919.758,           35.5,          0.152               
    optimize_stripe_groups,8,1000      ,       2919.935,           35.6,          1.387               
    optimize_stripe_groups,12,0        ,       2921.947,           36.6,          0.860               
    random,1000                        ,       2873.380,           11.1,          0.116               
    random,10000                       ,       2873.603,           11.2,          1.149               
    random,100000                      ,       2879.129,           14.1,         11.510   

For this particular input, the `channel_swap` strategy requires 1000 bounded regressions in order to surpass the efficacy of optimizing two stripe groups (8 columns) without any bounded regressions, but allowing 1000 bounded regressions when optimizing two stripe groups is slightly worse than swapping channels with 1000 bounded regressions.  Optimizing *three* stripe groups at a time outperforms all the other approaches by a wide margin.  Testing many random permutations is inefficient and ineffective.

Without GPU acceleration, these tests would be much slower (though they find the same final permutations):

    $ python3 permutation_test.py --gpu 0 --channels 64 --filters 128 channel_swap,0 channel_swap,100 optimize_stripe_groups,8,0 optimize_stripe_groups,8,100 random,1000
    strategy                           ,      magnitude,       efficacy,       duration
    unpruned                           ,       4083.169,       -       ,       -       
    unstructured                       ,       3060.238,       -       ,       -       
    50% rows                           ,       3042.332,          100.0,       -       
    default 2:4                        ,       2852.377,            0.0,          0.016
    channel_swap,0                     ,       2913.351,           32.1,         55.972
    channel_swap,100                   ,       2914.174,           32.5,        450.025
    optimize_stripe_groups,8,0         ,       2919.759,           35.5,         60.653
    optimize_stripe_groups,8,100       ,       2919.759,           35.5,        465.709
    random,1000                        ,       2873.381,           11.1,         14.889


### Perform the ablation study from Table 1

`bash ablation_studies.sh` will generate the results for the ablation study, showing the relative importance of the bounded regressions and stripe group greedy phase.

### Generate the runtime results from Table 3

`bash runtime_table.sh` will generate the search strategies' efficacies and runtime shown in Table 3.

### Traverse permutation space (as in Figure 3)

We developed a heuristic approach to interpolating between permutations which allows us to find permutations with efficacies that evenly divide some range.  The `--intermediate_steps <N>` argument can be used to emit such a sequence of permutations:

    $ python3 permutation_test.py --channels 64 --filters 128 --intermediate_steps 7 --print_permutation 1 optimize_stripe_groups,8,0
    Found permutation search CUDA kernels for standalone testing
    Found 2 gpus
    strategy                           ,      magnitude,       efficacy,       duration
    unpruned                           ,       4083.169,       -       ,       -
    unstructured                       ,       3060.238,       -       ,       -
    50% rows                           ,       3042.332,          100.0,       -
    default 2:4                        ,       2852.377,            0.0,          0.000
    (2859.8855, [2, 8, 14, 24, 9, 12, 13, 15, 4, 5, 6, 7, 0, 1, 3, 46, 40, 41, 42, 43, 32, 33, 34, 35, 25, 26, 27, 55, 16, 17, 18, 58, 20, 21, 22, 23, 38, 60, 61, 63, 11, 44, 45, 47, 36, 37, 39, 62, 10, 28, 29, 30, 31, 52, 53, 54, 19, 56, 57, 59, 48, 49, 50, 51])
    (2870.1387, [5, 6, 7, 41, 9, 12, 13, 35, 0, 1, 3, 46, 30, 40, 42, 43, 2, 32, 33, 34, 25, 26, 27, 55, 16, 17, 18, 58, 20, 21, 22, 23, 38, 60, 61, 63, 11, 44, 45, 47, 36, 37, 39, 62, 4, 10, 28, 29, 31, 52, 53, 54, 19, 56, 57, 59, 15, 48, 49, 50, 8, 14, 24, 51])
    (2878.0679, [36, 37, 39, 62, 9, 12, 13, 35, 0, 3, 16, 46, 30, 40, 42, 43, 2, 5, 32, 33, 23, 26, 27, 55, 1, 20, 21, 22, 38, 60, 61, 63, 11, 44, 45, 47, 6, 7, 25, 41, 4, 10, 28, 29, 31, 52, 53, 54, 19, 56, 57, 59, 15, 48, 49, 50, 8, 14, 24, 51, 17, 18, 34, 58])
    (2884.8323, [9, 12, 35, 54, 0, 3, 16, 46, 30, 40, 42, 43, 2, 5, 32, 33, 23, 26, 27, 55, 11, 44, 45, 47, 36, 37, 39, 62, 4, 10, 28, 29, 31, 52, 53, 60, 19, 21, 56, 57, 15, 48, 49, 50, 8, 14, 24, 51, 17, 18, 34, 58, 6, 7, 25, 41, 1, 13, 20, 22, 38, 59, 61, 63])
    (2894.9697, [9, 12, 33, 35, 0, 3, 16, 46, 2, 5, 32, 52, 23, 26, 27, 55, 11, 44, 45, 47, 36, 37, 39, 62, 4, 10, 28, 29, 19, 21, 50, 56, 15, 43, 48, 49, 8, 14, 24, 51, 17, 18, 34, 58, 6, 7, 25, 41, 1, 13, 20, 22, 38, 59, 61, 63, 30, 40, 42, 54, 31, 53, 57, 60])
    (2901.5115, [9, 12, 35, 56, 0, 3, 16, 46, 23, 26, 27, 55, 33, 36, 37, 39, 4, 10, 28, 29, 19, 21, 45, 50, 8, 14, 24, 51, 17, 18, 34, 58, 6, 7, 25, 41, 1, 13, 20, 22, 38, 59, 61, 63, 30, 40, 42, 54, 31, 53, 57, 60, 2, 5, 32, 52, 15, 43, 49, 62, 11, 44, 47, 48])
    (2910.2043, [4, 10, 28, 37, 9, 12, 35, 56, 0, 3, 16, 46, 23, 33, 36, 39, 8, 14, 24, 51, 17, 18, 34, 58, 6, 7, 25, 41, 1, 13, 20, 22, 38, 59, 61, 63, 30, 40, 42, 54, 31, 53, 57, 60, 2, 5, 32, 52, 15, 43, 49, 62, 11, 44, 47, 48, 19, 21, 45, 50, 26, 27, 29, 55])
    optimize_stripe_groups,8,0         ,       2919.757,           35.5,          0.015
    [0, 9, 12, 35, 4, 10, 28, 37, 50, 19, 45, 21, 34, 17, 18, 58, 16, 46, 39, 3, 49, 43, 15, 62, 6, 7, 41, 25, 48, 11, 44, 47, 13, 20, 22, 1, 55, 29, 26, 27, 5, 2, 32, 52, 40, 30, 42, 54, 53, 57, 60, 31, 36, 56, 23, 33, 59, 38, 61, 63, 51, 24, 14, 8]

### Transform unstructured sparsity to structured sparsity (as in Figure 4)

If you have a directory with .npy weight files for each layer of a network, `bash unstructured_study.sh <path_to_directory> <network_name>` will perform a binary search for each file to find the minimum unstructured sparsity required to transparently transform that layer with a number of permutation search techniques; this file was used to generate Figure 4, using weights dumped from a pre-trained ResNet50 in Torchvision.

## References

The baseline algorithm which we adapated for use with 2:4 sparsity and upon which we improved is "[TETRIS](https://papers.nips.cc/paper/2018/hash/89885ff2c83a10305ee08bd507c1049c-Abstract.html): TilE-matching the TRemendous Irregular Sparsity," Ji et al., NeurIPS 2018.

If you want to use this technique when generating a 2:4 sparse network for inference, we've packaged it into our [ASP](https://github.com/NVIDIA/apex/tree/master/apex/contrib/sparsity) library - this will perform the permutation searches for each layer as required, as well as fix up neighboring layers so there are no extra operations inserted at runtime.

## Citation

If you use this idea or code in your own research, please cite the [paper](https://proceedings.neurips.cc/paper/2021/hash/6e8404c3b93a9527c8db241a1846599a-Abstract.html) that describes it:

```
@inproceedings{pool2021channel,
  author    = {Pool, Jeff and Yu, Chong},
  booktitle = {Advances in Neural Information Processing Systems ({NeurIPS})},
  title     = {Channel Permutations for {N:M} Sparsity},
  url       = {https://proceedings.neurips.cc/paper/2021/file/6e8404c3b93a9527c8db241a1846599a-Paper.pdf},
  volume    = {34},
  year      = {2021}
}

```


================================================
FILE: apex/contrib/sparsity/permutation_tests/ablation_studies.sh
================================================
#!/bin/bash

OUTDIR="results/ablation_logs"
mkdir -p $OUTDIR

R1000=random,1000
CS=channel_swap,0
CS_100=channel_swap,100
CS_1000=channel_swap,1000
OSG2=optimize_stripe_groups,8,0
OSG2_100=optimize_stripe_groups,8,100
OSG2_1000=optimize_stripe_groups,8,1000
OSG3=optimize_stripe_groups,12,0
OSG3_100=optimize_stripe_groups,12,100
OSG3_1000=optimize_stripe_groups,12,1000
optimal=optimize_stripe_groups,16,0

# Table 1
for seed in {0..24}; do
    echo $seed
    python3 permutation_test.py --channels 16 --filters 32 --seed $seed --pretty_print=False $R1000 $CS $CS_100 $CS_1000 $OSG2 $OSG2_100 $OSG2_1000 $OSG3 $OSG3_100 $OSG3_1000 $optimal | tee "${OUTDIR}/ablations_32x16_$seed.log"
    python3 permutation_test.py --channels 128 --filters 64 --seed $seed --pretty_print=False $R1000 $CS $CS_100 $CS_1000 $OSG2 $OSG2_100 $OSG2_1000 $OSG3 $OSG3_100 $OSG3_1000 | tee "${OUTDIR}/ablations_64x128_$seed.log"
done

echo "Gathering results ..."

################# collect results into a .csv file
# get mean and stddev of efficacy from all seeds for one strategy
get_mean_stddev() {
    local strategy=$1
    local OUTFILE=$2

    # get the strategy's line,                           pull out efficacy and time,              use sum-of-squares to compute stddev and mean in a single pass
    grep "$strategy," $OUTDIR/ablations_64x128_*.log | awk -F "," '{print $3,$4}' | awk '{sum += $1; sumsq += ($1)^2; timesum += $2} END {printf "%.1f,%.1f,%.2f,", sum/NR, sqrt((sumsq-sum^2/NR)/NR), timesum/NR}' >> $OUTFILE
}

# get the number of times some strategy matched the optimal solution
get_num_optimal() {
    local strategy=$1
    local OUTFILE=$2

    matches=0
    for seed in {0..24}; do
        # compare floats with epsilon: add one thousandth to the efficacy under test
        this_eff=$(grep "$strategy," "${OUTDIR}/ablations_32x16_${seed}.log" | awk -F "," '{print int($3 * 1000 + 1)}')
        best_eff=$(grep "optimize_stripe_groups_16_0," "${OUTDIR}/ablations_32x16_${seed}.log" | awk -F "," '{print int($3 * 1000)}')
        if [ "$this_eff" -ge "$best_eff" ]; then
            let "matches = $matches + 1"
        fi
    done

    printf "$matches," >> $OUTFILE
}

# populate a row of the ablation study table
populate_row() {
    local greedy=$1
    local escape=$2
    local strategy=$(echo "$3" | sed 's/,/_/g')
    local OUTFILE=$4

    printf "$greedy,$escape," >> $OUTFILE
    get_mean_stddev "$strategy" "$OUTFILE"
    printf "," >> $OUTFILE
    get_num_optimal "$strategy" "$OUTFILE"
    printf "\n" >> $OUTFILE
}

# prepare output file header
OUTFILE="results/ablation_studies.csv"
printf ",,25x 64x128,,,,25x 32x16\n" > $OUTFILE
printf ",,Efficacy,,Runtime,,Optimal\n" >> $OUTFILE
printf "Greedy Phase,Escape Phase,Mean,StdDev,Mean,,# Found\n" >> $OUTFILE

# finally, gather the data for each strategy into a row of the table
populate_row "Random 1000" "-" "$R1000" "$OUTFILE"
populate_row "Channel Swap" "-" "$CS" "$OUTFILE"
populate_row "Channel Swap" "BR(100)" "$CS_100" "$OUTFILE"
populate_row "Channel Swap" "BR(1000)" "$CS_1000" "$OUTFILE"
populate_row "OSG(2)" "-" "$OSG2" "$OUTFILE"
populate_row "OSG(2)" "BR(100)" "$OSG2_100" "$OUTFILE"
populate_row "OSG(2)" "BR(1000)" "$OSG2_1000" "$OUTFILE"
populate_row "OSG(3)" "-" "$OSG3" "$OUTFILE"
populate_row "OSG(3)" "BR(100)" "$OSG3_100" "$OUTFILE"
populate_row "OSG(3)" "BR(1000)" "$OSG3_1000" "$OUTFILE"

echo "Done! $OUTFILE"


================================================
FILE: apex/contrib/sparsity/permutation_tests/permutation_test.py
================================================
import numpy as np
import time
import sys

# permutation-specifics
sys.path.append("../")
from permutation_search_kernels.permutation_utilities import *
from permutation_search_kernels.exhaustive_search import Exhaustive_Search
from permutation_search_kernels.channel_swap import Channel_Swap

# Arguments
import argparse


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("Boolean value expected.")


parser = argparse.ArgumentParser(description="Test channel permutations")
parser.add_argument("--infile", default="random", type=str, help='input file or "random"')
parser.add_argument("--channels", default=384, type=int, help="random input channel count (C)")
parser.add_argument("--filters", default=96, type=int, help="random input filter count (K)")
parser.add_argument("--verbosity", default=0, type=int, help="print status updates")
parser.add_argument("--seed", default=1, type=int, help="random seed")
parser.add_argument(
    "--pretty_print",
    default=True,
    type=str2bool,
    help="print the table for pretty viewing (as opposed to strict .csv)",
)
parser.add_argument(
    "--unstructured",
    default=0.0,
    type=float,
    help='perform unstructured pruning to a target sparsity before processing, emulate an unstructured sparse network. "-1" will find the minimum sparsity required to achieve a perfect permutation',
)
parser.add_argument(
    "--gpu",
    default=True,
    type=str2bool,
    help="uses a gpu to accelerate the search if possible",
)
parser.add_argument(
    "--check_permutation",
    default=False,
    type=str2bool,
    help="check that the tracked permutation matches the recovered permutation",
)
parser.add_argument(
    "--intermediate_steps",
    default=0,
    type=int,
    help="find roughly evenly-spaced permutations in efficacy",
)
parser.add_argument(
    "--print_permutation",
    default=False,
    type=str2bool,
    help="print the final permutation found by each strategy",
)
parser.add_argument("strategies", metavar="strategy", type=str, nargs="+", help="strategies to try")


## binary search for the minimum sparsity necessary to achieve a perfect permutation with some strategy
def find_minimum_sparsity(matrix, search_function, **kwargs):
    duration = 0
    min_sparsity = 50
    max_sparsity = 100
    sparsity = 75
    verbosity = 0
    if "verbosity" in kwargs:
        verbosity = kwargs["verbosity"]

    while min_sparsity < max_sparsity:
        if verbosity > 5:
            print(f"\tlooking now at {sparsity} (between {min_sparsity} and {max_sparsity})")

        # prepare unstructured sparse matrix, get row sparsity magnitude
        tmp_result = unstructured_prune(result, sparsity / 100.0)
        local_unpruned_magnitude = np.sum(np.abs(tmp_result))
        local_unstructured_rows_magnitude = magnitude_after_pruning_rows(tmp_result, rate=0.5)

        # quick check to see if this sparsity is trivially too low
        if local_unstructured_rows_magnitude * 1.0001 < local_unpruned_magnitude:
            if verbosity > 5:
                print(
                    f"Skipping sparsity {sparsity} since there's no perfect permutation (unstructured mag {local_unpruned_magnitude} is larger than sparse rows {local_unstructured_rows_magnitude})."
                )
            min_sparsity = sparsity + 1
            sparsity = int(min_sparsity + (max_sparsity - min_sparsity) / 2.0)
            continue

        tmp_result, tmp_duration, found_permutation = search_function(tmp_result, **kwargs)
        duration += tmp_duration
        nonzeros = np.count_nonzero(tmp_result)
        tmp_result = apply_2_to_4(tmp_result)
        nonzeros_after_2to4 = np.count_nonzero(tmp_result)
        if nonzeros == nonzeros_after_2to4:  # found a winner, are we done?
            if verbosity > 3:
                print(f"Found an unstructured sparsity that we can turn into 2:4: {sparsity}")

            max_sparsity = sparsity
            if max_sparsity <= min_sparsity and verbosity > 0:
                print(
                    f"Found the minimum unstructured sparsity that we can turn into 2:4: {sparsity}"
                )
                break
        else:
            if verbosity > 5:
                print(f"Unstructured sparsity {sparsity} was insufficient to produce 2:4 sparsity")
            min_sparsity = sparsity + 1
            if max_sparsity <= min_sparsity and verbosity > 0:
                print(
                    f"Found the minimum unstructured sparsity that we can turn into 2:4: {max_sparsity}"
                )
                sparsity = max_sparsity
                break

        sparsity = int(min_sparsity + (max_sparsity - min_sparsity) / 2.0)

    return sparsity, duration


# Entry point
if __name__ == "__main__":
    args = parser.parse_args()
    verbosity = args.verbosity
    np.random.seed(seed=args.seed)
    use_gpu(initial_override=args.gpu)

    # get or create the input matrix
    input_vals = np.random.rand(args.filters, args.channels)
    if args.infile != "random":
        if "npy" in args.infile:
            input_vals = np.load(args.infile, "r")
        shp = input_vals.shape
        shp_str = str(shp).replace(",", "x")
        newshp_str = ""
        if len(shp) == 4:  # K,C,R,S -> RSK,C
            input_vals = (
                np.transpose(input_vals, (2, 3, 0, 1))
                .flatten()
                .reshape((shp[2] * shp[3] * shp[0], shp[1]))
            )
            newshp_str = str(input_vals.shape).replace(",", "x")
        print(f"{args.infile},{shp_str},{newshp_str}")
        if input_vals.shape[1] % 4 != 0:
            print(f"Unfriendly shape {input_vals.shape}, not pruning.")
            sys.exit()

    # unstructured prune if requested
    if args.unstructured > 0.0:
        args.unstructured = min(args.unstructured, 1.0)
        input_vals = unstructured_prune(input_vals, args.unstructured)
        print(
            f"{args.infile} pruned to {args.unstructured * 100.0:>.1f} sparsity, shape is {input_vals.shape}"
        )

    # calculate some early metrics
    sorted_magnitudes = np.sort(np.abs(input_vals), axis=None)
    unpruned_magnitude = np.sum(sorted_magnitudes)
    num_weights = sorted_magnitudes.size
    unstructured_magnitude = np.sum(sorted_magnitudes[int(num_weights / 2) :])
    unstructured_rows_magnitude = magnitude_after_pruning_rows(input_vals, rate=0.5)
    simple_2to4 = apply_2_to_4(np.copy(input_vals))
    simple_2to4_magnitude = sum_after_2_to_4(input_vals)
    tmp_time = time.perf_counter()
    simple_2to4_magnitude = sum_after_2_to_4(input_vals)
    default_duration = time.perf_counter() - tmp_time
    best_magnitude = unstructured_rows_magnitude

    best_lost_magnitude = unpruned_magnitude - best_magnitude
    base_lost_magnitude = unpruned_magnitude - simple_2to4_magnitude

    # prep results table
    final_metric = "efficacy"
    if args.unstructured < 0.0:
        final_metric = "min_sparsity"
    if args.pretty_print:
        print(f"{'strategy':<35},{'magnitude':>15},{final_metric:>15},{'duration':>15}")
        print(f"{'unpruned':<35},{unpruned_magnitude:>15.3f},{'-':^15},{'-':^15}")
        print(f"{'unstructured':<35},{unstructured_magnitude:>15.3f},{'-':^15},{'-':^15}")
        print(f"{'50% rows':<35},{unstructured_rows_magnitude:>15.3f},{'100.0':>15},{'-':^15}")
        print(
            f"{'default 2:4':<35},{simple_2to4_magnitude:>15.3f},{'0.0':>15},{default_duration:>15.3f}"
        )
    else:
        print(f"strategy,magnitude,{final_metric},duration")
        print(f"unpruned,{unpruned_magnitude},-,-")
        print(f"unstructured,{unstructured_magnitude},-,-")
        print(f"50%_rows,{unstructured_rows_magnitude},100.0,-")
        print(f"2:4,{simple_2to4_magnitude},0.0,{default_duration}")

    # try the requested strategies
    for i, strategy in enumerate(args.strategies):
        result = np.copy(input_vals)
        np.random.seed(seed=args.seed)

        duration = 0.0
        min_sparsity = 0.0
        strat_split = strategy.split(",")
        found_permutation = None

        # optimize stripe groups
        if strat_split[0] == "optimize_stripe_groups":
            stripe_group_size_in_cols = 8
            if len(strat_split) >= 2:
                stripe_group_size_in_cols = int(strat_split[1])
            escape_attempts = 100
            if len(strat_split) >= 3:
                escape_attempts = int(strat_split[2])

            if args.unstructured >= 0.0:  # just perform the search on the current matrix
                result, duration, found_permutation = Exhaustive_Search(
                    result,
                    stripe_group_size=stripe_group_size_in_cols,
                    escape_attempts=escape_attempts,
                )
            else:  # find the minimum sparsity needed to transparently transform the input
                min_sparsity, duration = find_minimum_sparsity(
                    result,
                    Exhaustive_Search,
                    stripe_group_size=stripe_group_size_in_cols,
                    escape_attempts=escape_attempts,
                )
                result = unstructured_prune(result, min_sparsity / 100.0)

        # channel swaps
        elif strat_split[0] == "channel_swap":
            escape_attempts = 0
            if len(strat_split) >= 2:
                escape_attempts = int(strat_split[1])

            if args.unstructured >= 0.0:  # just perform the search on the current matrix
                result, duration, found_permutation = Channel_Swap(
                    result, escape_attempts=escape_attempts, verbosity=verbosity
                )
            else:  # find the minimum sparsity needed to transparently transform the input
                min_sparsity, duration = find_minimum_sparsity(
                    result,
                    Channel_Swap,
                    escape_attempts=escape_attempts,
                    verbosity=verbosity,
                )
                result = unstructured_prune(result, min_sparsity / 100.0)

        # random permutations
        elif strat_split[0] == "random":
            if (
                args.unstructured < 0.0
            ):  # searching for minimum sparsity not supported for random permutations
                continue

            num_perms = 10
            if len(strat_split) >= 2 and int(strat_split[1]) >= 1:
                num_perms = int(strat_split[1])

            # try the seeds/permutations
            permutation = [c for c in range(result.shape[1])]
            best_sum = sum_after_2_to_4(result)
            best_perm = permutation.copy()
            start_time = time.perf_counter()
            for x in range(num_perms):
                permutation = np.random.permutation(permutation)
                cur_sum = sum_after_2_to_4(result[:, permutation])
                if cur_sum > best_sum:
                    best_sum = cur_sum
                    best_perm = permutation.copy()
                    if verbosity > 0:
                        print(f"\tnew best permutation {x} found with magnitude {best_sum:>15.3f}")
                elif verbosity > 5:
                    print(f"\tpermutation {x} magnitude too low: {cur_sum:>15.3f}")
            duration = time.perf_counter() - start_time
            result = result[:, best_perm]
            found_permutation = best_perm

        else:
            print(f"Unknown strategy: {strategy}!")
            sys.exit()

        # report stats for this strategy
        cur_mag = sum_after_2_to_4(result)
        cur_eff = (
            efficacy(best_lost_magnitude, base_lost_magnitude, unpruned_magnitude - cur_mag) * 100.0
        )
        final_metric = cur_eff
        if args.unstructured < 0.0:
            final_metric = min_sparsity
        perm_distance = ""

        error = None
        if args.check_permutation and found_permutation is not None:
            recovered_perm = find_permutation(result, input_vals)

            error = False
            for c in range(len(recovered_perm)):
                if recovered_perm[c] != found_permutation[c]:
                    if verbosity > 0:
                        print(
                            f"tracked permutation at index {c} was {found_permutation[c]}, but the recovered permutation thought it was {recovered_perm[c]}"
                        )
                    error = True

        # if requested, generate permutations that divide the efficacy space into equal steps
        if args.intermediate_steps != 0:
            magnitude_targets = None
            if args.intermediate_steps != 0:
                ratios = [
                    step / float(args.intermediate_steps + 1)
                    for step in range(1, args.intermediate_steps + 1)
                ]
                mag_diff = cur_mag - (unpruned_magnitude - base_lost_magnitude)
                magnitude_targets = [
                    (unpruned_magnitude - base_lost_magnitude) + mag_diff * ratio
                    for ratio in ratios
                ]
            perm_distance, target_permutations = permutation_distance(
                found_permutation,
                [c for c in range(result.shape[1])],
                matrix=input_vals,
                magnitude_targets=magnitude_targets,
                debug=False,
                verbosity=verbosity,
            )
            if target_permutations is not None:
                for target_permutation in target_permutations:
                    print(target_permutation)

        error_str = ""
        if error is not None:
            error_str = ",       correct"
            if error:
                error_str = ",      mismatch"

        if args.pretty_print:
            print(
                f"{strategy:35},{cur_mag:>15.3f},{final_metric:>15.1f},{duration:>15.3f}{error_str:>15}"
            )
        else:
            strat_string = strategy.replace(",", "_")
            print(f"{strat_string},{cur_mag},{final_metric},{duration}{error_str}")

        if args.print_permutation and found_permutation is not None:
            print(found_permutation)


================================================
FILE: apex/contrib/sparsity/permutation_tests/runtime_table.sh
================================================
#!/bin/bash

OUTDIR="results/runtime_logs"
mkdir -p $OUTDIR

R1000=random,1000
CS=channel_swap,0
CS_100=channel_swap,100
OSG2=optimize_stripe_groups,8,0
OSG2_100=optimize_stripe_groups,8,100
OSG2_1000=optimize_stripe_groups,8,1000
OSG3=optimize_stripe_groups,12,0
OSG3_100=optimize_stripe_groups,12,100
OSG3_1000=optimize_stripe_groups,12,1000

for cols in "32" "64" "128" "256"; do
    echo "$cols x $cols"
    python3 permutation_test.py --channels $cols --filters $cols --pretty_print=False $R1000 $CS $CS_100 $OSG2 $OSG2_100 $OSG2_1000 $OSG3 $OSG3_100 $OSG3_1000 | tee "${OUTDIR}/runtime_${cols}x${cols}.log"
    let "rows = $cols * 2"
    echo "$cols x $rows"
    python3 permutation_test.py --channels $cols --filters $rows --pretty_print=False $R1000 $CS $CS_100 $OSG2 $OSG2_100 $OSG2_1000 $OSG3 $OSG3_100 $OSG3_1000 | tee "${OUTDIR}/runtime_${cols}x${rows}.log"
done

# 2048x2048 is too large for OSG3
echo "2048 x 2048"
python3 permutation_test.py --channels 2048 --filters 2048 --pretty_print=False $R1000 $CS $CS_100 $OSG2 $OSG2_100 $OSG2_1000 | tee "${OUTDIR}/runtime_2048x2048.log"


############### collect results into a .csv file
echo "Gathering results ..."

# efficacy and runtime from one strategy and size
get_results() {
    local strategy=$1
    local cols=$2
    local rows=$3
    local OUTFILE=$4

    grep "$strategy," "$OUTDIR/runtime_${cols}x${rows}.log" | awk -F "," '{printf "%s,%s,",$3,$4}' >> $OUTFILE
}

# prepare output file headers
OUTFILE="results/runtimes.csv"
printf "Columns," > $OUTFILE
for cols in "32" "64" "128" "256"; do
    printf "$cols,$cols,$cols,$cols," >> $OUTFILE
done
printf "2048,2048\n" >> $OUTFILE

printf "Rows," >> $OUTFILE
for cols in "32" "64" "128" "256"; do
    let "rows = $cols * 2"
    printf "$cols,$cols,$rows,$rows," >> $OUTFILE
done
printf "2048,2048\n" >> $OUTFILE

printf "Metric," >> $OUTFILE
for cols in "32" "64" "128" "256"; do
    printf "Efficacy,Runtime,Efficay,Runtime," >> $OUTFILE
done
printf "Efficacy,Runtime\n" >> $OUTFILE

# gather data in a reasonable order
for strategy in "$R1000" "$CS" "$CS_100" "$OSG2" "$OSG2_100" "$OSG2_1000" "$OSG3" "$OSG3_100" "$OSG3_1000"; do
    strategy=$(echo "$strategy" | sed 's/,/_/g') # replace commas with underscores, as they'll appear in the results logs
    printf "$strategy," >> $OUTFILE
    for cols in "32" "64" "128" "256"; do
        get_results "$strategy" "$cols" "$cols" "$OUTFILE"
        let "rows = $cols * 2"
        get_results "$strategy" "$cols" "$rows" "$OUTFILE"
    done

    get_results "$strategy" "2048" "2048" "$OUTFILE"

    printf "\n" >> $OUTFILE
done

echo "Done! $OUTFILE"


================================================
FILE: apex/contrib/sparsity/permutation_tests/unstructured_study.sh
================================================
#!/bin/bash

if [ "$#" -ne 2 ]; then
  echo "Please specify both the source directory and a run tag: bash unstructured_study.sh <directory> <tag>"
  exit
fi

dir=$1  # or set to the directory containing .npy files of interest
tag=$2 # or set to an identifier, e.g. "network_name"

resdir="results/unstructured_logs/${tag}"
mkdir -p $resdir

CS=channel_swap,0
OSG2=optimize_stripe_groups,8,0
OSG2_100=optimize_stripe_groups,8,100
OSG2_1000=optimize_stripe_groups,8,1000
OSG3=optimize_stripe_groups,12,0

CS_successes=()
OSG2_successes=()
OSG2_100_successes=()
OSG2_1000_successes=()
OSG3_successes=()

for sparsity in {50..100}; do
    CS_successes+=(0)
    OSG2_successes+=(0)
    OSG2_100_successes+=(0)
    OSG2_1000_successes+=(0)
    OSG3_successes+=(0)
done

update_successes () {
    strategy=$1
    local -n _successes=$2
    logfile=$3

    limit=$(grep "${strategy}," $logfile | awk -F "," '{print $3}')
 
    echo $logfile, $strategy, $limit
    for (( sparsity=$limit; sparsity<=100; sparsity++ )); do
        let "entry = $sparsity - 50"
        let "value = ${_successes[$entry]} + 1"
        _successes[$entry]=$value
    done
}

# Figure 4
for filename in $dir/*.npy; do
    out=$(basename -- "$filename")
    echo "Searching for minimum sparsities for $out"
    out=$resdir/$out.unstructured
    python3 permutation_test.py --infile=$filename --pretty_print=False --unstructured=-1 $CS $OSG2 $OSG2_100 $OSG2_1000 $OSG3 > $out

    update_successes "channel_swap_0" CS_successes "$out"
    update_successes "optimize_stripe_groups_8_0" OSG2_successes "$out"
    update_successes "optimize_stripe_groups_8_100" OSG2_100_successes "$out"
    update_successes "optimize_stripe_groups_8_1000" OSG2_1000_successes "$out"
    update_successes "optimize_stripe_groups_12_0" OSG3_successes "$out"
done

#################### save the table
# log a single strategy in as a row in the table
log_success () {
    strategy=$1
    local -n _successes=$2
    OUTFILE=$3

    printf "$strategy," >> $OUTFILE
    for sparsity in {50..100}; do
        let "entry = $sparsity - 50"
        printf "%d," ${_successes[$entry]} >> $OUTFILE
    done
    printf "\n" >> $OUTFILE
}

# prepare the header
OUTFILE="results/unstructured.csv"
printf "Sparsity," > $OUTFILE
for sparsity in {50..100}; do
    printf "%d," $sparsity >> $OUTFILE
done
printf "\n" >> $OUTFILE

# add data for each strategy
log_success "channel_swap_0" CS_successes "$OUTFILE"
log_success "optimize_stripe_groups_8_0" OSG2_successes "$OUTFILE"
log_success "optimize_stripe_groups_8_100" OSG2_100_successes "$OUTFILE"
log_success "optimize_stripe_groups_8_1000" OSG2_1000_successes "$OUTFILE"
log_success "optimize_stripe_groups_12_0" OSG3_successes "$OUTFILE"

echo "Done! ${OUTFILE}"


================================================
FILE: apex/contrib/sparsity/sparse_masklib.py
================================================
import sys
import torch
import numpy as np
import collections
from itertools import permutations


""" compute density (helper fn to compute % NNZs in a tensor) """


def fill(x):
    return float(x.nonzero().size(0)) / torch.numel(x)


""" reshape matrix into m-dimensional vectors: (h,w) -> (hw/m, m) """


def reshape_1d(matrix, m):
    # If not a nice multiple of m, fill with zeroes.
    if matrix.shape[1] % m > 0:
        mat = torch.cuda.FloatTensor(
            matrix.shape[0], matrix.shape[1] + (m - matrix.shape[1] % m)
        ).fill_(0)
        mat[:, : matrix.shape[1]] = matrix
        shape = mat.shape
        return mat.view(-1, m), shape
    else:
        return matrix.view(-1, m), matrix.shape


""" return all possible m:n patterns in a 1d vector """
valid_m4n2_1d_patterns = None


def compute_valid_1d_patterns(m, n):
    # Early exit if patterns was already created.
    global valid_m4n2_1d_patterns

    if m == 4 and n == 2 and valid_m4n2_1d_patterns is not None:
        return valid_m4n2_1d_patterns
    patterns = torch.zeros(m)
    patterns[:n] = 1
    valid_patterns = torch.tensor(list(set(permutations(patterns.tolist()))))
    if m == 4 and n == 2:
        valid_m4n2_1d_patterns = valid_patterns
    return valid_patterns


""" m:n 1d structured best """


def mn_1d_best(matrix, m, n):
    # Find all possible patterns.
    patterns = compute_valid_1d_patterns(m, n).cuda()

    # Find the best m:n pattern (sum of non-masked weights).
    mask = torch.cuda.IntTensor(matrix.shape).fill_(1).view(-1, m)
    mat, shape = reshape_1d(matrix, m)
    pmax = torch.argmax(torch.matmul(mat.abs(), patterns.t()), dim=1)
    mask[:] = patterns[pmax[:]]
    mask = mask.view(matrix.shape)
    return mask


def m4n2_1d(mat, density):
    return mn_1d_best(mat, 4, 2)


"""
  Below 2d-masking related code is targeted more for training (from scratch).
  2d-pruning of a weight tensor is done to accelerate DGRAD step during backprop
  phase of training algorithm. Acceleration comes from using SpMMA instructions in
  Tensor Cores of NVIDIA Ampere GPU Architecture 
  (note: this code does not do the acceleration, GPU kernels are required for this).
  1d pruning of weight tensor helps speed up FPROP step by pruning in 2:4 pattern
  along the horizontal (logical) direction.
  During DGRAD step, weight tensor is transposed. 2d pruning functions below, mask
  weight tensor such that their transposed versions are also 2:4 sparse along the
  horizontal (logical) direction. Thus, with 2d pruning, weight tensors are 
  2:4 sparse along row and column directions.
 """

""" m:n 2d structured pruning: greedy method to select mask """


def mn_2d_greedy(matrix, m, n):
    # Convert to numpy
    mat = matrix.cpu().detach().numpy()
    mask = np.ones(mat.shape, dtype=int)

    rowCount = int(mat.shape[0] / m) * m
    colCount = int(mat.shape[1] / m) * m
    for rowStartIdx in range(0, rowCount, m):
        rowEndIdx = rowStartIdx + m
        for colStartIdx in range(0, colCount, m):
            colEndIdx = colStartIdx + m
            matrixSub = np.absolute(np.squeeze(mat[rowStartIdx:rowEndIdx, colStartIdx:colEndIdx]))
            maskSub = np.squeeze(mask[rowStartIdx:rowEndIdx, colStartIdx:colEndIdx])
            maskSub.fill(0.0)
            matrixVecView = matrixSub.reshape(-1)
            maskVecView = maskSub.reshape(-1)
            linearIdx = np.argsort(matrixVecView)
            matrixIdx = [(int(x / m), x % m) for x in linearIdx]
            rowCounter = collections.Counter()
            colCounter = collections.Counter()
            for currIdx in range(len(linearIdx) - 1, -1, -1):
                currMatrixEntry = matrixIdx[currIdx]
                if (rowCounter[currMatrixEntry[0]] == n) or (colCounter[currMatrixEntry[1]] == n):
                    continue
                # end if
                maskSub[currMatrixEntry[0], currMatrixEntry[1]] = 1.0
                rowCounter[currMatrixEntry[0]] += 1
                colCounter[currMatrixEntry[1]] += 1

    return torch.tensor(mask.cuda())


def m4n2_2d_greedy(mat, density):
    return mn_2d_greedy(mat, 4, 2)


""" return all possible m:n patterns in a mxn block. """
valid_m4n2_2d_patterns = None


def compute_valid_2d_patterns(m, n):
    # Early exit if patterns was already created.
    global valid_m4n2_2d_patterns
    if valid_m4n2_2d_patterns is not None:
        return valid_m4n2_2d_patterns

    patterns = torch.zeros(m)
    patterns[:n] = 1
    patterns = list(set(permutations(patterns.tolist())))
    patterns = patterns + patterns
    patterns = torch.empty(list(set(permutations(patterns, m))))

    valid = ((patterns.sum(dim=1) <= n).sum(dim=1) == m).nonzero().view(-1)
    valid_patterns = torch.empty(valid.shape[0], m, m)
    valid_patterns[:] = patterns[valid[:]]

    if m == 4 and n == 2:
        valid_m4n2_2d_patterns = valid_patterns
    return valid_patterns


""" m:n 2d structured pruning: exhaustive method to select best mask """


def mn_2d_best(matrix, m, n):
    # Find all possible patterns.
    patterns = compute_valid_2d_patterns(m, n).cuda()

    # Find the best m:n pattern (sum of non-masked weights).
    mask = torch.cuda.IntTensor(matrix.shape).fill_(1)
    mat = reshape_2d(matrix, m, m).abs()
    pmax = torch.argmax(torch.matmul(mat, patterns.view(patterns.shape[0], m * m).t()), dim=2)

    # Copy best m:n patterns into mask.
    mat = mat.view(mat.shape[0] * mat.shape[1], -1)
    pmax = pmax.view(pmax.shape[0] * pmax.shape[1]).unsqueeze(1).expand(-1, mat.shape[1])
    patterns = patterns.view(patterns.shape[0], patterns.shape[1] * patterns.shape[2])
    mat = torch.gather(patterns, 0, pmax)
    mat = reshape_2d_inv(mat.view(matrix.shape[0] // m, matrix.shape[1] // m, m, m))
    mask.copy_(mat.type(mask.type()))
    return mask


def m4n2_2d_best(mat, density):
    return mn_2d_best(mat, 4, 2)


""" returns a sparse mask """


def create_mask(tensor, pattern="m4n2_1d", density=0.5):
    # Reshape tensor and mask.
    shape = tensor.shape
    ttype = tensor.type()
    t = tensor.float().contiguous()

    # 1d-tensor
    if len(shape) == 1:
        t = t.view(1, shape[0])
        func = getattr(sys.modules[__name__], pattern, None)
        mask = func(t, density)
        return mask.view(shape).type(ttype)
    # 2d-tensor (K, C)
    elif len(shape) == 2:
        # linear
        t = t.view(shape[0], shape[1])
        func = getattr(sys.modules[__name__], pattern, None)
        mask = func(t, density)
        return mask.view(shape).type(ttype)
    # 3d-tensor (K, C, R)
    elif len(shape) == 3:
        # 1d convs
        t = t.permute(0, 2, 1).contiguous().view(shape[0] * shape[2], shape[1])
        func = getattr(sys.modules[__name__], pattern, None)
        mask = func(t, density)
        mask = mask.view(shape[0], shape[2], shape[1]).permute(0, 2, 1).contiguous()
        return mask.view(shape).type(ttype)
    # 4d-tensor (K, C, R, S)
    elif len(shape) == 4:
        """
        # transformers (bmm)
        t = t.view(shape[0]*shape[1]*shape[2], shape[3])
        func = getattr(sys.modules[__name__], pattern, None)
        mask = func(t, density)
        return mask.view(shape).type(ttype)
        """
        # 2d convs
        t = t.permute(2, 3, 0, 1).contiguous().view(shape[2] * shape[3] * shape[0], shape[1])
        func = getattr(sys.modules[__name__], pattern, None)
        mask = func(t, density)
        mask = mask.view(shape[2], shape[3], shape[0], shape[1]).permute(2, 3, 0, 1).contiguous()
        return mask.view(shape).type(ttype)


================================================
FILE: apex/contrib/sparsity/test/checkpointing_test_part1.py
================================================
from collections import OrderedDict

import torch
from apex.optimizers import FusedAdam
from apex.contrib.sparsity import ASP


def build_model(args):
    od = OrderedDict()
    for i in range(args.num_layers):
        if i == 0:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.input_features, args.hidden_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.hidden_features]
            )
        elif i == args.num_layers - 1:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.hidden_features, args.output_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.output_features]
            )
        else:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.hidden_features, args.hidden_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.hidden_features]
            )
    return torch.nn.Sequential(od)


def train_step(args, model, optimizer, input_batch, target_batch, step):
    predicted_target = model(input_batch)
    loss = ((predicted_target - target_batch) ** 2).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    step = step + 1
    # print("Step %d :: loss=%e" % (step, loss.item()))
    return step


def train_loop(args, model, optimizer, step, num_steps):
    for i in range(num_steps):
        input_batch = torch.randn([args.batch_size, args.input_features]).cuda()
        target_batch = torch.randn([args.batch_size, args.output_features]).cuda()
        step = train_step(args, model, optimizer, input_batch, target_batch, step)
    return step


def main(args):
    #
    # PART1
    #

    torch.manual_seed(args.seed)

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(
        model,
        args.pattern,
        verbosity=args.verbosity,
        whitelist=args.whitelist,
        allow_recompute_mask=args.allow_recompute_mask,
    )
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    torch.save(
        {
            "step": step,
            "verbosity": args.verbosity,
            "seed2": args.seed2,
            "pattern": args.pattern,
            "whitelist": args.whitelist,
            "allow_recompute_mask": args.allow_recompute_mask,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        },
        args.checkpoint_path,
    )


if __name__ == "__main__":

    class Args:
        verbosity = 3
        seed = 4873
        seed2 = 99875
        pattern = "m4n2_2d_best"
        whitelist = [torch.nn.Linear]
        allow_recompute_mask = True
        batch_size = 32
        input_features = 8
        output_features = 8
        hidden_features = 32
        num_layers = 4
        num_dense_steps = 2000
        num_sparse_steps = 3000
        num_sparse_steps_2 = 1000
        checkpoint_path = "part1.chkp"

    args = Args()

    main(args)


================================================
FILE: apex/contrib/sparsity/test/checkpointing_test_part2.py
================================================
from collections import OrderedDict

import torch
from apex.optimizers import FusedAdam
from apex.contrib.sparsity import ASP


def build_model(args):
    od = OrderedDict()
    for i in range(args.num_layers):
        if i == 0:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.input_features, args.hidden_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.hidden_features]
            )
        elif i == args.num_layers - 1:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.hidden_features, args.output_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.output_features]
            )
        else:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.hidden_features, args.hidden_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.hidden_features]
            )
    return torch.nn.Sequential(od)


def train_step(args, model, optimizer, input_batch, target_batch, step):
    predicted_target = model(input_batch)
    loss = ((predicted_target - target_batch) ** 2).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    step = step + 1
    # print("Step %d :: loss=%e" % (step, loss.item()))
    return step


def train_loop(args, model, optimizer, step, num_steps):
    for i in range(num_steps):
        input_batch = torch.randn([args.batch_size, args.input_features]).cuda()
        target_batch = torch.randn([args.batch_size, args.output_features]).cuda()
        step = train_step(args, model, optimizer, input_batch, target_batch, step)
    return step


def main(step, args, model_state_dict, optimizer_state_dict):
    #
    # PART2
    #

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(
        model,
        args.pattern,
        verbosity=args.verbosity,
        whitelist=args.whitelist,
        allow_recompute_mask=args.allow_recompute_mask,
    )
    ASP.init_optimizer_for_pruning(optimizer)

    torch.manual_seed(args.seed2)
    model.load_state_dict(model_state_dict)
    optimizer.load_state_dict(optimizer_state_dict)

    print("Model sparsity is %s" % ("enabled" if ASP.is_sparsity_enabled() else "disabled"))

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)


if __name__ == "__main__":
    checkpoint = torch.load("part1.chkp")

    class Args:
        verbosity = checkpoint["verbosity"]
        seed = 4873
        seed2 = checkpoint["seed2"]
        pattern = checkpoint["pattern"]
        whitelist = checkpoint["whitelist"]
        allow_recompute_mask = checkpoint["allow_recompute_mask"]
        batch_size = 32
        input_features = 8
        output_features = 8
        hidden_features = 32
        num_layers = 4
        num_dense_steps = 2000
        num_sparse_steps = 3000
        num_sparse_steps_2 = 1000
        checkpoint_path = "part1.chkp"

    args = Args()

    main(
        checkpoint["step"],
        args,
        checkpoint["model_state_dict"],
        checkpoint["optimizer_state_dict"],
    )


================================================
FILE: apex/contrib/sparsity/test/checkpointing_test_reference.py
================================================
from collections import OrderedDict

import torch
from apex.optimizers import FusedAdam
from apex.contrib.sparsity import ASP

#
# Reference run for checkpointing test (part1 + part2)
#


def build_model(args):
    od = OrderedDict()
    for i in range(args.num_layers):
        if i == 0:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.input_features, args.hidden_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.hidden_features]
            )
        elif i == args.num_layers - 1:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.hidden_features, args.output_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.output_features]
            )
        else:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.hidden_features, args.hidden_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.hidden_features]
            )
    return torch.nn.Sequential(od)


def train_step(args, model, optimizer, input_batch, target_batch, step):
    predicted_target = model(input_batch)
    loss = ((predicted_target - target_batch) ** 2).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    step = step + 1
    # print("Step %d :: loss=%e" % (step, loss.item()))
    return step


def train_loop(args, model, optimizer, step, num_steps):
    for i in range(num_steps):
        input_batch = torch.randn([args.batch_size, args.input_features]).cuda()
        target_batch = torch.randn([args.batch_size, args.output_features]).cuda()
        step = train_step(args, model, optimizer, input_batch, target_batch, step)
    return step


def main(args):
    #
    # PART1
    #

    torch.manual_seed(args.seed)

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(
        model,
        args.pattern,
        whitelist=args.whitelist,
        allow_recompute_mask=args.allow_recompute_mask,
    )
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    #
    # PART 2
    #

    torch.manual_seed(args.seed2)

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)


if __name__ == "__main__":

    class Args:
        seed = 4873
        seed2 = 99875
        pattern = "m4n2_2d_best"
        whitelist = [torch.nn.Linear]
        allow_recompute_mask = True
        batch_size = 32
        input_features = 8
        output_features = 8
        hidden_features = 32
        num_layers = 4
        num_dense_steps = 2000
        num_sparse_steps = 3000
        num_sparse_steps_2 = 1000
        checkpoint_path = "part1.chkp"

    args = Args()

    main(args)


================================================
FILE: apex/contrib/sparsity/test/test_permutation_application.py
================================================
import torch
import torch.onnx
from apex.contrib.sparsity.permutation_lib import Permutation

"""
Functional and behavioral correctness checking for network permutations
Each test class is a torch.nn.Module with three required members:
- self.input_shape is used to populate a dummy input
- self.expected_C_params indicates how many parameters are expected to be permuted in the C dimension
- self.expected_K_params indicates how many parameters are expected to be permuted in the K dimension

A test is successful if and only if:
1. The output of the un-permuted module matches (within a tolerance) the ouput of the permuted module
2. The number of parameters permuted in C, as reported by the Permutation class, matches the expected value in the test module
3. The number of parameters permuted in K, as reported by the Permutation class, matches the expected value in the test module

This file has all the test modules defined first, followed by the common test routine to check each module's correctness, and finally the main/entry point.
"""


class simple_convs(torch.nn.Module):
    """Stack of 2d convolutions with different normalization and activation functions"""

    def __init__(
        self,
        num_convs: int,
        channels: int,
        normalization: str = "none",
        activation: str = "ReLU",
    ):
        super().__init__()
        self.num_convs = num_convs
        self.channels = channels
        self.normalization = normalization
        self.activation = activation

        self.input_shape = [4, channels, 7, 7]

        # we'll permute all convs' weights along C except the first
        self.expected_C_params = -1
        self.expected_K_params = 0

        self.conv_stack = torch.nn.Sequential()
        for c in range(self.num_convs - 1):
            self.conv_stack.add_module(
                f"conv_{c}",
                torch.nn.Conv2d(self.channels, self.channels, kernel_size=(3, 3), padding=1),
            )
            self.expected_C_params += 1
            self.expected_K_params += 2

            if self.normalization == "BatchNorm2d":
                self.conv_stack.add_module(
                    f"norm_{c}",
                    torch.nn.BatchNorm2d(self.channels, track_running_stats=False),
                )
                self.expected_K_params += 2
            elif self.normalization == "LazyBatchNorm2d":
                self.conv_stack.add_module(
                    f"norm_{c}", torch.nn.LazyBatchNorm2d(track_running_stats=False)
                )
                self.expected_K_params += 2
            elif self.normalization == "GroupNorm":
                self.conv_stack.add_module(
                    f"norm_{c}", torch.nn.GroupNorm(4, self.channels, affine=True)
                )
                self.expected_C_params -= 1  # GN prevents permutations of the neighboring convs
                self.expected_K_params -= 2
            elif self.normalization == "InstanceNorm2d":
                self.conv_stack.add_module(
                    f"norm_{c}",
                    torch.nn.InstanceNorm2d(self.channels, affine=True, track_running_stats=False),
                )
                self.expected_K_params += 2
            elif self.normalization == "LocalResponseNorm":
                self.conv_stack.add_module(f"norm_{c}", torch.nn.LocalResponseNorm(16))
            elif self.normalization == "LayerNorm1":
                self.conv_stack.add_module(f"norm_{c}", torch.nn.LayerNorm(7))
            elif self.normalization == "LayerNorm2":
                self.conv_stack.add_module(f"norm_{c}", torch.nn.LayerNorm([7, 7]))
            elif self.normalization == "LayerNorm3":
                self.conv_stack.add_module(f"norm_{c}", torch.nn.LayerNorm([self.channels, 7, 7]))
                self.expected_K_params += 2
            elif self.normalization == "SyncBatchNorm":
                self.conv_stack.add_module(
                    f"norm_{c}",
                    torch.nn.SyncBatchNorm(self.channels, track_running_stats=False),
                )
                self.expected_K_params += 2

            self.conv_stack.add_module(f"act_{c}", torch.nn.ReLU())

        self.conv_stack.add_module(
            "conv_out", torch.nn.Conv2d(self.channels, 8, kernel_size=(1, 1))
        )
        self.expected_C_params += 1

    def forward(self, x: torch.Tensor):
        x = self.conv_stack(x)

        return x


class conv_1d(torch.nn.Module):
    """1D convolutions in isolation and with siblings"""

    def __init__(
        self,
        with_2d=False,
    ):
        super().__init__()
        self.input_shape = [4, 16, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0
        self.with_2d = with_2d

        self.input_conv = torch.nn.Conv2d(self.input_shape[1], 32, kernel_size=(3, 3), padding=1)
        self.expected_K_params += 2

        self.branch_a_1D = torch.nn.Conv1d(32, 32, kernel_size=3, padding=1)
        self.expected_C_params += 1
        self.expected_K_params += 2
        if self.with_2d:
            self.branch_b_2D = torch.nn.Conv2d(32, 32, kernel_size=(3, 3), padding=1)
            self.expected_C_params += 1
            self.expected_K_params += 2

        self.out_conv = torch.nn.Conv2d(32, 8, kernel_size=(1, 1))
        self.expected_C_params += 1

    def forward(self, x: torch.Tensor):
        step0 = self.input_conv(x)

        s0shape = step0.shape
        step1 = self.branch_a_1D(step0.view(s0shape[0], s0shape[1], s0shape[2] * s0shape[3])).view(
            s0shape
        )
        if self.with_2d:
            step1 = step1 + self.branch_b_2D(step0)

        return self.out_conv(step1)


class grouped_convs(torch.nn.Module):
    """Stack of 2d convolutions with different types of grouped convolutions"""

    def __init__(
        self,
    ):
        super().__init__()
        self.channels = 128
        self.input_shape = [4, self.channels, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.conv_stack = torch.nn.Sequential()
        self.conv_stack.add_module(
            "conv_in",
            torch.nn.Conv2d(self.channels, self.channels, kernel_size=(3, 3), padding=1),
        )

        # dw conv will let previous and this layers' weights and biases permute along K
        self.expected_K_params += 4
        self.conv_stack.add_module(
            "conv_dw",
            torch.nn.Conv2d(
                self.channels,
                self.channels,
                kernel_size=(3, 3),
                padding=1,
                groups=self.channels,
            ),
        )

        # regular conv permutes both
        self.expected_C_params += 1
        self.expected_K_params += 2
        self.conv_stack.add_module(
            "conv_0",
            torch.nn.Conv2d(self.channels, self.channels, kernel_size=(3, 3), padding=1, groups=1),
        )  # explicit '1' groups for extra coverage

        # only 2 groups should allow permutations only in C
        self.expected_C_params += 1
        self.conv_stack.add_module(
            "conv_gr2",
            torch.nn.Conv2d(self.channels, self.channels, kernel_size=(3, 3), padding=1, groups=2),
        )

        # another regular conv, this one can't do anything
        self.conv_stack.add_module(
            "conv_1",
            torch.nn.Conv2d(self.channels, self.channels, kernel_size=(3, 3), padding=1),
        )

        # finally, grouped conv with small groups
        self.conv_stack.add_module(
            "conv_gr64",
            torch.nn.Conv2d(
                self.channels,
                self.channels,
                kernel_size=(3, 3),
                padding=1,
                groups=self.channels // 2,
            ),
        )

    def forward(self, input: torch.Tensor):
        return self.conv_stack(input)


class simple_forks_joins(torch.nn.Module):
    """Some simple residual connections to test collecting parameters into a single group.  Four sections: input, blocka + residual, blockb + blockc, output"""

    def __init__(
        self,
    ):
        super().__init__()
        self.channels = 64
        self.input_shape = [4, self.channels, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.input_convs = torch.nn.Sequential()
        # input conv can only permute along K
        self.expected_K_params += 2
        self.input_convs.add_module(
            "conv_in0",
            torch.nn.Conv2d(self.channels, self.channels, kernel_size=(3, 3), padding=1),
        )
        # the next conv can permute along both C and K
        self.expected_C_params += 1
        self.expected_K_params += 2
        self.input_convs.add_module(
            "conv_in1",
            torch.nn.Conv2d(self.channels, self.channels, kernel_size=(3, 3), padding=1),
        )
        # BN will permute 2 more along K
        self.expected_K_params += 2
        self.input_convs.add_module(
            "bn_in1", torch.nn.BatchNorm2d(self.channels, track_running_stats=False)
        )

        self.block_a = torch.nn.Sequential()
        # cut channels in half, then back to full, two fully permutable convs
        self.expected_C_params += 2
        self.expected_K_params += 4
        self.block_a.add_module(
            "conv_a0",
            torch.nn.Conv2d(self.channels, self.channels // 2, kernel_size=(3, 3), padding=1),
        )
        self.block_a.add_module(
            "conv_a1",
            torch.nn.Conv2d(self.channels // 2, self.channels, kernel_size=(3, 3), padding=1),
        )

        self.block_b = torch.nn.Sequential()
        # cut channels in half, then back to full, two fully permutable convs
        self.expected_C_params += 2
        self.expected_K_params += 4
        self.block_b.add_module(
            "conv_b0",
            torch.nn.Conv2d(self.channels, self.channels // 2, kernel_size=(3, 3), padding=1),
        )
        self.block_b.add_module(
            "conv_b1",
            torch.nn.Conv2d(self.channels // 2, self.channels, kernel_size=(3, 3), padding=1),
        )

        self.block_c = torch.nn.Sequential()
        # cut channels in half, then back to full, two fully permutable convs
        self.expected_C_params += 2
        self.expected_K_params += 4
        self.block_c.add_module(
            "conv_c0",
            torch.nn.Conv2d(self.channels, self.channels // 2, kernel_size=(3, 3), padding=1),
        )
        self.block_c.add_module(
            "conv_c1",
            torch.nn.Conv2d(self.channels // 2, self.channels, kernel_size=(3, 3), padding=1),
        )

        self.output_conv = torch.nn.Sequential()
        self.expected_C_params += 1
        self.output_conv.add_module(
            "conv_out", torch.nn.Conv2d(self.channels, 8, kernel_size=(3, 3), padding=1)
        )

    def forward(self, input: torch.Tensor):
        step0 = self.input_convs(input)
        step1 = step0 + self.block_a(step0)
        step2 = self.block_b(step1) + self.block_c(step1)
        return self.output_conv(step2)


class different_grouped_convs(torch.nn.Module):
    """Convolutions with different group sizes need to use the GCD of the input channel counts if siblings"""

    def __init__(
        self,
    ):
        super().__init__()
        self.channels = 16
        self.input_shape = [4, self.channels, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.input_conv = torch.nn.Sequential()
        self.expected_K_params += 2
        self.input_conv.add_module(
            "input_conv",
            torch.nn.Conv2d(self.channels, 128, kernel_size=(3, 3), padding=1),
        )

        self.expected_C_params += 4
        # 4 parallel blocks with decreasing group size from "left" to "right"
        self.block_a = torch.nn.Sequential()
        self.block_a.add_module("conv_a", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1))
        self.block_b = torch.nn.Sequential()
        self.block_b.add_module(
            "conv_b", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1, groups=2)
        )
        self.block_c = torch.nn.Sequential()
        self.block_c.add_module(
            "conv_c", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1, groups=4)
        )
        self.block_d = torch.nn.Sequential()
        self.block_d.add_module(
            "conv_d", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1, groups=8)
        )

        # output can't permute along C, disallowed by parents
        self.output_conv = torch.nn.Sequential()
        self.output_conv.add_module(
            "output_conv", torch.nn.Conv2d(128, 8, kernel_size=(3, 3), padding=1)
        )

    def forward(self, input: torch.Tensor):
        step0 = self.input_conv(input)
        step1 = (
            self.block_a(step0) + self.block_b(step0) + self.block_c(step0) + self.block_d(step0)
        )
        return self.output_conv(step1)


class siblings_poison(torch.nn.Module):
    """A single sibling that cannot permute along C poisons all other siblings in its group"""

    def __init__(
        self,
    ):
        super().__init__()
        self.input_shape = [4, 16, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.input_conv = torch.nn.Sequential()
        self.input_conv.add_module(
            "input_conv",
            torch.nn.Conv2d(self.input_shape[1], 128, kernel_size=(3, 3), padding=1),
        )

        # two parallel block: conv->flatten->linear | flatten->linear
        self.expected_K_params += (
            4  # two linears will have their output channels permuted for the output layer
        )
        self.block_a = torch.nn.Sequential()
        self.block_a.add_module("conv_a", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1))
        self.block_a.add_module("flatten_a", torch.nn.Flatten(1))
        self.block_a.add_module("linear_a", torch.nn.Linear(6272, 128))

        self.block_b = torch.nn.Sequential()
        self.block_b.add_module("flatten_b", torch.nn.Flatten(1))
        self.block_b.add_module("linear_b", torch.nn.Linear(6272, 128))

        self.output = torch.nn.Sequential()
        self.expected_C_params += 1  # output layer will have its C dimension permuted
        self.output.add_module("output", torch.nn.Linear(128, 8))

    def forward(self, input: torch.Tensor):
        step0 = self.input_conv(input)
        step1 = self.block_a(step0) + self.block_b(step0)
        return self.output(step1)


class coparent_poison(torch.nn.Module):
    """A single coparent that cannot permute along K poisons all other coparents in its group"""

    def __init__(
        self,
    ):
        super().__init__()
        self.input_shape = [4, 16, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.input_conv = torch.nn.Sequential()
        self.expected_K_params += 2
        self.input_conv.add_module(
            "input_conv",
            torch.nn.Conv2d(self.input_shape[1], 128, kernel_size=(3, 3), padding=1),
        )

        # two parallel block: conv | conv-> grouped conv
        self.expected_C_params += 3  # all convs permute along C
        self.expected_K_params += 2  # only conv_b0 permutes along K
        self.block_a = torch.nn.Sequential()
        self.block_a.add_module("conv_a", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1))

        self.block_b = torch.nn.Sequential()
        self.block_b.add_module("conv_b0", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1))
        self.block_b.add_module(
            "conv_b1",
            torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1, groups=4),
        )

        self.output = torch.nn.Sequential()
        self.output.add_module("output", torch.nn.Conv2d(128, 8, kernel_size=(1, 1)))

    def forward(self, input: torch.Tensor):
        step0 = self.input_conv(input)
        step1 = self.block_a(step0) + self.block_b(step0)
        return self.output(step1)


class depthwise_child_is_sibling(torch.nn.Module):
    """The child of a depthwise convolution should act as a sibling"""

    def __init__(
        self,
    ):
        super().__init__()
        self.input_shape = [4, 16, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.input_conv = torch.nn.Sequential()
        self.expected_K_params += 2
        self.input_conv.add_module(
            "input_conv",
            torch.nn.Conv2d(self.input_shape[1], 128, kernel_size=(3, 3), padding=1),
        )

        # two parallel block: conv | depthwise->conv
        self.expected_C_params += 2
        self.expected_K_params += 4 + 2
        self.block_a = torch.nn.Sequential()
        self.block_a.add_module("conv_a", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1))

        self.block_b = torch.nn.Sequential()
        self.block_b.add_module(
            "conv_b_dw",
            torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1, groups=128),
        )
        self.block_b.add_module(
            "conv_b_1", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1)
        )

        self.output_conv = torch.nn.Sequential()
        self.expected_C_params += 1
        self.output_conv.add_module("output_conv", torch.nn.Conv2d(128, 8, kernel_size=(1, 1)))

    def forward(self, input: torch.Tensor):
        step0 = self.input_conv(input)
        step1 = self.block_a(step0) + self.block_b(step0)
        return self.output_conv(step1)


class module_attribute(torch.nn.Module):
    """Attributes of some module must be permuted if they feed some operation that is permuted"""

    def __init__(
        self,
        complexity: int = 0,
    ):
        super().__init__()
        self.input_shape = [4, 16, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0
        self.complexity = complexity

        self.input_conv = torch.nn.Sequential()
        self.expected_K_params += (
            3  # conv weight, conv bias, input_offset C (counts as K since it's acting as a parent)
        )
        self.input_offset = torch.nn.Parameter(torch.zeros(128, 7, 7))
        torch.nn.init.normal_(self.input_offset.data, mean=0.0, std=2.0)
        self.input_conv.add_module(
            "conv_input",
            torch.nn.Conv2d(self.input_shape[1], 128, kernel_size=(3, 3), padding=1),
        )

        # add a couple more layers, and let the same offset affect another layer, as well
        if complexity == 1:
            self.expected_C_params += 2
            self.expected_K_params += 4
            self.stack_a = torch.nn.Sequential()
            self.stack_a.add_module(
                "conv_a", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1)
            )

            self.stack_b = torch.nn.Sequential()
            self.stack_b.add_module(
                "conv_b", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1)
            )

        self.output_conv = torch.nn.Sequential()
        self.expected_C_params += 1
        self.output_conv.add_module("conv_output", torch.nn.Conv2d(128, 8, kernel_size=(3, 3)))

    def forward(self, input: torch.Tensor):
        batch_input_offset = self.input_offset.expand(input.shape[0], -1, -1, -1)
        x = self.input_conv(input) + batch_input_offset
        if self.complexity == 1:
            x = self.stack_a(x) + batch_input_offset
            x = self.stack_b(x) + batch_input_offset
        return self.output_conv(x)


class square_attribute(torch.nn.Module):
    """Attributes with multiple dimensions matching the permutation length should only be permuted along the correct dimension"""

    # TODO: currently, such an attribute will disallow permutations around it, but with effort, it could be handled correctly.

    def __init__(
        self,
    ):
        super().__init__()
        self.input_shape = [4, 16, 16]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.input_linear = torch.nn.Sequential()
        # self.expected_K_params += 2  # if handled correctly, the linear's K and the offset's K should both be permuted
        self.input_linear.add_module("linear_input", torch.nn.Linear(self.input_shape[1], 16))
        self.input_offset = torch.nn.Parameter(torch.zeros(16, 16))
        torch.nn.init.normal_(self.input_offset.data, mean=0.0, std=2.0)

        self.output_linear = torch.nn.Sequential()
        # self.expected_C_params += 1  # if handled correctly, this should be permuted
        self.output_linear.add_module("linear_output", torch.nn.Linear(16, 8))

    def forward(self, input: torch.Tensor):
        batch_input_offset = self.input_offset.expand(input.shape[0], -1, -1)
        x = self.input_linear(input) + torch.permute(batch_input_offset, (0, 2, 1))
        return self.output_linear(x)


class MHA_test(torch.nn.Module):
    """MultiheadAttention modules are unique, we need to check permutations for input and ouput projections"""

    def __init__(self, hidden_dim: int = 256, seq_len: int = 64, num_heads: int = 16):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.num_heads = num_heads
        self.input_shape = [4, self.seq_len, self.hidden_dim]

        self.expected_C_params = 1
        self.expected_K_params = 2

        self.MHA0 = torch.nn.MultiheadAttention(
            self.hidden_dim, self.num_heads, dropout=False, batch_first=True
        )
        self.MHA1 = torch.nn.MultiheadAttention(
            self.hidden_dim, self.num_heads, dropout=False, batch_first=True
        )

    def forward(self, input: torch.Tensor):
        step0, _ = self.MHA0(input, input, input)
        step1, _ = self.MHA1(step0, step0, step0)
        return step1


class one_sparse_sibling(torch.nn.Module):
    """If only one of two siblings is sparse, both need to be permuted"""

    def __init__(
        self,
    ):
        super().__init__()
        self.input_shape = [4, 16, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.in_conv = torch.nn.Sequential()
        self.expected_K_params += 2
        self.in_conv.add_module(
            "conv_in",
            torch.nn.Conv2d(self.input_shape[1], 128, kernel_size=(3, 3), padding=1),
        )

        self.block_a = torch.nn.Sequential()
        self.expected_C_params += 1  # only conv_a0 will be permuted along C
        self.expected_K_params += 2  # only conv_a1 will be permuted along K
        self.block_a.add_module("conv_a0", torch.nn.Conv2d(128, 3, kernel_size=(1, 1)))
        self.block_a.add_module("conv_a1", torch.nn.Conv2d(3, 128, kernel_size=(3, 3), padding=1))

        self.block_b = torch.nn.Sequential()
        self.expected_C_params += 2  # even though conv_a0 will not be sparse (only 3 output channels), conv_b0 can still be permuted along C
        self.expected_K_params += 4
        self.block_b.add_module("conv_b0", torch.nn.Conv2d(128, 128, kernel_size=(3, 3), padding=1))
        self.block_b.add_module("conv_b1", torch.nn.Conv2d(128, 128, kernel_size=(1, 1)))

        self.out_conv = torch.nn.Sequential()
        self.expected_C_params += 1
        self.out_conv.add_module("conv_out", torch.nn.Conv2d(128, 8, kernel_size=(1, 1)))

    def forward(self, input: torch.Tensor):
        step0 = self.in_conv(input)
        step1 = self.block_a(step0) + self.block_b(step0)
        return self.out_conv(step1)


class test_concat(torch.nn.Module):
    """If concats are along the channel dimension (dim1 of NCHW), downstream layers can still be permuted despite C!=parentK"""

    def __init__(
        self,
        ratio=1,  # ratio between # channels in either path to be concatenated
        dim=1,  # dimension to concatenate, K by default
        depth=1,  # number of concats to stack
    ):
        super().__init__()
        assert dim == 1 or ratio == 1, (
            "can't concat along dimensions other than K if K's don't match"
        )
        self.dim = dim
        self.depth = depth
        self.input_shape = [4, 16, 7, 7]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.in_conv = torch.nn.Sequential()
        self.expected_K_params += 2
        self.in_conv.add_module(
            "conv_in", torch.nn.Conv2d(self.input_shape[1], 64, kernel_size=(1, 1))
        )

        self.left_paths = torch.nn.ModuleList([torch.nn.Conv2d(64, 64, kernel_size=(1, 1))])
        self.expected_C_params += 1
        self.expected_K_params += 2

        in_C = 64
        out_C = 64
        for d in range(1, depth, 1):
            self.expected_C_params += 1
            self.expected_K_params += 2
            if dim == 1:
                out_C += 64
            self.left_paths.append(torch.nn.Conv2d(in_C + 64, out_C, kernel_size=(1, 1)))
            if dim == 1:
                in_C += 64

        self.right_path = torch.nn.Sequential()
        self.expected_C_params += 1
        self.expected_K_params += 2
        self.right_path.add_module("conv_b", torch.nn.Conv2d(64, 64 * ratio, kernel_size=(1, 1)))

        self.out_conv = torch.nn.Sequential()
        self.expected_C_params += 1
        if dim == 1:
            out_C += 64 * ratio
        self.out_conv.add_module("conv_out", torch.nn.Conv2d(out_C, 16, kernel_size=(1, 1)))

    def forward(self, input: torch.Tensor):
        step0 = self.in_conv(input)
        step1 = step0
        for d, layer in enumerate(self.left_paths):
            if d == 0:
                step1 = layer(step1)
            else:
                step1 = layer(torch.cat([step1, step0], 1))

        step2 = torch.cat([step1, self.right_path(step0)], self.dim)
        return self.out_conv(step2)


class test_flatten_op(torch.nn.Module):
    """flatten ops may change the effective channel count, typically by collapsing N,C,H,W into N,C*H*W before a classifier"""

    def __init__(
        self,
        change_dims=True,
    ):
        super().__init__()
        self.change_dims = change_dims
        self.input_shape = [4, 16, 3, 3]
        self.expected_C_params = 0
        self.expected_K_params = 0

        if not self.change_dims:
            self.input_shape = [4, 16, 1, 1]
            self.expected_C_params = 1
            self.expected_K_params = 2

        self.flattened_C = self.input_shape[2] * self.input_shape[3] * 64

        self.in_conv = torch.nn.Conv2d(self.input_shape[1], 64, kernel_size=(1, 1))
        self.out_gemm = torch.nn.Linear(self.flattened_C, 16)

    def forward(self, input: torch.Tensor):
        step0 = self.in_conv(input)
        step1 = torch.flatten(step0, start_dim=1)
        return self.out_gemm(step1)


class test_flatten_module(torch.nn.Module):
    """flatten modules may change the effective channel count, typically by collapsing N,C,H,W into N,C*H*W before a classifier"""

    def __init__(
        self,
        change_dims=True,
    ):
        super().__init__()
        self.change_dims = change_dims
        self.input_shape = [4, 16, 3, 3]
        self.expected_C_params = 0
        self.expected_K_params = 0

        if not self.change_dims:
            self.input_shape = [4, 16, 1, 1]
            self.expected_C_params = 1
            self.expected_K_params = 2

        self.flattened_C = self.input_shape[2] * self.input_shape[3] * 64
        self.stack = torch.nn.Sequential()
        self.stack.add_module(
            "conv_in", torch.nn.Conv2d(self.input_shape[1], 64, kernel_size=(1, 1))
        )
        self.stack.add_module("flatten", torch.nn.Flatten(1))
        self.stack.add_module("gemm_out", torch.nn.Linear(self.flattened_C, 16))

    def forward(self, input: torch.Tensor):
        return self.stack(input)


class test_trace_failure(torch.nn.Module):
    """make sure tracing failures are handled gracefully"""

    def __init__(self):
        super().__init__()
        self.input_shape = [4, 16, 1, 1]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.in_conv = torch.nn.Conv2d(self.input_shape[1], 64, kernel_size=(1, 1))
        self.out_conv = torch.nn.Conv2d(64, 16, kernel_size=(1, 1))

    def forward(self, input: torch.Tensor):
        step0 = self.in_conv(input)
        # NCHW = 4,64,1,1
        channels = step0.size(1)
        channel_offset = torch.arange(channels, dtype=torch.long, device=step0.device)
        channel_offset = channel_offset.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(step0)
        step0.add_(channel_offset)
        return self.out_conv(step0)


class already_sparse(torch.nn.Module):
    """if weights are already sparse, permutations should be skipped"""

    def __init__(self):
        super().__init__()
        self.input_shape = [4, 16, 3, 3]
        self.expected_C_params = 0
        self.expected_K_params = 0

        self.in_conv = torch.nn.Conv2d(self.input_shape[1], 64, kernel_size=(1, 1))
        self.out_conv = torch.nn.Conv2d(64, 16, kernel_size=(1, 1))

        # apply 2:4 to the output weights, it will not require a permutation
        out_weights = torch.ones_like(self.out_conv.weight)
        out_weights[:, 0::2, ...] = 0
        assert torch.sum(out_weights) == torch.numel(out_weights) / 2
        self.out_conv.weight.data.copy_(out_weights)

    def forward(self, input: torch.Tensor):
        step0 = self.in_conv(input)
        return self.out_conv(step0)


def test_model(model, tag, verbosity=0, save_onnx=False):
    Permutation.set_identical_seed()
    x = torch.rand(model.input_shape)
    if save_onnx:
        torch.onnx.export(model, x, f"{tag}.onnx", verbose=False)

    base_out = model(x)

    sparse_parameters = []
    all_parameters = []

    module_to_params = {}
    module_to_params[torch.nn.MultiheadAttention] = (
        "q_proj_weight",
        "k_proj_weight",
        "v_proj_weight",
        "in_proj_weight",
    )

    for module_name, module in model.named_modules():
        module_type_str = str(type(module)).split("'")[1]
        if module_type_str == "torch.nn.modules.container.Sequential" or module_type_str.startswith(
            "torchvision.models"
        ):
            # filter out the 'torch.nn.modules.container.Sequential' type and the whole model, like 'torchvision.models.vgg.VGG'
            continue
        for p_name, p in module.named_parameters():
            all_parameters.append((module_name, module, p_name, p))

            if isinstance(
                module,
                (
                    torch.nn.Linear,
                    torch.nn.Conv1d,
                    torch.nn.Conv2d,
                    torch.nn.Conv3d,
                    torch.nn.MultiheadAttention,
                    torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
                ),
            ):
                allowed_names = ("weight",)
                if type(module) in module_to_params.keys():
                    allowed_names = module_to_params[type(module)]

                if p_name not in allowed_names:
                    continue

                if len(p.size()) >= 2 and (p.size()[0] % 8) == 0 and (p.size()[1] % 16) == 0:
                    mask = torch.ones_like(p).bool()
                    buffname = p_name.split(".")[-1]
                    module.register_buffer("__%s_mma_mask" % buffname, mask)
                    sparse_parameters.append((module_name, module, p_name, p, mask, None))

        if module_type_str == "torch.nn.modules.batchnorm.BatchNorm2d":
            # need to get the running_mean and running_var from model.state_dict(), as they are not the learnable parameters
            module_mean_name = module_name + ".running_mean"
            module_var_name = module_name + ".running_var"
            for param_key in model.state_dict():
                if module_mean_name == param_key or module_var_name == param_key:
                    all_parameters.append(
                        (
                            module_name,
                            module,
                            param_key.split(".")[-1],
                            model.state_dict()[param_key],
                        )
                    )

    if verbosity > 1:
        sparse_param_names = [
            module_name + ":" + p_name
            for (module_name, module, p_name, p, mask, pruned) in sparse_parameters
        ]
        all_param_names = [
            module_name + ":" + p_name for (module_name, module, p_name, p) in all_parameters
        ]
        print(
            f"\tSparse parameter names: {sparse_param_names}\n\tAll parameter names: {all_param_names}"
        )

    Permutation.set_permutation_params_from_asp(model, sparse_parameters, all_parameters, verbosity)
    Permutation.permute_model(model)

    C_params, K_params, missed_dims = Permutation.get_permutation_stats()

    success = True
    fail_str = ""
    succ_str = ""
    if len(C_params) != model.expected_C_params:
        success = False
        fail_str = (
            fail_str + f"\n\tC expected {model.expected_C_params}, got {len(C_params)} ({C_params})"
        )
    elif verbosity > 0:
        succ_str = (
            succ_str + f"\n\tC expected {model.expected_C_params}, got {len(C_params)} ({C_params})"
        )

    if len(K_params) != model.expected_K_params:
        success = False
        fail_str = (
            fail_str + f"\n\tK expected {model.expected_K_params}, got {len(K_params)} ({K_params})"
        )
    elif verbosity > 0:
        succ_str = (
            succ_str + f"\n\tK expected {model.expected_K_params}, got {len(K_params)} ({K_params})"
        )

    if len(missed_dims) != 0:
        success = False
        fail_str = (
            fail_str
            + f"\n\tMissed permutations along {len(missed_dims)} dimensions ({missed_dims})"
        )

    perm_out = model(x)

    atol = 1e-5
    rtol = 1e-4
    outs_match = torch.allclose(base_out.data, perm_out.data, atol=atol, rtol=rtol)
    if not outs_match:
        fail_str = fail_str + f"\n\tOutputs matched: {outs_match}"
        if success:
            diffs = base_out - perm_out
            diff_locs = (diffs >= atol).nonzero(as_tuple=True)
            fail_str = fail_str + f"\n{diff_locs}\n{diffs[diff_locs]}"
        success = False

    if success:
        print(f"{tag}: Success\t{succ_str}")
    else:
        print(f"{tag}: FAIL\t{fail_str}")

    return success


def main():
    global_success = True

    global_success &= test_model(simple_convs(2, 16), "smoke test")
    global_success &= test_model(simple_convs(5, 64), "simple 5 64")
    global_success &= test_model(simple_convs(10, 32), "simple 10 32")
    # normalization
    for norm in [
        "BatchNorm2d",
        "LazyBatchNorm2d",
        "InstanceNorm2d",
        "LazyInstanceNorm2d",
        "LayerNorm3",
        "LocalResponseNorm",
    ]:
        global_success &= test_model(simple_convs(4, 128, norm), norm)
    # disallowed normalization
    for norm in ["GroupNorm"]:
        global_success &= test_model(simple_convs(4, 128, norm), norm)

    global_success &= test_model(conv_1d(), "conv1d")
    global_success &= test_model(conv_1d(with_2d=True), "conv1d and conv2d")
    global_success &= test_model(grouped_convs(), "grouped convs")
    global_success &= test_model(simple_forks_joins(), "forks and joins")
    global_success &= test_model(different_grouped_convs(), "GCD")
    global_success &= test_model(siblings_poison(), "sibling poison")
    global_success &= test_model(coparent_poison(), "coparent poison")
    global_success &= test_model(depthwise_child_is_sibling(), "dw child is sibling")
    global_success &= test_model(module_attribute(complexity=0), "single attribute")
    global_success &= test_model(module_attribute(complexity=1), "single attribute thrice")
    global_success &= test_model(MHA_test(hidden_dim=256, seq_len=64, num_heads=16), "stacked MHA")
    global_success &= test_model(one_sparse_sibling(), "one sparse sibling")
    global_success &= test_model(test_concat(), "simple concat")  # concat along K
    global_success &= test_model(test_concat(dim=0), "concat dim0")  # concat along C
    global_success &= test_model(
        test_concat(ratio=2), "concat ratio2"
    )  # concat along K with different K values
    global_success &= test_model(
        test_concat(depth=2), "concat depth2"
    )  # concat along K multiple times
    global_success &= test_model(test_concat(depth=3), "concat depth3")
    global_success &= test_model(test_concat(ratio=3, depth=4), "concat ratio3 depth4")
    global_success &= test_model(test_concat(dim=0, depth=3), "concat dim0 depth3")
    global_success &= test_model(test_flatten_op(), "flatten op")
    global_success &= test_model(test_flatten_op(change_dims=False), "useless flatten op")
    global_success &= test_model(test_flatten_module(), "flatten module")
    global_success &= test_model(test_flatten_module(change_dims=False), "useless flatten module")
    global_success &= test_model(test_trace_failure(), "trace failure")
    global_success &= test_model(already_sparse(), "skip already sparse")
    global_success &= test_model(square_attribute(), "square attributes")

    if global_success:
        print("All tests completed successfully.")
    else:
        print("There was at least one failure.")


if __name__ == "__main__":
    main()


================================================
FILE: apex/contrib/sparsity/test/toy_problem.py
================================================
from collections import OrderedDict

import torch
from apex.optimizers import FusedAdam
from apex.contrib.sparsity import ASP


def build_model(args):
    od = OrderedDict()
    for i in range(args.num_layers):
        if i == 0:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.input_features, args.hidden_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.hidden_features]
            )
        elif i == args.num_layers - 1:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.hidden_features, args.output_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.output_features]
            )
        else:
            od["linear_layer_%d" % (i + 1)] = torch.nn.Linear(
                args.hidden_features, args.hidden_features
            )
            od["layer_norm_%d" % (i + 1)] = torch.nn.LayerNorm(
                [args.batch_size, args.hidden_features]
            )
    return torch.nn.Sequential(od)


def train_step(args, model, optimizer, input_batch, target_batch, step):
    predicted_target = model(input_batch)
    loss = ((predicted_target - target_batch) ** 2).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    step = step + 1
    # print("Step %d :: loss=%e" % (step, loss.item()))
    return step


def train_loop(args, model, optimizer, step, num_steps):
    for i in range(num_steps):
        input_batch = torch.randn([args.batch_size, args.input_features]).cuda()
        target_batch = torch.randn([args.batch_size, args.output_features]).cuda()
        step = train_step(args, model, optimizer, input_batch, target_batch, step)
    return step


def main(args):
    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    # only prune linear layers, even though we also support conv1d, conv2d and conv3d
    ASP.init_model_for_pruning(
        model, "m4n2_1d", whitelist=[torch.nn.Linear], allow_recompute_mask=True
    )
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    # recompute sparse masks
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)

    # turn off sparsity
    print("SPARSE :: ", one_ll)
    ASP.restore_pruned_weights()

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps_2)


if __name__ == "__main__":

    class Args:
        batch_size = 32
        input_features = 16
        output_features = 8
        hidden_features = 40
        num_layers = 4
        num_dense_steps = 2000
        num_sparse_steps = 3000
        num_sparse_steps_2 = 1000
        num_dense_steps_2 = 1500

    args = Args()

    main(args)


================================================
FILE: apex/contrib/test/__init__.py
================================================


================================================
FILE: apex/contrib/test/bottleneck/__init__.py
================================================


================================================
FILE: apex/contrib/test/bottleneck/test_bottleneck_module.py
================================================
import unittest

import torch
from torch.testing._internal import common_utils

from apex.distributed_testing.distributed_test_base import NcclDistributedTestBase

SKIP_TEST = None
try:
    from apex.contrib.bottleneck import Bottleneck, SpatialBottleneck
    from apex.contrib.bottleneck import HaloExchangerPeer
    from apex.contrib.peer_memory import PeerMemoryPool
except ImportError as e:
    SKIP_TEST = e


def ground_truth_bottleneck(C, dtype, explicit_nhwc):
    bottleneck = Bottleneck(C, C, C, use_cudnn=True, explicit_nhwc=explicit_nhwc)
    bottleneck.to(dtype=dtype, device="cuda")
    for p in bottleneck.parameters():
        torch.distributed.broadcast(p, 0)
    for b in bottleneck.buffers():
        torch.distributed.broadcast(b, 0)
    return bottleneck


def print_bottleneck_p_and_b(bottleneck):
    with torch.no_grad():
        for n, p in bottleneck.named_parameters():
            print("%s :: %s" % (n, str(p.norm(p=2, dtype=torch.float32))))
        for n, p in bottleneck.named_buffers():
            print("%s :: %s" % (n, str(p.norm(p=2, dtype=torch.float32))))


def has_nan(x):
    if isinstance(x, list) or isinstance(x, tuple):
        for xx in x:
            if torch.any(torch.isnan(xx)):
                return True
        return False
    elif isinstance(x, dict):
        for k, v in x.items():
            if torch.any(torch.isnan(v)):
                return True
    else:
        return torch.any(torch.isnan(x))


def rel_diff_t(xx1, xx2):
    return (
        (xx1 - xx2).norm(p=2, dtype=torch.float32) / (xx1 + xx2).norm(p=2, dtype=torch.float32)
    ).item()


def rel_diff(x1, x2):
    if isinstance(x1, list) or isinstance(x1, tuple):
        return [rel_diff_t(xx1, xx2) for xx1, xx2 in zip(x1, x2)]
    elif isinstance(x1, dict):
        return [rel_diff_t(xx1, xx2) for (k1, xx1), (k2, xx2) in zip(x1.items(), x2.items())]
    else:
        return rel_diff_t(x1, x2)


def graph_it(bottleneck, x):
    print("Graphing")
    with torch.no_grad():
        x = x.clone()
        x.grad = None
        x.requires_grad = True
    return torch.cuda.make_graphed_callables(bottleneck, (x,))


def clone_inputs(bottleneck, x, dy=None):
    with torch.no_grad():
        x = x.clone()
        x.grad = None
        x.requires_grad = True
        if dy is None:
            y = bottleneck(x)
            dy = torch.randn_like(y) / 1e2
            torch.distributed.broadcast(dy, 0)
    return x, dy


def fprop_and_bprop(bottleneck, x, dy):
    y = bottleneck(x)
    y.backward(dy)
    dgrad = x.grad.detach()
    wgrad = {}
    for n, p in bottleneck.named_parameters():
        wgrad[n] = p.grad.detach()
    return x, y, dy, dgrad, wgrad


def ground_truth(N, C, H, W, dtype, memory_format, bottleneck):
    if memory_format == 1:
        # 1 -> explicit nhwc
        explicit_nhwc = True
        with torch.no_grad():
            x = torch.randn([N, H, W, C], dtype=dtype, device="cuda")
            torch.distributed.broadcast(x, 0)
            x, dy = clone_inputs(bottleneck, x)
        return fprop_and_bprop(bottleneck, x, dy)
    else:
        # 2 -> native nhwc
        # 3 -> nchw
        explicit_nhwc = False
        assert False, "Not implemented yet"


def print_ground_truth(gt):
    x, y, dy, dgrad, wgrad = gt
    if has_nan(y) or has_nan(dgrad) or has_nan(wgrad):
        print("Error! Ground truth has NAN")
    else:
        print("Ok! No NAN found in ground truth")


def apply_to_different_bottleneck(gt, bottleneck):
    with torch.no_grad():
        x, _, dy, _, _ = gt
        x, dy = clone_inputs(bottleneck, x, dy)
    return fprop_and_bprop(bottleneck, x, dy)


def compare_single_field(results, f1, f2, l0, l1, l2):
    if has_nan(f1) and has_nan(f2):
        results[l0] = "both NAN"
    elif has_nan(f1):
        results[l0] = "%s.%s NAN" % (l1, l0)
    elif has_nan(f2):
        results[l0] = "%s.%s NAN" % (l2, l0)
    else:
        results[l0] = "%s" % (str(rel_diff(f1, f2)))


def compare(gt, bt):
    x1, y1, dy1, dgrad1, wgrad1 = gt
    x2, y2, dy2, dgrad2, wgrad2 = bt
    results = {}
    compare_single_field(results, y1, y2, "y", "gt", "bt")
    compare_single_field(results, dy1, dy2, "dy", "gt", "bt")
    compare_single_field(results, dgrad1, dgrad2, "dgrad", "gt", "bt")
    compare_single_field(results, wgrad1, wgrad2, "wgrad", "gt", "bt")
    for i in range(torch.distributed.get_world_size()):
        if i == torch.distributed.get_rank():
            print(i, results)
        torch.distributed.barrier()


def spatial_parallel_bottleneck(C, dtype, explicit_nhwc, gt_bottleneck, spatial_parallel_args):
    spatial_bottleneck = SpatialBottleneck(
        C,
        C,
        C,
        use_cudnn=True,
        explicit_nhwc=explicit_nhwc,
        spatial_parallel_args=spatial_parallel_args,
    )
    spatial_bottleneck.to(dtype=dtype, device="cuda")
    with torch.no_grad():
        sp = {}
        for n, p in spatial_bottleneck.named_parameters():
            sp[n] = p
        for n, p in gt_bottleneck.named_parameters():
            sp[n].copy_(p)
        sb = {}
        for n, b in spatial_bottleneck.named_buffers():
            sb[n] = b
        for n, b in gt_bottleneck.named_buffers():
            sb[n].copy_(b)
    return spatial_bottleneck


def n_way_spatial(halex, gt_bottleneck, gt, explicit_nhwc, world_size, rank, fp32_reduce=False):
    assert explicit_nhwc, "Only tested for explicit nhwc"

    x, _, dy, _, _ = gt
    N, H, W, C = list(x.shape)  # Tensor is already shaped properly for n-way parallel
    dtype = x.dtype

    spatial_group_size = world_size
    spatial_group_rank = rank
    spatial_communicator = None
    spatial_halo_exchanger = halex
    spatial_method = 1  # 1 -> overlap halo and main conv, 2 -> wait for halo, conv on padded x
    use_delay_kernel = False
    spatial_parallel_args = (
        spatial_group_size,
        spatial_group_rank,
        spatial_communicator,
        spatial_halo_exchanger,
        spatial_method,
        use_delay_kernel,
    )
    spatial_bottleneck = spatial_parallel_bottleneck(
        C, dtype, explicit_nhwc, gt_bottleneck, spatial_parallel_args
    )

    with torch.no_grad():
        Hs = H // spatial_group_size
        xs = x[:, spatial_group_rank * Hs : (spatial_group_rank + 1) * Hs, :, :].clone()
        dys = dy[:, spatial_group_rank * Hs : (spatial_group_rank + 1) * Hs, :, :].clone()
        xs.requires_grad = True

    spatial_bottleneck = graph_it(spatial_bottleneck, xs)
    _, y, _, dgrad, wgrad = fprop_and_bprop(spatial_bottleneck, xs, dys)

    # gather output pieces
    for n, p in wgrad.items():
        if fp32_reduce:
            p32 = p.float()
            torch.distributed.all_reduce(p32)
            p.copy_(p32.half())
        else:
            torch.distributed.all_reduce(p)
    ys = [torch.empty_like(y) for _ in range(spatial_group_size)]
    torch.distributed.all_gather(ys, y)
    y = torch.cat(ys, dim=1)
    dgrads = [torch.empty_like(dgrad) for _ in range(spatial_group_size)]
    torch.distributed.all_gather(dgrads, dgrad)
    dgrad = torch.cat(dgrads, dim=1)
    return x, y, dy, dgrad, wgrad


def main():
    torch.use_deterministic_algorithms(True)

    torch.distributed.init_process_group("nccl")
    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()
    torch.cuda.set_device(rank)

    explicit_nhwc = True

    dtype = torch.float16
    N, C, H, W = 1, 64, 200, 336
    Hs = ((H + 8 * world_size - 1) // (8 * world_size)) * 8
    H = Hs * world_size
    gt_bottleneck = ground_truth_bottleneck(C, dtype, explicit_nhwc)
    gt = ground_truth(N, C, H, W, dtype, 1, gt_bottleneck)

    # verify that spatial bottleneck with group_size 1 produces same results as ground truth bottleneck
    spatial_bottleneck = spatial_parallel_bottleneck(C, dtype, explicit_nhwc, gt_bottleneck, None)
    bt = apply_to_different_bottleneck(gt, spatial_bottleneck)
    compare(gt, bt)
    # print_bottleneck_p_and_b(gt_bottleneck)
    # print_bottleneck_p_and_b(spatial_bottleneck)

    group_size = world_size
    group = rank // group_size
    ranks = [group * group_size + i for i in range(group_size)]
    rank_in_group = rank % group_size

    spatial_group_size = world_size
    spatial_communicator = None

    peer_pool = PeerMemoryPool(0, 64 * 1024 * 1024, ranks)

    # class HaloExchangerNoComm(HaloExchanger):
    #    def __init__(self, ranks, rank_in_group):
    # class HaloExchangerAllGather(HaloExchanger):
    #    def __init__(self, ranks, rank_in_group, comm):
    # class HaloExchangerSendRecv(HaloExchanger):
    #    def __init__(self, ranks, rank_in_group):
    # class HaloExchangerPeer(HaloExchanger):
    #    def __init__(self, ranks, rank_in_group, peer_pool, explicit_nhwc, numSM=1):

    # halex = HaloExchangerAllGather(ranks, rank_in_group)
    # halex = HaloExchangerSendRecv(ranks, rank_in_group)

    halex = HaloExchangerPeer(ranks, rank_in_group, peer_pool, explicit_nhwc, numSM=0)
    # print("halex.signals = %s" % (str(halex.signals)))
    # Make sure peer memory halo exchanger has finished initializing flags on all ranks before proceeding
    # torch.cuda.synchronize()
    # torch.distributed.barrier()

    bt2 = n_way_spatial(halex, gt_bottleneck, gt, explicit_nhwc, world_size, rank, fp32_reduce=True)
    compare(gt, bt2)


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class TestBottleneck(NcclDistributedTestBase):
    # PyTorch's float16 tolerance values, see https://pytorch.org/docs/stable/testing.html#torch.testing.assert_close
    fp16_tolerance = {"atol": 1e-5, "rtol": 1e-3}

    @property
    def world_size(self) -> int:
        return min(torch.cuda.device_count(), 2)

    def test_bottleneck_without_peer_memory(self) -> None:
        explicit_nhwc: bool = True
        dtype: torch.dtype = torch.float16
        N, C, H, W = 1, 64, 200, 336
        Hs = ((H + 8 * self.world_size - 1) // (8 * self.world_size)) * 8
        H = Hs * self.world_size

        gt_bottleneck = ground_truth_bottleneck(C, dtype, explicit_nhwc)
        gt = ground_truth(N, C, H, W, dtype, 1, gt_bottleneck)

        spatial_bottleneck = spatial_parallel_bottleneck(
            C, dtype, explicit_nhwc, gt_bottleneck, None
        )
        bt = apply_to_different_bottleneck(gt, spatial_bottleneck)
        self.assertEqual(gt, bt, **self.fp16_tolerance)

    @unittest.skipIf(
        torch.cuda.device_count() < 2 or not torch.cuda.can_device_access_peer(0, 1),
        "peer memory access not supported",
    )
    def test_bottleneck_with_peer_memory(self) -> None:
        explicit_nhwc: bool = True
        dtype: torch.dtype = torch.float16
        N, C, H, W = 1, 64, 200, 336
        Hs = ((H + 8 * self.world_size - 1) // (8 * self.world_size)) * 8
        H = Hs * self.world_size

        gt_bottleneck = ground_truth_bottleneck(C, dtype, explicit_nhwc)
        gt = ground_truth(N, C, H, W, dtype, 1, gt_bottleneck)

        group = self.rank // self.world_size
        ranks = [group * self.world_size + i for i in range(self.world_size)]
        rank_in_group = self.rank % self.world_size

        spatial_group_size, spatial_communicator = self.world_size, None
        peer_pool = PeerMemoryPool(0, 64 * 1024 * 1024, ranks)
        halo_exchanger_peer = HaloExchangerPeer(
            ranks, rank_in_group, peer_pool, explicit_nhwc, numSM=0
        )
        bt2 = n_way_spatial(
            halo_exchanger_peer,
            gt_bottleneck,
            gt,
            explicit_nhwc,
            self.world_size,
            self.rank,
            fp32_reduce=True,
        )
        # TODO(crcrpar): Investigate the implementation to mitigate the numerical errors.
        # NOTE(crcrpar): This assert often fails due to numerical errors.
        # self.assertEqual(gt, bt2, **self.fp16_tolerance)


if __name__ == "__main__":
    common_utils.run_tests()


================================================
FILE: apex/contrib/test/clip_grad/__init__.py
================================================


================================================
FILE: apex/contrib/test/clip_grad/test_clip_grad.py
================================================
import random
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.clip_grad import clip_grad_norm_
except ImportError as e:
    SKIP_TEST = e


def make_params(
    num_params,
    sizes=[1, 2, 3, 4, 5],
    num_dims=[1, 2, 3],
    dtypes=[torch.float32],
    devices=["cuda"],
    make_copy=False,
):
    """Construct parameters with random configurations"""

    # Construct parameters
    params = []
    for _ in range(num_params):
        dims = [random.choice(sizes) for _ in range(random.choice(num_dims))]
        dtype = random.choice(dtypes)
        device = random.choice(devices)
        p = torch.nn.Parameter(torch.randn(dims, dtype=dtype, device=device))
        p.grad = torch.randn_like(p)
        params.append(p)

    # Copy parameters if needed
    if make_copy:
        params_copy = []
        for p in params:
            p_copy = p.clone().detach()
            p_copy.grad = p.grad.clone().detach()
            params_copy.append(p_copy)
        return params, params_copy
    else:
        return params


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class ClipGradNormTest(unittest.TestCase):
    def setUp(self, seed=1234):
        super().setUp()
        random.seed(seed)
        torch.manual_seed(seed)

    def test_matches_pytorch(
        self,
        num_params=41,
        dtypes=[torch.float32, torch.float16, torch.float64],
        devices=["cuda", "cpu"],
        max_norm=0.54321,
        norm_type=2.0,
        rtol=1e-3,
        atol=1e-20,
    ):
        """Make sure PyTorch and Apex gradient clipping produce same results"""

        # Construct identical sets of parameters
        torch_params, apex_params = make_params(
            num_params,
            dtypes=dtypes,
            devices=devices,
            make_copy=True,
        )

        # Apply gradient clipping
        torch_norm = torch.nn.utils.clip_grad_norm_(
            torch_params,
            max_norm,
            norm_type=norm_type,
        )
        apex_norm = clip_grad_norm_(
            apex_params,
            max_norm,
            norm_type=norm_type,
        )

        # Make sure PyTorch and Apex get same results
        torch.testing.assert_close(
            apex_norm,
            torch_norm,
            rtol=rtol,
            atol=atol,
            check_dtype=False,
        )
        for torch_p, apex_p in zip(torch_params, apex_params):
            torch.testing.assert_close(
                apex_p,
                torch_p,
                rtol=0,
                atol=0,
            )  # Params should be unaffected
            torch.testing.assert_close(
                apex_p.grad,
                torch_p.grad,
                rtol=rtol,
                atol=atol,
            )

    def test_matches_pytorch_fp16(self):
        self.test_matches_pytorch(num_params=11, dtypes=[torch.float16])

    def test_matches_pytorch_fp32(self):
        self.test_matches_pytorch(dtypes=[torch.float32], rtol=1e-6)

    def test_matches_pytorch_fp64(self):
        self.test_matches_pytorch(dtypes=[torch.float64], rtol=1e-15)

    def test_matches_pytorch_cpu(self):
        self.test_matches_pytorch(devices=["cpu"])

    def test_matches_pytorch_infnorm(self):
        self.test_matches_pytorch(norm_type=float("inf"))

    def test_matches_pytorch_1norm(self):
        self.test_matches_pytorch(norm_type=1.0)

    def test_raises_on_mismatch(self):
        # Construct different sets of parameters
        torch_params, apex_params = make_params(7, make_copy=True)
        with torch.no_grad():
            torch_params[0].grad.view(-1)[0] = 1.23
            apex_params[0].grad.view(-1)[0] = 3.21

        # Apply gradient clipping
        torch_norm = torch.nn.utils.clip_grad_norm_(
            torch_params,
            0.54321,
        )
        apex_norm = clip_grad_norm_(
            apex_params,
            0.54321,
        )

        # Make sure PyTorch and Apex get different results
        self.assertRaises(
            AssertionError,
            torch.testing.assert_close,
            apex_norm,
            torch_norm,
            rtol=1e-3,
            atol=1e-20,
            check_dtype=False,
        )
        for torch_p, apex_p in zip(torch_params, apex_params):
            self.assertRaises(
                AssertionError,
                torch.testing.assert_close,
                apex_p.grad,
                torch_p.grad,
                rtol=1e-3,
                atol=1e-20,
            )

    def test_raises_on_nan(self):
        params = make_params(5, num_dims=[1])
        params[2].grad[-1] = float("NaN")
        self.assertRaises(RuntimeError, clip_grad_norm_, params, 1.0, error_if_nonfinite=True)

    def test_raises_on_inf(self):
        params = make_params(5, num_dims=[1])
        params[2].grad[-1] = float("inf")
        self.assertRaises(RuntimeError, clip_grad_norm_, params, 1.0, error_if_nonfinite=True)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/conv_bias_relu/__init__.py
================================================


================================================
FILE: apex/contrib/test/conv_bias_relu/test_conv_bias_relu.py
================================================
import copy
import math
import random
import unittest

import torch
import torch.nn.functional as F

HAS_CONV_BIAS_RELU = None
try:
    from apex.contrib.conv_bias_relu import (
        ConvBiasReLU,
        ConvBias,
        ConvBiasMaskReLU,
        ConvFrozenScaleBiasReLU,
    )
except ImportError:
    HAS_CONV_BIAS_RELU = False
else:
    HAS_CONV_BIAS_RELU = True


@unittest.skipIf(not HAS_CONV_BIAS_RELU, "`apex.contrib.conv_bias_relu` is not found.")
class FusedDenseTest(unittest.TestCase):
    def setUp(self, seed=0):
        super().setUp()
        torch.manual_seed(seed)

        self.batch_size = random.randint(1, 64)
        self.in_channels = random.randint(1, 64) * 8
        self.out_channels = random.randint(1, 64) * 8
        self.in_height = self.in_width = random.randint(5, 100)
        self.conv_kernel_size = random.randint(1, 5)
        self.conv_pad = random.randint(0, int(self.conv_kernel_size / 2))
        self.conv_stride = random.randint(1, 5)
        self.conv_dilation = 1
        self.out_height = self.out_width = math.floor(
            (
                self.in_height
                + 2 * self.conv_pad
                - self.conv_dilation * (self.conv_kernel_size - 1)
                - 1
            )
            / self.conv_stride
            + 1
        )

        self.x = (
            torch.randint(
                low=-16,
                high=16,
                size=[self.batch_size, self.in_channels, self.in_height, self.in_width],
            )
            .cuda()
            .to(memory_format=torch.channels_last)
            .float()
        )
        self.x_ = self.x.clone()
        self.x.requires_grad_()
        self.x_.requires_grad_()

        self.mask = (
            torch.randn([self.batch_size, self.out_channels, self.out_height, self.out_width])
            .cuda()
            .to(memory_format=torch.channels_last)
        )
        self.mask = (self.mask > 0).to(torch.int8)
        self.mask_ = self.mask.clone()

        self.scale = torch.randn([1, self.out_channels, 1, 1]).half().cuda()
        self.scale_ = self.scale.clone()
        self.bias = torch.randn([1, self.out_channels, 1, 1]).half().cuda()
        self.bias_ = self.bias.clone()

        self.conv1 = (
            torch.nn.Conv2d(
                self.in_channels,
                self.out_channels,
                self.conv_kernel_size,
                stride=self.conv_stride,
                padding=self.conv_pad,
            )
            .cuda()
            .to(memory_format=torch.channels_last)
        )
        self.conv1_ = copy.deepcopy(self.conv1)

        self.conv2 = (
            torch.nn.Conv2d(
                self.in_channels,
                self.out_channels,
                self.conv_kernel_size,
                stride=self.conv_stride,
                padding=self.conv_pad,
                bias=False,
            )
            .cuda()
            .to(memory_format=torch.channels_last)
        )
        self.conv2_ = copy.deepcopy(self.conv2)

        print()
        print(
            "> input=[{}, {}, {}, {}]".format(
                self.batch_size, self.in_channels, self.in_height, self.in_width
            )
        )
        print(
            "> kernel=[{}, {}, {}, {}], stride={}, pad={}".format(
                self.out_channels,
                self.in_channels,
                self.conv_kernel_size,
                self.conv_kernel_size,
                self.conv_stride,
                self.conv_pad,
            )
        )

    def test_conv_bias_relu(self):
        with torch.amp.autocast("cuda", dtype=torch.half):
            out = ConvBiasReLU(
                self.x,
                self.conv1.weight,
                self.conv1.bias.reshape(1, -1, 1, 1),
                self.conv_pad,
                self.conv_stride,
            )
            loss = (out.float() ** 2).sum() / out.numel()
        loss.backward()
        with torch.amp.autocast("cuda", dtype=torch.half):
            out_ = F.relu(self.conv1_(self.x_))
            loss_ = (out_**2).sum() / out_.numel()
        loss_.backward()

        torch.testing.assert_close(out_, out, atol=1e-3, rtol=1e-3, equal_nan=True)
        torch.testing.assert_close(
            self.conv1_.bias.grad,
            self.conv1.bias.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )
        torch.testing.assert_close(
            self.conv1_.weight.grad,
            self.conv1.weight.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )
        torch.testing.assert_close(self.x_.grad, self.x.grad, atol=1e-3, rtol=1e-3, equal_nan=True)

    def test_conv_bias(self):
        with torch.amp.autocast("cuda", dtype=torch.half):
            out = ConvBias(
                self.x,
                self.conv1.weight,
                self.conv1.bias.reshape(1, -1, 1, 1),
                self.conv_pad,
                self.conv_stride,
            )
            loss = (out.float() ** 2).sum() / out.numel()
        loss.backward()

        with torch.amp.autocast("cuda", dtype=torch.half):
            out_ = self.conv1_(self.x_)
            loss_ = (out_**2).sum() / out_.numel()
        loss_.backward()

        torch.testing.assert_close(out, out_, atol=1e-3, rtol=1e-3, equal_nan=True)
        torch.testing.assert_close(
            self.conv1_.bias.grad,
            self.conv1.bias.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )
        torch.testing.assert_close(
            self.conv1_.weight.grad,
            self.conv1.weight.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )
        torch.testing.assert_close(self.x_.grad, self.x.grad, atol=1e-3, rtol=1e-3, equal_nan=True)

    def test_conv_bias_mask_relu(self):
        with torch.amp.autocast("cuda", dtype=torch.half):
            out = ConvBiasMaskReLU(
                self.x,
                self.conv1.weight,
                self.conv1.bias.reshape(1, -1, 1, 1),
                self.mask,
                self.conv_pad,
                self.conv_stride,
            )
            loss = (out.float() ** 2).sum() / out.numel()
        loss.backward()
        with torch.amp.autocast("cuda", dtype=torch.half):
            out_ = F.relu(self.conv1_(self.x_) * self.mask_)
            loss_ = (out_**2).sum() / out_.numel()
        loss_.backward()

        torch.testing.assert_close(out, out_, atol=1e-3, rtol=1e-3, equal_nan=True)
        torch.testing.assert_close(
            self.conv1_.bias.grad,
            self.conv1.bias.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )
        torch.testing.assert_close(
            self.conv1_.weight.grad,
            self.conv1.weight.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )
        torch.testing.assert_close(self.x_.grad, self.x.grad, atol=1e-3, rtol=1e-3, equal_nan=True)

    def test_conv_frozen_scale_bias_relu(self):
        with torch.amp.autocast("cuda", dtype=torch.half):
            out = ConvFrozenScaleBiasReLU(
                self.x,
                self.conv2.weight,
                self.scale,
                self.bias,
                self.conv_pad,
                self.conv_stride,
            )
            loss = (out.float() ** 2).sum() / out.numel()
        loss.backward()
        with torch.amp.autocast("cuda", dtype=torch.half):
            out_ = F.relu(self.conv2_(self.x_) * self.scale_ + self.bias_)
            loss_ = (out_**2).sum() / out_.numel()
        loss_.backward()

        torch.testing.assert_close(out, out_, atol=2.5e-3, rtol=2.5e-3, equal_nan=True)
        torch.testing.assert_close(
            self.conv2_.weight.grad,
            self.conv2.weight.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )
        torch.testing.assert_close(self.x_.grad, self.x.grad, atol=1e-3, rtol=1e-3, equal_nan=True)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/cudnn_gbn/__init__.py
================================================


================================================
FILE: apex/contrib/test/cudnn_gbn/test_cudnn_gbn_with_two_gpus.py
================================================
import copy
import typing
import unittest

import torch
import torch.nn as nn
from torch.testing._internal import common_utils

SKIP_TEST = None
from apex.distributed_testing.distributed_test_base import NcclDistributedTestBase

try:
    from apex.contrib.cudnn_gbn import GroupBatchNorm2d as GBN
except ImportError as e:
    SKIP_TEST = e


# Usage: python /path/to/cudnn_gbn/test_gbn_with_two_gpus.py

input_shapes = [
    [1, 1024, 48, 72],
    [1, 128, 192, 288],
    [1, 128, 384, 576],
    [1, 1536, 48, 72],
    [1, 2048, 48, 72],
    [1, 256, 1, 1],
    [1, 256, 192, 288],
    [1, 256, 384, 576],
    [1, 256, 48, 72],
    [1, 256, 96, 144],
    [1, 32, 384, 576],
    [1, 48, 192, 288],
    [1, 64, 384, 576],
    [1, 728, 48, 72],
    [1, 728, 96, 144],
]


class BNModelRef(nn.Module):
    def __init__(self, num_features, num_layers=1000):
        super().__init__()
        self.fwd = nn.Sequential(
            *[
                nn.BatchNorm2d(
                    num_features,
                    eps=1e-05,
                    momentum=0.1,
                    affine=True,
                    track_running_stats=True,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, x):
        return self.fwd(x)


class BNModel(nn.Module):
    def __init__(self, num_features, num_layers=1000):
        super().__init__()
        self.fwd = nn.Sequential(
            *[
                GBN(
                    num_features,
                    group_size=2,
                    eps=1e-05,
                    momentum=0.1,
                    affine=True,
                    track_running_stats=True,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, x):
        return self.fwd(x)


def get_rand_tensors(global_shape, device):
    inp_t = torch.rand(global_shape, dtype=torch.float32, device=device).to(
        memory_format=torch.channels_last
    )
    weight = torch.rand(global_shape[1], dtype=torch.float32, device=device)
    bias = torch.rand(global_shape[1], dtype=torch.float32, device=device)
    _grad_out = torch.rand(global_shape, dtype=torch.float32, device=device).to(
        memory_format=torch.channels_last
    )
    return inp_t, weight, bias, _grad_out


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class TestCudnnGBN(NcclDistributedTestBase):
    def _prep(self):
        torch.cuda.manual_seed(333)
        torch.manual_seed(333)

    @property
    def world_size(self) -> int:
        return min(torch.cuda.device_count(), 2)

    @torch.backends.cudnn.flags(enabled=True, benchmark=True)
    def _test_cudnn_gbn(
        self,
        num_layers: int,
        shape: typing.List[int],
        *,
        memory_format: torch.memory_format = torch.channels_last,
    ) -> None:
        global_shape = copy.deepcopy(shape)
        global_shape[0] = self.world_size

        device = torch.device("cuda", self.rank)
        cudnn_gbn_model = BNModel(
            num_features=shape[1],
            num_layers=num_layers,
        ).to(device=device, memory_format=memory_format)
        ref_model = BNModelRef(
            num_features=shape[1],
            num_layers=num_layers,
        ).to(device=device, memory_format=memory_format)

        input, weight, bias, grad_out = get_rand_tensors(global_shape, device)
        with torch.no_grad():
            ref_model.fwd[0].weight.copy_(weight)
            ref_model.fwd[0].bias.copy_(bias)
            cudnn_gbn_model.fwd[0].weight.copy_(weight)
            cudnn_gbn_model.fwd[0].bias.copy_(bias)

            ref_input = input.clone().detach().requires_grad_()
            input = input[self.rank : self.rank + 1, ...].clone().detach().requires_grad_()

            ref_grad_out = grad_out.half().clone().detach()
            grad_out = grad_out[self.rank : self.rank + 1, ...].half().clone().detach()

        with torch.amp.autocast("cuda"):
            out = cudnn_gbn_model(input)
            ref_out = ref_model(ref_input.half())
        out.backward(grad_out)
        ref_out.backward(ref_grad_out)

        kwargs = {"rtol": 3.5e-3, "atol": 3e-2, "msg": f"shape: {shape}"}

        torch.testing.assert_close(ref_out[self.rank : self.rank + 1], out, **kwargs)
        torch.testing.assert_close(ref_input.grad[self.rank : self.rank + 1], input.grad, **kwargs)
        # compensating the averaging over processes done by DDP
        # in order to produce mathematically equivalent result
        # https://github.com/NVIDIA/apex/issues/134#issuecomment-458307368
        torch.testing.assert_close(
            ref_model.fwd[0].weight.grad / self.world_size,
            cudnn_gbn_model.fwd[0].weight.grad,
            **kwargs,
        )
        torch.testing.assert_close(
            ref_model.fwd[0].bias.grad / self.world_size,
            cudnn_gbn_model.fwd[0].bias.grad,
            **kwargs,
        )

    def test_cudnngbn(self):
        if self.world_size != 2:
            self.skipTest(f"This test is written for world_size of 2 but {self.world_size}")
        for shape in input_shapes:
            self._prep()
            self._test_cudnn_gbn(1, shape)


if __name__ == "__main__":
    common_utils.run_tests()


================================================
FILE: apex/contrib/test/fmha/__init__.py
================================================


================================================
FILE: apex/contrib/test/fmha/test_fmha.py
================================================
###############################################################################
# Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
###############################################################################

import math
import unittest

import torch
import numpy as np

SKIP_TEST = None
try:
    import fmhalib as mha
except ImportError as e:
    SKIP_TEST = e


def _get_device_properties(device=torch.device("cuda")):
    # type: (str or torch.device) -> Tuple[int, int]
    properties = torch.cuda.get_device_properties(device)
    return properties.major, properties.minor


def py_mha(qkv, amask, b, s, h, d):
    qkv = qkv.view(b, s, h, 3, d)
    q = qkv[:, :, :, 0, :].permute(0, 2, 1, 3)
    k = qkv[:, :, :, 1, :].permute(0, 2, 1, 3)
    v = qkv[:, :, :, 2, :].permute(0, 2, 1, 3)
    p = torch.matmul(q.float(), k.permute(0, 1, 3, 2).float())
    p_masked = p / math.sqrt(d) + (1.0 - amask) * -10000.0
    s = torch.softmax(p_masked, -1).to(qkv.dtype)
    ctx = torch.matmul(s, v)
    ctx = ctx.permute(0, 2, 1, 3).contiguous()

    ctx.retain_grad()

    return ctx


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
@unittest.skipIf(
    _get_device_properties() not in [(8, 0), (9, 0), (10, 0), (12, 0)],
    "FMHA only supports sm80",
)
class TestFMHA(unittest.TestCase):
    def run_test(self, s: int, b: int, zero_tensors: bool):
        print(f"Test s={s} b={b}, zero_tensors={zero_tensors}")

        torch.manual_seed(1234)
        torch.cuda.manual_seed(1234)

        dtype = torch.float16
        device = torch.device("cuda")

        h = 16
        d = 64

        slens = [s] * b
        a = torch.tensor(np.array([0] + slens), dtype=torch.int32)
        amask = torch.ones(b, h, s, s, dtype=dtype, device=device)
        seqlens = torch.tensor(slens, dtype=torch.int32, device=device)
        cu_seqlens = torch.cumsum(a, 0).to(dtype=torch.int32, device=device)
        total = cu_seqlens[-1].item()

        qkv = torch.randn((b, s, h, 3, d), device=device, dtype=dtype)

        qkv_vs = qkv.permute(0, 1, 3, 2, 4).contiguous().view(b * s, 3, h, d)

        qkv.requires_grad = True

        if b < 4:
            ctx, S_ = mha.fwd(qkv_vs, cu_seqlens, 0.0, s, True, True, zero_tensors, None)
        else:
            ctx, S_ = mha.fwd(qkv_vs, cu_seqlens, 0.0, s, True, False, zero_tensors, None)
        ctx = ctx.view(b, s, h, d)

        ctx_ref = py_mha(qkv, amask, b, s, h, d)
        torch.testing.assert_close(ctx_ref.float(), ctx.float(), atol=1e-3, rtol=1e-5)

        labels = torch.randn_like(ctx_ref)
        diff = ctx_ref - labels
        l = (diff * diff).sum() / b
        l.backward()

        dw = ctx_ref.grad.permute(0, 2, 1, 3)

        dw2 = dw.permute(0, 2, 1, 3).clone().detach().contiguous()

        if b < 4:
            dqkv2, _, _ = mha.bwd_nl(dw2, qkv_vs, S_, cu_seqlens, 0.0, s, zero_tensors)
        else:
            dqkv2, _ = mha.bwd(dw2, qkv_vs, S_, cu_seqlens, 0.0, s, zero_tensors)

        dqkv2 = dqkv2.permute(0, 2, 1, 3).view(b, s, h, 3, d)

        torch.testing.assert_close(qkv.grad.float(), dqkv2.float(), atol=1e-3, rtol=1e-5)

    def test_128(self):
        self.run_test(128, 32, False)
        self.run_test(128, 32, True)
        self.run_test(128, 56, False)
        self.run_test(128, 56, True)

    def test_256(self):
        self.run_test(256, 32, False)
        self.run_test(256, 32, True)
        self.run_test(256, 56, False)
        self.run_test(256, 56, True)

    def test_384(self):
        self.run_test(384, 32, False)
        self.run_test(384, 32, True)
        self.run_test(384, 56, False)
        self.run_test(384, 56, True)

    def test_512(self):
        self.run_test(512, 32, False)
        self.run_test(512, 32, True)
        self.run_test(512, 56, False)
        self.run_test(512, 56, True)
        self.run_test(512, 2, False)
        self.run_test(512, 2, True)
        self.run_test(512, 3, False)
        self.run_test(512, 3, True)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/focal_loss/__init__.py
================================================


================================================
FILE: apex/contrib/test/focal_loss/test_focal_loss.py
================================================
import unittest

import torch
import torch.nn.functional as F

reference_available = True
try:
    from torchvision.ops.focal_loss import sigmoid_focal_loss
except ImportError:
    reference_available = False

SKIP_TEST = None
try:
    from apex.contrib.focal_loss import focal_loss
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
@unittest.skipIf(
    not reference_available,
    "Reference implementation `torchvision.ops.focal_loss.sigmoid_focal_loss` is not available.",
)
class FocalLossTest(unittest.TestCase):
    N_SAMPLES = 12
    N_CLASSES = 8
    ALPHA = 0.24
    GAMMA = 2.0
    REDUCTION = "sum"

    def test_focal_loss(self) -> None:
        if not reference_available:
            self.skipTest(
                "This test needs `torchvision` for `torchvision.ops.focal_loss.sigmoid_focal_loss`."
            )
        else:
            x = torch.randn(FocalLossTest.N_SAMPLES, FocalLossTest.N_CLASSES).cuda()
            with torch.no_grad():
                x_expected = x.clone()
                x_actual = x.clone()
            x_expected.requires_grad_()
            x_actual.requires_grad_()

            classes = torch.randint(0, FocalLossTest.N_CLASSES, (FocalLossTest.N_SAMPLES,)).cuda()
            with torch.no_grad():
                y = F.one_hot(classes, FocalLossTest.N_CLASSES).float()

            expected = sigmoid_focal_loss(
                x_expected,
                y,
                alpha=FocalLossTest.ALPHA,
                gamma=FocalLossTest.GAMMA,
                reduction=FocalLossTest.REDUCTION,
            )

            actual = sum(
                [
                    focal_loss.FocalLoss.apply(
                        x_actual[i : i + 1],
                        classes[i : i + 1].long(),
                        torch.ones([], device="cuda"),
                        FocalLossTest.N_CLASSES,
                        FocalLossTest.ALPHA,
                        FocalLossTest.GAMMA,
                        0.0,
                    )
                    for i in range(FocalLossTest.N_SAMPLES)
                ]
            )

            # forward parity
            torch.testing.assert_close(expected, actual)

            expected.backward()
            actual.backward()

            # grad parity
            torch.testing.assert_close(x_expected.grad, x_actual.grad)


if __name__ == "__main__":
    torch.manual_seed(42)
    unittest.main()


================================================
FILE: apex/contrib/test/fused_dense/test_fused_dense.py
================================================
import unittest
import os

import torch
from torch.testing._internal import common_utils
from torch.testing._internal.common_device_type import instantiate_device_type_tests

SKIP_TEST = None
try:
    from apex import fused_dense
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class FusedDenseTest(common_utils.TestCase):
    def _test_fused_dense(self, dtype, seed=0):
        os.environ["TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"] = "0"
        torch.manual_seed(seed)

        seq_length = 512
        sequences = 3
        hidden_dim = 1024

        ref_inputs = torch.randn(
            sequences * seq_length, hidden_dim, dtype=dtype, device=torch.device("cuda")
        ).requires_grad_(True)

        tst_inputs = ref_inputs.clone().detach().requires_grad_(True)
        dense = fused_dense.FusedDense(1024, 3072)
        dense.to(dtype=dtype)
        dense.cuda()

        y_tst = dense(tst_inputs)
        y_ref = torch.matmul(ref_inputs, dense.weight.t()) + dense.bias
        dy = torch.randn_like(y_tst).to(dtype=dtype)
        y_tst.backward(dy)
        dw_ref = torch.matmul(dy.t(), ref_inputs)
        dx_ref = torch.matmul(dy, dense.weight.clone())
        db_ref = dy.sum(0, False)

        torch.testing.assert_close(ref_inputs, tst_inputs, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(y_ref, y_tst, atol=1e-3, rtol=1e-3, equal_nan=True)
        torch.testing.assert_close(dw_ref, dense.weight.grad, atol=1e-3, rtol=1e-3, equal_nan=True)
        torch.testing.assert_close(dx_ref, tst_inputs.grad, atol=1e-3, rtol=1e-3, equal_nan=True)
        torch.testing.assert_close(db_ref, dense.bias.grad, atol=1e-3, rtol=1e-3, equal_nan=True)

    @common_utils.parametrize("dtype", [torch.half, torch.float, torch.bfloat16])
    def test_fused_dense(self, dtype):
        self._test_fused_dense(dtype)


instantiate_device_type_tests(FusedDenseTest, globals(), only_for=("cuda",))

if __name__ == "__main__":
    common_utils.run_tests()


================================================
FILE: apex/contrib/test/group_norm/__init__.py
================================================


================================================
FILE: apex/contrib/test/group_norm/test_group_norm.py
================================================
#!/usr/bin/env python
# coding: utf-8

#
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#

import functools
import importlib
import pathlib
import sys
import torch
import unittest

SKIP_TEST = None
try:
    from apex.contrib.group_norm.group_norm import cuda_group_norm_nhwc_one_pass
    from apex.contrib.group_norm.group_norm import cuda_group_norm_nhwc_two_pass
    from apex.contrib.group_norm.group_norm import cuda_group_norm_v2_nhwc
    from apex.contrib.group_norm.group_norm import get_cc_and_sm_count
    from apex.contrib.group_norm import GroupNorm
except ImportError as e:
    SKIP_TEST = e


def torch_group_norm_high_precision(x, g, w, b, eps, act="", *, compute_type):
    xdtype = x.dtype
    y = torch.nn.functional.group_norm(
        x.to(compute_type),
        g,
        w.to(compute_type),
        b.to(compute_type),
        eps,
    )
    if act in ["silu", "swish"]:
        y = torch.nn.functional.silu(y)
    y = y.to(dtype=xdtype)
    return y


torch_group_norm_high_precision_fp64 = functools.partial(
    torch_group_norm_high_precision,
    compute_type=torch.float64,
)


@functools.cache
def relative_ulp(dtype, device):
    # Unit in the Last Place
    one = torch.tensor(1.0, dtype=dtype, device=device)
    two = torch.tensor(2.0, dtype=dtype, device=device)
    return (torch.nextafter(one, two) - one).item()


def _ref_compute_type(ref_func, xdtype: torch.dtype) -> torch.dtype:
    # `torch_group_norm_high_precision_fp64` is a functools.partial with compute_type keyword.
    if isinstance(ref_func, functools.partial):
        compute_type = (ref_func.keywords or {}).get("compute_type", None)
        if compute_type is not None:
            return compute_type
    return xdtype


def _estimate_group_norm_test_bytes(
    *,
    N: int,
    C: int,
    H: int,
    W: int,
    xdtype: torch.dtype,
    wdtype: torch.dtype,
    ref_func,
) -> int:
    """
    Conservative VRAM estimate for `verify_group_norm`.

    The reference path converts to a high-precision compute type (fp64 by default)
    and runs both forward+backward while retaining graphs, which can roughly require
    multiple full-size buffers at once. We intentionally over-estimate to avoid OOMs.
    """
    numel = int(N) * int(C) * int(H) * int(W)
    ref_dtype = _ref_compute_type(ref_func, xdtype)

    x_bytes = numel * int(xdtype.itemsize)
    ref_bytes = numel * int(ref_dtype.itemsize)

    # Live tensors: x, dy, y_ref, y_tst, dx_ref/dx_tst + autograd saved buffers.
    # Empirically, a ~10x multiplier on the reference compute buffers is a safer
    # lower bound for fp64 reference on large tensors.
    #
    # Keep the estimate simple and intentionally conservative:
    # - Base fp16/bf16 buffers: ~6x (x, dy, y, grads/temps)
    # - Reference high-precision buffers: ~10x
    estimate = (6 * x_bytes) + (10 * ref_bytes)

    # Small extras: weights/bias/grads.
    estimate += 6 * int(C) * int(wdtype.itemsize)
    return int(estimate)


def _has_sufficient_cuda_memory(required_bytes: int, *, safety_factor: float = 0.90) -> bool:
    if not torch.cuda.is_available():
        return False
    # `mem_get_info` reports free/total for the current device.
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    return required_bytes <= int(free_bytes * safety_factor)


@unittest.skipIf(
    torch.cuda.get_device_properties().multi_processor_count < 16,
    "GroupNorm is unsupported on low SM count devices",
)
@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class GroupNormTest(unittest.TestCase):
    def setUp(self, seed=0):
        super().setUp()
        torch.manual_seed(seed)

    def verify_group_norm(
        self,
        tst_func,
        N=32,
        C=128,
        H=256,
        W=256,
        G=32,
        ref_func=torch_group_norm_high_precision_fp64,
        xdtype=torch.float16,
        wdtype=torch.float32,
        eps=1e-5,
        memory_format=torch.channels_last,
        device="cuda",
        act="",
    ):
        # create data
        x_shape = (N, C, H, W)
        w_shape = (C,)
        weight = torch.rand(w_shape, dtype=wdtype, device="cuda", requires_grad=True)
        bias = torch.rand(w_shape, dtype=wdtype, device="cuda", requires_grad=True)
        x = -2.3 + 0.5 * torch.randn(x_shape, dtype=xdtype, device="cuda")
        x = x.to(memory_format=memory_format)
        dy = 0.1 * torch.randn_like(x)
        x.requires_grad_(True)

        # forward pass
        y_ref = ref_func(x, G, weight, bias, eps, act)
        if tst_func is GroupNorm:
            gn = GroupNorm(G, C, eps, device=device, dtype=wdtype, act=act)
            with torch.no_grad():
                gn.weight = torch.nn.Parameter(weight)
                gn.bias = torch.nn.Parameter(bias)
            y_tst = gn(x)
        else:
            y_tst = tst_func(x, G, weight, bias, eps, act)

        # backward pass
        y_ref.backward(dy, retain_graph=True)
        dx_ref, dw_ref, db_ref = [t.grad.clone() for t in [x, weight, bias]]
        x.grad.zero_()
        weight.grad.zero_()
        bias.grad.zero_()
        y_tst.backward(dy, retain_graph=True)
        if tst_func is GroupNorm:
            dx_tst, dw_tst, db_tst = x.grad, gn.weight.grad, gn.bias.grad
        else:
            dx_tst, dw_tst, db_tst = [t.grad.clone() for t in [x, weight, bias]]

        # compare
        torch.testing.assert_close(
            y_tst, y_ref, atol=1e-2, rtol=relative_ulp(y_ref.dtype, y_ref.device)
        )
        torch.testing.assert_close(
            dx_tst, dx_ref, atol=1e-2, rtol=relative_ulp(dx_ref.dtype, dx_ref.device)
        )
        torch.testing.assert_close(
            dw_tst, dw_ref, atol=1e-2, rtol=relative_ulp(dw_ref.dtype, dw_ref.device)
        )
        torch.testing.assert_close(
            db_tst, db_ref, atol=1e-2, rtol=relative_ulp(db_ref.dtype, db_ref.device)
        )

    def test_fp16_one_pass_algo(self):
        self.verify_group_norm(cuda_group_norm_nhwc_one_pass, act="")

    def test_fp16_two_pass_algo(self):
        self.verify_group_norm(cuda_group_norm_nhwc_two_pass, act="")

    def test_fp16_one_pass_algo_with_swish(self):
        self.verify_group_norm(cuda_group_norm_nhwc_one_pass, act="swish")

    def test_fp16_two_pass_algo_with_swish(self):
        self.verify_group_norm(cuda_group_norm_nhwc_two_pass, act="swish")

    def test_bf16_one_pass_algo(self):
        self.verify_group_norm(cuda_group_norm_nhwc_one_pass, xdtype=torch.bfloat16, act="")

    def test_bf16_two_pass_algo(self):
        self.verify_group_norm(cuda_group_norm_nhwc_two_pass, xdtype=torch.bfloat16, act="")

    def test_bf16_one_pass_algo_with_swish(self):
        self.verify_group_norm(cuda_group_norm_nhwc_one_pass, xdtype=torch.bfloat16, act="swish")

    def test_bf16_two_pass_algo_with_swish(self):
        self.verify_group_norm(cuda_group_norm_nhwc_two_pass, xdtype=torch.bfloat16, act="swish")

    def test_fp32_one_pass_algo(self):
        self.verify_group_norm(cuda_group_norm_nhwc_one_pass, xdtype=torch.float32, act="")

    def test_fp32_two_pass_algo(self):
        self.verify_group_norm(cuda_group_norm_nhwc_two_pass, xdtype=torch.float32, act="")

    def test_fp32_one_pass_algo_with_swish(self):
        self.verify_group_norm(cuda_group_norm_nhwc_one_pass, xdtype=torch.float32, act="swish")

    def test_fp32_two_pass_algo_with_swish(self):
        self.verify_group_norm(cuda_group_norm_nhwc_two_pass, xdtype=torch.float32, act="swish")

    def test_group_norm_module(self):
        self.verify_group_norm(GroupNorm, G=16, act="swish")

    def test_group_norm_inductor(self):
        N, C, H, W, G = 32, 320, 256, 256, 16

        model = (
            torch.nn.Sequential(
                GroupNorm(G, C, act="silu", dtype=torch.float16),
                torch.nn.Conv2d(C, C, kernel_size=3, padding="same"),
            )
            .cuda()
            .half()
        )
        compiled = torch.compile(model)

        x = -2.3 + 0.5 * torch.randn((N, C, H, W), dtype=torch.float16, device="cuda")
        x = x.to(memory_format=torch.channels_last)
        dy = 0.1 * torch.randn_like(x)
        x.requires_grad_(True)

        for _ in range(4):
            y = compiled(x)
            y.backward(dy)

        from torch._dynamo.utils import counters

        # TODO: Remove this when 3.9 is no longer supported
        if sys.version_info < (3, 10):
            num_graph_breaks = sum(counters["graph_break"].values())
        else:
            num_graph_breaks = counters["graph_break"].total()
        self.assertEqual(num_graph_breaks, 0, "Shouldn't see any graph breaks.")
        self.assertEqual(counters["stats"]["unique_graphs"], 1, "Expect only one graph.")

    def test_16_groups(self):
        sizes = [
            [8, 2560, 16, 16],
            [8, 1920, 32, 32],
            [8, 1920, 16, 16],
            [8, 2560, 8, 8],
            [1, 128, 16128, 1200],
        ]
        for sz in sizes:
            with self.subTest(size=sz):
                n, c, h, w = sz
                required = _estimate_group_norm_test_bytes(
                    N=n,
                    C=c,
                    H=h,
                    W=w,
                    xdtype=torch.float16,
                    wdtype=torch.float32,
                    ref_func=torch_group_norm_high_precision_fp64,
                )
                if not _has_sufficient_cuda_memory(required):
                    free_bytes, total_bytes = torch.cuda.mem_get_info()
                    raise unittest.SkipTest(
                        f"Skipping large GroupNorm case {sz}: estimated {required / 1e9:.1f} GB "
                        f"requires more than available free VRAM ({free_bytes / 1e9:.1f} GB free, "
                        f"{total_bytes / 1e9:.1f} GB total)."
                    )
                self.verify_group_norm(GroupNorm, N=n, C=c, H=h, W=w, G=16, act="swish")

    def test_large_batch_two_pass(self):
        """Regression test for divide-by-zero when batch size is large.

        When batch_size >= 256 and c >= 640, blocks_per_act_slice = 256 / n
        truncates to 0, causing div_up(hw, 0). Test all three heuristic branches.
        """
        sizes = [
            [256, 1280, 8, 8],
            [512, 640, 16, 16],
            [1024, 512, 8, 8],
        ]
        for sz in sizes:
            with self.subTest(size=sz):
                n, c, h, w = sz
                required = _estimate_group_norm_test_bytes(
                    N=n,
                    C=c,
                    H=h,
                    W=w,
                    xdtype=torch.float16,
                    wdtype=torch.float32,
                    ref_func=torch_group_norm_high_precision_fp64,
                )
                if not _has_sufficient_cuda_memory(required):
                    free_bytes, total_bytes = torch.cuda.mem_get_info()
                    raise unittest.SkipTest(
                        f"Skipping large-batch GroupNorm case {sz}: estimated "
                        f"{required / 1e9:.1f} GB requires more than available "
                        f"free VRAM ({free_bytes / 1e9:.1f} GB free, "
                        f"{total_bytes / 1e9:.1f} GB total)."
                    )
                self.verify_group_norm(
                    cuda_group_norm_nhwc_two_pass, N=n, C=c, H=h, W=w, G=32, act="silu"
                )

    def test_fp16_parameters(self):
        n, c, h, w = 8, 2560, 16, 16
        self.verify_group_norm(
            GroupNorm,
            N=n,
            C=c,
            H=h,
            W=w,
            G=16,
            xdtype=torch.float16,
            wdtype=torch.float16,
            act="swish",
        )

    @staticmethod
    @functools.cache
    def get_v2_hw_c_list():
        srcpath = pathlib.Path(__file__).parent.absolute()
        gen_module_path = (
            srcpath / ".." / ".." / "csrc" / "group_norm_v2" / "generate_gn_cuda_inst.py"
        )
        spec = importlib.util.spec_from_file_location("generate_gn_cuda_inst", gen_module_path)
        generate_gn_cuda_inst = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(generate_gn_cuda_inst)
        return generate_gn_cuda_inst.hw_c_list

    def check_v2_cc_and_sm_count(self):
        cc, sm_count = get_cc_and_sm_count(torch.cuda.current_device())
        return (
            cc in GroupNorm.GN_V2_SUPPORTED_LOWER_BOUND_SM_COUNT
            and sm_count >= GroupNorm.GN_V2_SUPPORTED_LOWER_BOUND_SM_COUNT[cc]
        )

    def skip_if_v2_not_supported(self):
        if not self.check_v2_cc_and_sm_count():
            cc, sm_count = get_cc_and_sm_count(torch.cuda.current_device())
            self.skipTest(
                f"SM count {sm_count} is not supported for compute capability {cc[0]}.{cc[1]}"
            )

    def test_check_v2_legality(self):
        gn = GroupNorm(
            num_groups=16,
            num_channels=640,
            device="cuda",
            dtype=torch.float16,
            act="swish",
        )
        self.skip_if_v2_not_supported()
        # Correct
        x = torch.empty(
            8,
            640,
            32,
            32,
            dtype=torch.float16,
            device="cuda",
            memory_format=torch.channels_last,
        )
        self.assertTrue(gn._check_legality(x) and gn._check_v2_legality(x))
        # Wrong layout
        x = torch.empty(8, 640, 32, 32, dtype=torch.float16, device="cuda")
        self.assertFalse(gn._check_legality(x) and gn._check_v2_legality(x))
        # Wrong shape
        x = torch.empty(
            8,
            640,
            32,
            24,
            dtype=torch.float16,
            device="cuda",
            memory_format=torch.channels_last,
        )
        self.assertFalse(gn._check_legality(x) and gn._check_v2_legality(x))
        # Wrong dtype
        x = torch.empty(
            8,
            640,
            32,
            32,
            dtype=torch.float32,
            device="cuda",
            memory_format=torch.channels_last,
        )
        self.assertFalse(gn._check_legality(x) and gn._check_v2_legality(x))

    def test_fp16_v2_32_groups(self):
        self.skip_if_v2_not_supported()
        for n in [1, 2, 4, 8, 16, 32]:
            for hw, c in self.get_v2_hw_c_list():
                h = w = int(hw**0.5)
                assert hw == h * w
                self.verify_group_norm(
                    cuda_group_norm_v2_nhwc,
                    N=n,
                    C=c,
                    H=h,
                    W=w,
                    G=32,
                    xdtype=torch.float16,
                    wdtype=torch.float16,
                    act="",
                )

    def test_fp16_v2_16_groups_with_swish(self):
        self.skip_if_v2_not_supported()
        for n in [1, 2, 4, 8, 16, 32]:
            for hw, c in self.get_v2_hw_c_list():
                h = w = int(hw**0.5)
                assert hw == h * w
                self.verify_group_norm(
                    cuda_group_norm_v2_nhwc,
                    N=n,
                    C=c,
                    H=h,
                    W=w,
                    G=16,
                    xdtype=torch.float16,
                    wdtype=torch.float16,
                    act="swish",
                )

    def test_bf16_v2_32_groups(self):
        self.skip_if_v2_not_supported()
        for n in [1, 2, 4, 8, 16, 32]:
            for hw, c in self.get_v2_hw_c_list():
                h = w = int(hw**0.5)
                assert hw == h * w
                self.verify_group_norm(
                    cuda_group_norm_v2_nhwc,
                    N=n,
                    C=c,
                    H=h,
                    W=w,
                    G=32,
                    xdtype=torch.bfloat16,
                    wdtype=torch.bfloat16,
                    act="",
                )

    def test_bf16_v2_16_groups_with_swish(self):
        self.skip_if_v2_not_supported()
        for n in [1, 2, 4, 8, 16, 32]:
            for hw, c in self.get_v2_hw_c_list():
                h = w = int(hw**0.5)
                assert hw == h * w
                self.verify_group_norm(
                    cuda_group_norm_v2_nhwc,
                    N=n,
                    C=c,
                    H=h,
                    W=w,
                    G=16,
                    xdtype=torch.bfloat16,
                    wdtype=torch.bfloat16,
                    act="swish",
                )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/index_mul_2d/__init__.py
================================================


================================================
FILE: apex/contrib/test/index_mul_2d/test_index_mul_2d.py
================================================
import random
import unittest

import torch

HAS_INDEX_MUL_2D_RELU = None
try:
    from apex.contrib.index_mul_2d import index_mul_2d
except ImportError:
    HAS_INDEX_MUL_2D_RELU = False
else:
    HAS_INDEX_MUL_2D_RELU = True


@unittest.skipIf(not HAS_INDEX_MUL_2D_RELU, "`apex.contrib.index_mul_2d` is not found.")
class IndexMul2dTest(unittest.TestCase):
    def setUp(self, seed=0):
        torch.manual_seed(seed)

        self.input1_size = random.randint(1, 1000)
        self.input2_size = random.randint(1, 100000)
        self.feature_size = random.randint(1, 256)

        self.input1_float = torch.randn(
            size=(self.input1_size, self.feature_size),
        ).cuda()
        self.input2_float = torch.randn(
            size=(self.input2_size, self.feature_size),
        ).cuda()
        self.index1 = torch.randint(low=0, high=self.input1_size, size=(self.input2_size,)).cuda()

        self.input1_float_ = self.input1_float.clone()
        self.input2_float_ = self.input2_float.clone()

        self.input1_float.requires_grad_()
        self.input1_float_.requires_grad_()
        self.input2_float.requires_grad_()
        self.input2_float_.requires_grad_()

        self.input1_half = (
            torch.randn(
                size=(self.input1_size, self.feature_size),
            )
            .cuda()
            .half()
        )
        self.input2_half = (
            torch.randn(
                size=(self.input2_size, self.feature_size),
            )
            .cuda()
            .half()
        )

        self.input1_half_ = self.input1_half.clone()
        self.input2_half_ = self.input2_half.clone()

        self.input1_half.requires_grad_()
        self.input2_half.requires_grad_()
        self.input1_half_.requires_grad_()
        self.input2_half_.requires_grad_()

    def test_index_mul_float(self):
        out = index_mul_2d(self.input1_float, self.input2_float, self.index1)
        energy = (out.float() ** 2).sum() / out.numel()
        force = torch.autograd.grad(
            energy,
            self.input1_float,
            grad_outputs=torch.ones_like(energy),
            create_graph=True,
        )[0]
        loss = (out.float() ** 2).sum() / out.numel() + (force.float() ** 2).sum()
        loss.backward()

        out_ = self.input1_float_[self.index1] * self.input2_float_
        energy_ = (out_.float() ** 2).sum() / out.numel()
        force_ = torch.autograd.grad(
            energy_,
            self.input1_float_,
            grad_outputs=torch.ones_like(energy),
            create_graph=True,
        )[0]
        loss = (out_.float() ** 2).sum() / out_.numel() + (force_.float() ** 2).sum()
        loss.backward()

        torch.testing.assert_close(
            self.input1_float, self.input1_float_, atol=1e-3, rtol=1e-3, equal_nan=True
        )
        torch.testing.assert_close(
            self.input2_float, self.input2_float_, atol=1e-3, rtol=1e-3, equal_nan=True
        )
        torch.testing.assert_close(
            self.input1_float.grad,
            self.input1_float_.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )
        torch.testing.assert_close(
            self.input2_float.grad,
            self.input2_float_.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )

    def test_index_mul_half(self):
        out = index_mul_2d(self.input1_half, self.input2_half, self.index1)
        energy = (out.float() ** 2).sum() / out.numel()
        force = torch.autograd.grad(
            energy,
            self.input1_half,
            grad_outputs=torch.ones_like(energy),
            create_graph=True,
        )[0]
        loss = (out.float() ** 2).sum() / out.numel() + (force.float() ** 2).sum()
        loss.backward()

        out_ = self.input1_half_[self.index1] * self.input2_half_
        energy_ = (out_.float() ** 2).sum() / out.numel()
        force_ = torch.autograd.grad(
            energy_,
            self.input1_half_,
            grad_outputs=torch.ones_like(energy),
            create_graph=True,
        )[0]
        loss = (out_.float() ** 2).sum() / out_.numel() + (force_.float() ** 2).sum()
        loss.backward()

        torch.testing.assert_close(
            self.input1_half, self.input1_half_, atol=1e-3, rtol=1e-3, equal_nan=True
        )
        torch.testing.assert_close(
            self.input2_half, self.input2_half_, atol=1e-3, rtol=1e-3, equal_nan=True
        )
        torch.testing.assert_close(
            self.input1_half.grad,
            self.input1_half_.grad,
            atol=2e-3,
            rtol=5e-2,
            equal_nan=True,
        )
        torch.testing.assert_close(
            self.input2_half.grad,
            self.input2_half_.grad,
            atol=1e-3,
            rtol=1e-3,
            equal_nan=True,
        )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/layer_norm/__init__.py
================================================


================================================
FILE: apex/contrib/test/layer_norm/test_fast_layer_norm.py
================================================
import itertools
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.layer_norm.layer_norm import FastLayerNorm
    import fast_layer_norm as fln
except ImportError as e:
    SKIP_TEST = e


class GPUTimer:
    def __init__(self, stream):
        self.start_ = torch.cuda.Event(enable_timing=True)
        self.stop_ = torch.cuda.Event(enable_timing=True)
        self.stream_ = stream

    def start(self):
        self.stream_.record_event(self.start_)

    def stop(self):
        self.stream_.record_event(self.stop_)

    def sync(self):
        self.stream_.synchronize()

    def millis(self):
        return self.start_.elapsed_time(self.stop_)


def size_in_bytes(t):
    return torch.numel(t) * t.element_size()


def metrics(y_ref, y, epsilon=1e-6):
    y_ref = y_ref.float()
    y = y.float()
    relerr, mse = (
        (y_ref - y).abs().sum() / (y_ref.abs().sum() + epsilon),
        (y_ref - y).square().mean(),
    )
    return relerr.item(), mse.item()


device = torch.device("cuda")
fp32 = torch.float32
fp16 = torch.float16
bf16 = torch.bfloat16


def backward_(dz, x, mu, rs, gamma):
    wtype = gamma.dtype
    itype = x.dtype
    otype = dz.dtype
    ctype = mu.dtype
    mu = mu.unsqueeze(1)
    rs = rs.unsqueeze(1)

    hidden_size = gamma.numel()
    y = rs * (x.to(ctype) - mu)
    dbeta = dz.view(-1, hidden_size).sum(0, dtype=ctype)
    dgamma = (dz * y).view(-1, hidden_size).sum(0, dtype=ctype)
    dy = dz.view(-1, hidden_size).to(ctype) * gamma.unsqueeze(0).to(ctype)
    mdy = dy.mean(1, keepdim=True, dtype=ctype)

    mdyy = (dy * y).mean(1, keepdim=True, dtype=ctype)
    dx = rs * (dy - mdyy * y - mdy)

    return dx.to(itype), dgamma.to(wtype), dbeta.to(wtype)


def benchmark_(S, B, hidden_size, itype, wtype, runs=100):
    epsilon = 1e-5

    x = torch.randn((S * B, hidden_size), dtype=itype, device=device)
    beta = torch.randn(hidden_size, dtype=wtype, device=device)
    gamma = torch.randn(hidden_size, dtype=wtype, device=device)
    dz = torch.randn(x.shape, dtype=wtype, device=device)

    stream = torch.cuda.Stream()
    with torch.cuda.stream(stream):
        timer = GPUTimer(stream)

        # warmup
        for r in range(runs):
            z, mu, rsigma = fln.ln_fwd(x, gamma, beta, epsilon)

        timer.start()
        for r in range(runs):
            z, mu, rsigma = fln.ln_fwd(x, gamma, beta, epsilon)
        timer.stop()
        timer.sync()

        total_bytes_fwd = sum([size_in_bytes(t) for t in [x, z, gamma, beta, mu, rsigma]])

        ms_fwd = timer.millis() / runs

        print(
            "[FWD] Time: {:.4f}ms Throughput: {:.4f} GB/sec".format(
                ms_fwd, total_bytes_fwd * 1e-6 / ms_fwd
            )
        )

        timer.start()
        for r in range(runs):
            dx, dgamma, dbeta, dbp, dgp = fln.ln_bwd(dz, z, mu, rsigma, gamma, beta, True)
        timer.stop()
        timer.sync()

        total_bytes_bwd = sum(
            [
                size_in_bytes(t)
                for t in [
                    dz,
                    x,
                    mu,
                    rsigma,
                    gamma,
                    dx,
                    dgamma,
                    dbeta,
                    dbp,
                    dbp,
                    dgp,
                    dgp,
                ]
            ]
        )

        ms_bwd = timer.millis() / runs

        print(
            "[BWD] Time: {:.4f}ms Throughput: {:.4f} GB/sec".format(
                ms_bwd, total_bytes_bwd * 1e-6 / ms_bwd
            )
        )


def _test_impl(S, B, hidden_size, itype, wtype, ctype=fp32, mem_eff=False):
    seed = 1243
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    otype = wtype
    print("========================================================")
    print(f"S={S} B={B} Hidden={hidden_size} {itype} {wtype} Mem_Eff={mem_eff}")
    print("--------------------------------------------------------")

    x = torch.randn(S * B, hidden_size, dtype=itype, device=device)
    gamma = torch.randn(hidden_size, dtype=wtype, device=device) * 0.2
    beta = torch.randn(hidden_size, dtype=wtype, device=device) * 0.2
    epsilon = 1e-5

    x.requires_grad = True
    gamma.requires_grad = True
    beta.requires_grad = True

    mu_ref = x.mean(1, dtype=ctype, keepdim=True)
    v = torch.square(x - mu_ref).mean(1, dtype=ctype, keepdim=True)
    rs_ref = torch.rsqrt(v + epsilon)
    y_ref = rs_ref * (x.to(ctype) - mu_ref)
    z_ref = (gamma.unsqueeze(0) * (y_ref).to(otype) + beta.unsqueeze(0)).to(otype)

    mu_ref = mu_ref.flatten()
    rs_ref = rs_ref.flatten()

    dz = torch.randn_like(z_ref)

    # z_ref.backward(dz)
    # dx_ref = x.grad
    # dgamma_ref = gamma.grad
    # dbeta_ref = beta.grad

    dx_ref, dg_ref, db_ref = backward_(dz, x, mu_ref, rs_ref, gamma)

    z, mu, rs = fln.ln_fwd(x, gamma, beta, epsilon)
    if mem_eff:
        dx, dg, db, dg_part, db_part = fln.ln_bwd(dz, z, mu, rs, gamma, beta, True)
    else:
        dx, dg, db, dg_part, db_part = fln.ln_bwd(dz, x, mu, rs, gamma, beta, False)

    re_z, mse_z = metrics(z_ref, z)
    re_mu, mse_mu = metrics(mu_ref, mu)
    re_rs, mse_rs = metrics(rs_ref, rs)

    re_dx, mse_dx = metrics(dx_ref, dx)
    re_dg, mse_dg = metrics(dg_ref, dg)
    re_db, mse_db = metrics(db_ref, db)

    print(f" z: relerr={re_z:.4e} mse={mse_z:.4e}")
    print(f"mu: relerr={re_mu:.4e} mse={mse_mu:.4e}")
    print(f"rs: relerr={re_mu:.4e} mse={mse_mu:.4e}")

    print(f"dx: relerr={re_dx:.4e} mse={mse_dx:.4e}")
    print(f"dg: relerr={re_dg:.4e} mse={mse_dg:.4e}")
    print(f"db: relerr={re_db:.4e} mse={mse_db:.4e}")

    def check_err(x, relerr):
        tol = 2e-2 if x.dtype in (torch.float16, torch.bfloat16) else 1e-5
        return relerr < tol

    return [
        check_err(x, re)
        for x, re in zip([z, mu, rs, dx, dg, db], [re_z, re_mu, re_rs, re_dx, re_dg, re_db])
    ]


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class TestFastLayerNorm(unittest.TestCase):
    # TODO(crcrpar): Try `torch.testing.assert_close` instead and migrate to it if it's working.
    def assertAll(self, l):
        if not all(l):
            print(l)
        for x in l:
            self.assertTrue(x)

    def test_all_configs(self):
        hidden_sizes = [
            768,
            1024,
            1536,
            2048,
            2304,
            3072,
            3840,
            4096,
            5120,
            6144,
            8192,
            10240,
            12288,
            12800,
            14336,
            15360,
            16384,
            18432,
            20480,
            24576,
            25600,
            30720,
            32768,
            40960,
            49152,
            65536,
        ]

        for h, mem_eff in itertools.product(hidden_sizes, (True, False)):
            with self.subTest(f"hidden_size={h}"):
                self.assertAll(_test_impl(256, 2, h, fp32, fp32, mem_eff=mem_eff))
                self.assertAll(_test_impl(256, 2, h, fp16, fp16, mem_eff=mem_eff))
                self.assertAll(_test_impl(256, 2, h, fp32, fp16, mem_eff=mem_eff))
                self.assertAll(_test_impl(256, 2, h, bf16, bf16, mem_eff=mem_eff))
                self.assertAll(_test_impl(256, 2, h, fp32, bf16, mem_eff=mem_eff))

    def test_run_benchmark(self):
        for S, B, hidden_size, runs in (
            (512, 32, 768, 1000),
            (512, 32, 1024, 1000),
            (512, 8, 4096, 1000),
            (512, 8, 5120, 1000),
            (512, 8, 6144, 1000),
            (256, 2, 20480, 500),
            (256, 2, 25600, 500),
            (256, 2, 40960, 250),
            (256, 2, 65536, 250),
        ):
            with self.subTest(f"(S, B, hidden_size)=({S}, {B}, {hidden_size})"):
                benchmark_(S, B, hidden_size, fp16, fp16, runs)

    def test_compat_with_autocast(self):
        autocast_dtypes = (
            (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
        )
        input_shape = (512, 32, 768)
        layer_norm = FastLayerNorm(input_shape[-1]).cuda()
        input = torch.randn(input_shape).cuda()

        for dtype in autocast_dtypes:
            layer_norm.zero_grad(set_to_none=True)
            with self.subTest(f"autocast_dtype={dtype}"):
                with torch.amp.autocast("cuda", enabled=True, dtype=dtype):
                    out = layer_norm(input)
                    self.assertEqual(dtype, out.dtype)
                grad = torch.randn_like(out)
                out.backward(grad)
                self.assertEqual(torch.float32, layer_norm.weight.grad.dtype)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/multihead_attn/__init__.py
================================================


================================================
FILE: apex/contrib/test/multihead_attn/test_encdec_multihead_attn.py
================================================
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.multihead_attn import EncdecMultiheadAttn
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class EncdecMultiheadAttnTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)

        self.seq_length = 80
        self.sequences = 10
        self.hidden_dim = 1024
        self.heads = 16
        self.dropout_prob = 0.0

        self.ref_layer = EncdecMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=False,
            include_norm_add=False,
            impl="default",
        )
        self.ref_layer.cuda().half()
        self.ref_layer.reset_parameters()
        self.ref_inputs_q = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)
        self.ref_inputs_k = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

        # Reset seed so parameters are identical
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.tst_layer = EncdecMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=False,
            include_norm_add=False,
            impl="fast",
        )
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()

        self.tst_inputs_q = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)
        self.tst_inputs_k = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

    def test_encdec_multihead_attn(self):
        ref_outputs, _ = self.ref_layer.forward(
            self.ref_inputs_q,
            self.ref_inputs_k,
            self.ref_inputs_k,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        tst_outputs, _ = self.tst_layer.forward(
            self.tst_inputs_q,
            self.tst_inputs_k,
            self.tst_inputs_k,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )
        torch.testing.assert_close(self.ref_inputs_q, self.tst_inputs_q, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.ref_inputs_k, self.tst_inputs_k, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)

        with torch.no_grad():
            ref_grads = torch.randn_like(ref_outputs)
            tst_grads = ref_grads.clone()
        ref_outputs.backward(ref_grads)
        tst_outputs.backward(tst_grads)
        torch.testing.assert_close(
            self.ref_inputs_q.grad, self.tst_inputs_q.grad, atol=1e-3, rtol=1e-3
        )

    def test_encdec_multihead_attn_time_mask(self):
        grads = torch.randn_like(self.tst_inputs_q)
        time_mask_byte = torch.triu(
            torch.ones(
                self.tst_inputs_q.size(0),
                self.tst_inputs_k.size(0),
                device=torch.device("cuda"),
                dtype=torch.uint8,
            ),
            1,
        )
        time_mask_bool = time_mask_byte.to(torch.bool)

        ref_outputs, _ = self.ref_layer.forward(
            self.ref_inputs_q,
            self.ref_inputs_k,
            self.ref_inputs_k,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=time_mask_bool,
            is_training=True,
        )

        tst_outputs, _ = self.tst_layer.forward(
            self.tst_inputs_q,
            self.tst_inputs_k,
            self.tst_inputs_k,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=time_mask_byte,
            is_training=True,
        )

        self.ref_inputs_q.backward(grads)
        self.tst_inputs_q.backward(grads)

        torch.testing.assert_close(self.ref_inputs_q, self.tst_inputs_q, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.ref_inputs_k, self.tst_inputs_k, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)
        torch.testing.assert_close(
            self.ref_inputs_q.grad, self.tst_inputs_q.grad, atol=1e-3, rtol=1e-3
        )

    def test_encdec_multihead_attn_pad_mask(self):
        grads = torch.randn_like(self.tst_inputs_q)
        pad_mask_byte = torch.tril(
            torch.ones(
                self.tst_inputs_k.size(1),
                self.tst_inputs_k.size(0),
                device=torch.device("cuda"),
                dtype=torch.uint8,
            ),
            1,
        )
        pad_mask_bool = pad_mask_byte.to(torch.bool)

        ref_outputs, _ = self.ref_layer.forward(
            self.ref_inputs_q,
            self.ref_inputs_k,
            self.ref_inputs_k,
            key_padding_mask=pad_mask_bool,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        tst_outputs, _ = self.tst_layer.forward(
            self.tst_inputs_q,
            self.tst_inputs_k,
            self.tst_inputs_k,
            key_padding_mask=pad_mask_byte,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        self.ref_inputs_q.backward(grads)
        self.tst_inputs_q.backward(grads)

        torch.testing.assert_close(self.ref_inputs_q, self.tst_inputs_q, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.ref_inputs_k, self.tst_inputs_k, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)
        torch.testing.assert_close(
            self.ref_inputs_q.grad, self.tst_inputs_q.grad, atol=1e-3, rtol=1e-3
        )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/multihead_attn/test_encdec_multihead_attn_norm_add.py
================================================
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.multihead_attn import EncdecMultiheadAttn
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class EncdecMultiheadAttnNormAddTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.seq_length = 80
        self.sequences = 10
        self.hidden_dim = 1024
        self.heads = 16
        self.dropout_prob = 0.0

        self.ref_layer = EncdecMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=False,
            include_norm_add=True,
            impl="default",
        )
        self.ref_layer.cuda().half()
        self.ref_layer.reset_parameters()
        self.ref_inputs_q = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)
        self.ref_inputs_k = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

        # Reset seed so parameters are identical
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.tst_layer = EncdecMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=False,
            include_norm_add=True,
            impl="fast",
        )
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()

        self.tst_inputs_q = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)
        self.tst_inputs_k = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

    def test_encdec_multihead_attn_norm_add(self):
        grads = torch.randn_like(self.tst_inputs_q)

        for _ in range(5):
            ref_outputs, _ = self.ref_layer.forward(
                self.ref_inputs_q,
                self.ref_inputs_k,
                self.ref_inputs_k,
                key_padding_mask=None,
                need_weights=False,
                attn_mask=None,
                is_training=True,
            )

            tst_outputs, _ = self.tst_layer.forward(
                self.tst_inputs_q,
                self.tst_inputs_k,
                self.tst_inputs_k,
                key_padding_mask=None,
                need_weights=False,
                attn_mask=None,
                is_training=True,
            )

            self.ref_inputs_q.backward(grads)
            self.tst_inputs_q.backward(grads)

        torch.testing.assert_close(self.ref_inputs_q, self.tst_inputs_q, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.ref_inputs_k, self.tst_inputs_k, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)
        torch.testing.assert_close(
            self.ref_inputs_q.grad, self.tst_inputs_q.grad, atol=1e-3, rtol=1e-3
        )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/multihead_attn/test_fast_self_multihead_attn_bias.py
================================================
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.multihead_attn import SelfMultiheadAttn
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class SelfMultiheadAttnTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.seq_length = 80
        self.sequences = 10
        self.hidden_dim = 1024
        self.heads = 16
        self.dropout_prob = 0.0

        self.ref_layer = SelfMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=True,
            include_norm_add=False,
            separate_qkv_params=True,
            mask_additive=True,
            impl="default",
        )
        self.ref_layer.cuda().half()
        self.ref_layer.reset_parameters()
        self.ref_inputs = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)
        # Reset seed so parameters are identical
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.tst_layer = SelfMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=True,
            include_norm_add=False,
            separate_qkv_params=True,
            mask_additive=True,
            impl="fast",
        )
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()

        self.tst_inputs = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

    def test_self_multihead_attn_additive_mask(self):
        grads = torch.randn_like(self.tst_inputs)
        mask = ((torch.randn(self.sequences, self.seq_length) > 0) * -10000.0).half().cuda()

        ref_outputs, _ = self.ref_layer.forward(
            self.ref_inputs,
            self.ref_inputs,
            self.ref_inputs,
            key_padding_mask=mask,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        tst_outputs, _ = self.tst_layer.forward(
            self.tst_inputs,
            self.tst_inputs,
            self.tst_inputs,
            key_padding_mask=mask,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        self.ref_inputs.backward(grads)
        self.tst_inputs.backward(grads)

        torch.testing.assert_close(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)
        torch.testing.assert_close(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/multihead_attn/test_mha_fused_softmax.py
================================================
import unittest

import torch
import torch.nn.functional as F

SKIP_TEST = None
try:
    from apex.contrib.multihead_attn import fast_mask_softmax_dropout_func
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class FusedSoftmaxTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.seq_length = 80
        self.sequences = 10
        self.hidden_dim = 1024
        self.heads = 16
        self.dropout_prob = 0.0

        self.mask = (torch.randn(self.sequences, self.seq_length) > 0).cuda()
        self.mask = self.mask.half() * -10000
        self.ref_inputs = torch.randn(
            self.heads * self.sequences,
            self.seq_length,
            self.seq_length,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

        self.tst_inputs = self.ref_inputs.clone().detach().requires_grad_(True)

    def test_fused_softmax(self):
        grads = torch.randn_like(self.tst_inputs)
        y_ref = self.ref_inputs.view(self.sequences, self.heads, self.seq_length, self.seq_length)
        y_ref = y_ref + self.mask.unsqueeze(1).unsqueeze(2)
        y_ref = y_ref.view(self.sequences * self.heads, self.seq_length, self.seq_length)
        y_ref = F.softmax(y_ref, dim=-1)
        y_ref = torch._fused_dropout(y_ref, 1.0)

        y_tst = fast_mask_softmax_dropout_func(
            True, self.heads, self.tst_inputs, self.mask, True, 0.0
        )
        y_ref[0].backward(grads)
        y_tst.backward(grads)

        torch.testing.assert_close(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(y_ref[0], y_tst, atol=1e-3, rtol=1e-3)
        torch.testing.assert_close(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/multihead_attn/test_self_multihead_attn.py
================================================
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.multihead_attn import SelfMultiheadAttn
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class SelfMultiheadAttnTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.seq_length = 80
        self.sequences = 10
        self.hidden_dim = 1024
        self.heads = 16
        self.dropout_prob = 0.0

        self.ref_layer = SelfMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=False,
            include_norm_add=False,
            impl="default",
        )
        self.ref_layer.cuda().half()
        self.ref_layer.reset_parameters()
        self.ref_inputs = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

        # Reset seed so parameters are identical
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.tst_layer = SelfMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=False,
            include_norm_add=False,
            impl="fast",
        )
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()

        self.tst_inputs = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

    def test_self_multihead_attn(self):
        ref_outputs, _ = self.ref_layer.forward(
            self.ref_inputs,
            self.ref_inputs,
            self.ref_inputs,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        tst_outputs, _ = self.tst_layer.forward(
            self.tst_inputs,
            self.tst_inputs,
            self.tst_inputs,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        torch.testing.assert_close(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)

        with torch.no_grad():
            ref_grads = torch.randn_like(self.tst_inputs)
            tst_grads = ref_grads.clone()

        ref_outputs.backward(ref_grads)
        tst_outputs.backward(tst_grads)
        torch.testing.assert_close(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)

    def test_self_multihead_attn_time_mask(self):
        grads = torch.randn_like(self.tst_inputs)
        time_mask_byte = torch.triu(
            torch.ones(
                self.tst_inputs.size(0),
                self.tst_inputs.size(0),
                device=torch.device("cuda"),
                dtype=torch.uint8,
            ),
            1,
        )
        time_mask_bool = time_mask_byte.to(torch.bool)

        ref_outputs, _ = self.ref_layer.forward(
            self.ref_inputs,
            self.ref_inputs,
            self.ref_inputs,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=time_mask_bool,
            is_training=True,
        )

        tst_outputs, _ = self.tst_layer.forward(
            self.tst_inputs,
            self.tst_inputs,
            self.tst_inputs,
            key_padding_mask=None,
            need_weights=False,
            attn_mask=time_mask_byte,
            is_training=True,
        )

        self.ref_inputs.backward(grads)
        self.tst_inputs.backward(grads)

        torch.testing.assert_close(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=5e-3, rtol=1e-3)
        torch.testing.assert_close(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)

    def test_self_multihead_attn_pad_mask(self):
        grads = torch.randn_like(self.tst_inputs)
        pad_mask_byte = torch.tril(
            torch.ones(
                self.tst_inputs.size(1),
                self.tst_inputs.size(0),
                device=torch.device("cuda"),
                dtype=torch.uint8,
            ),
            1,
        )
        pad_mask_bool = pad_mask_byte.to(torch.bool)

        ref_outputs, _ = self.ref_layer.forward(
            self.ref_inputs,
            self.ref_inputs,
            self.ref_inputs,
            key_padding_mask=pad_mask_bool,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        tst_outputs, _ = self.tst_layer.forward(
            self.tst_inputs,
            self.tst_inputs,
            self.tst_inputs,
            key_padding_mask=pad_mask_byte,
            need_weights=False,
            attn_mask=None,
            is_training=True,
        )

        self.ref_inputs.backward(grads)
        self.tst_inputs.backward(grads)

        torch.testing.assert_close(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)
        torch.testing.assert_close(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/multihead_attn/test_self_multihead_attn_norm_add.py
================================================
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.multihead_attn import SelfMultiheadAttn
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class SelfMultiheadAttnNormAddTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)

        self.seq_length = 80
        self.sequences = 10
        self.hidden_dim = 1024
        self.heads = 16
        self.dropout_prob = 0.0

        self.ref_layer = SelfMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=False,
            include_norm_add=True,
            impl="default",
        )
        self.ref_layer.cuda().half()
        self.ref_layer.reset_parameters()
        self.ref_inputs = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

        # Reset seed so parameters are identical
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.tst_layer = SelfMultiheadAttn(
            self.hidden_dim,
            self.heads,
            dropout=self.dropout_prob,
            bias=False,
            include_norm_add=True,
            impl="fast",
        )
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()

        self.tst_inputs = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda"),
        ).requires_grad_(True)

    def test_self_multihead_attn_norm_add(self):
        grads = torch.randn_like(self.tst_inputs)

        for _ in range(0, 5):
            ref_outputs, _ = self.ref_layer.forward(
                self.ref_inputs,
                self.ref_inputs,
                self.ref_inputs,
                key_padding_mask=None,
                need_weights=False,
                attn_mask=None,
                is_training=True,
            )

            tst_outputs, _ = self.tst_layer.forward(
                self.tst_inputs,
                self.tst_inputs,
                self.tst_inputs,
                key_padding_mask=None,
                need_weights=False,
                attn_mask=None,
                is_training=True,
            )

            self.ref_inputs.backward(grads)
            self.tst_inputs.backward(grads)

        torch.testing.assert_close(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)
        torch.testing.assert_close(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/openfold_triton/test_fused_adam_swa.py
================================================
# Copyright 2023 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from itertools import chain
import random
import unittest

import torch
import torch.nn as nn

SKIP_TEST = None
try:
    from apex.contrib.openfold_triton.fused_adam_swa import AdamMathType, FusedAdamSWA
except ImportError as e:
    SKIP_TEST = e


# Stochastic weight average (SWA) reference code from
# https://github.com/mlcommons/hpc_results_v3.0/blob/350e46f7/NVIDIA/benchmarks/openfold/implementations/pytorch/openfold/swa.py#L21-L70
class AlphaFoldSWA(nn.Module):
    """AlphaFold SWA (Stochastic Weight Averaging) module wrapper."""

    def __init__(self, alphafold: nn.Module, enabled: bool, decay_rate: float) -> None:
        super(AlphaFoldSWA, self).__init__()
        if enabled:
            self.averaged_model = torch.optim.swa_utils.AveragedModel(
                model=alphafold,
                avg_fn=swa_avg_fn(decay_rate=decay_rate),
            )
            self.enabled = True
        else:
            self.averaged_model = None
            self.enabled = False

    def update(self, alphafold: nn.Module) -> None:
        if self.enabled:
            self.averaged_model.update_parameters(model=alphafold)

    def forward(self, batch):
        if not self.enabled:
            raise RuntimeError("AlphaFoldSWA is not enabled")
        return self.averaged_model(batch)


class swa_avg_fn:
    """Averaging function for EMA with configurable decay rate
    (Supplementary '1.11.7 Evaluator setup')."""

    def __init__(self, decay_rate: float) -> None:
        self._decay_rate = decay_rate

    def __call__(
        self,
        averaged_model_parameter: torch.Tensor,
        model_parameter: torch.Tensor,
        num_averaged: torch.Tensor,
    ) -> torch.Tensor:
        # for decay_rate = 0.999:
        # return averaged_model_parameter * 0.999 + model_parameter * 0.001
        # avg * 0.999 + m * 0.001
        # 999*avg/1000 + m/1000
        # (999*avg + avg - avg)/1000 + m/1000
        # (1000*avg - avg)/1000 + m/1000
        # 1000*avg/1000 - avg/1000 + m/1000
        # avg + (m - avg)/1000
        # avg + (m - avg)*0.001
        return averaged_model_parameter + (model_parameter - averaged_model_parameter) * (
            1.0 - self._decay_rate
        )


@unittest.skipIf(SKIP_TEST, f"Skip testing FusedAdamSWA: {SKIP_TEST}")
class FusedAdamSWATestCase(unittest.TestCase):
    def setUp(self):
        super().setUp()
        self._seed = 19260817
        random.seed(self._seed)
        torch.manual_seed(self._seed)
        # FIXME: correctly fix: """NameError("Cannot access global variable _DTYPE2TRITON from within @jit'ed function.
        # Triton kernels can only access global variables that are instanstiated as constexpr (`x = triton.language.constexpr(42)`).
        # Note that this is different from annotating a variable as constexpr (`x: triton.language.constexpr = 42`), which is not supported.
        # Alternatively, set the envvar TRITON_ALLOW_NON_CONSTEXPR_GLOBALS=1, but we do not promise to support this forever.")"""
        os.environ["TRITON_ALLOW_NON_CONSTEXPR_GLOBALS"] = "1"

    def tearDown(self):
        os.environ.pop("TRITON_ALLOW_NON_CONSTEXPR_GLOBALS", None)

    def test_fused_update_on_random_data(self):
        with torch.backends.cudnn.flags(deterministic=True):
            self._run_fused_update_on_random_data()

    def _run_fused_update_on_random_data(self):
        device = torch.device("cuda:0")
        compute_dtype = torch.float32
        state_dtype = torch.float64
        atol = 1e-5  # Default: 1e-8, raise error at 1e-6 for FP32 compute and FP64 state.
        rtol = 1e-4  # Default: 1e-5
        lr = 1e-1
        bias_correction = True
        beta1, beta2 = 0.9, 0.999
        eps = 1e-6
        adam_math_mode = AdamMathType.PyTorchAdam
        weight_decay = 1e-3  # PyTorchAdam impl will fail non-zero weight decay.
        amsgrad = False
        adam_step = 1900
        swa_decay_rate = 0.9
        swa_n_averaged = 1

        state_params = [
            torch.empty(random.randint(128, 2048), device=device, dtype=state_dtype).uniform_(-5, 5)
            for _ in range(32)
        ]
        compute_dtypes = [
            compute_dtype if random.uniform(0.0, 1.0) <= 0.5 else state_dtype for _ in range(32)
        ]
        grads = [
            torch.empty_like(p, dtype=d).uniform_(-5, 5)
            for d, p in zip(compute_dtypes, state_params)
        ]
        moments = [torch.empty_like(p).uniform_(-5, 5) for p in state_params]
        velocities = [torch.empty_like(p).uniform_(0, 10) for p in state_params]

        # Ground truth: Apex FusedAdam, optimized-hpc SWA.
        compute_params_gt = [p.clone().to(d) for d, p in zip(compute_dtypes, state_params)]
        dummy_model = torch.nn.Module()
        for i, p in enumerate(state_params):
            dummy_model.register_parameter(f"param_{i}", torch.nn.Parameter(p.clone()))
        state_params_gt = list(dummy_model.parameters())
        swa_model = AlphaFoldSWA(dummy_model, enabled=True, decay_rate=swa_decay_rate)
        swa_params_gt = list(swa_model.parameters())
        optimizer = torch.optim.Adam(
            state_params_gt,
            lr=lr,
            betas=(beta1, beta2),
            eps=eps,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
        )
        moments_gt, velocities_gt = [], []
        for i, p in enumerate(optimizer.param_groups[0]["params"]):
            s = optimizer.state[p]
            self.assertTrue(moments[i].shape == velocities[i].shape == p.shape)
            s["step"] = torch.tensor(adam_step, dtype=state_dtype, device=device)
            s["exp_avg"] = moments[i].clone()
            s["exp_avg_sq"] = velocities[i].clone()
            moments_gt.append(s["exp_avg"])
            velocities_gt.append(s["exp_avg_sq"])
        for p, g in zip(state_params_gt, grads):
            p.grad = g.clone().to(state_dtype)
        optimizer.step()
        swa_model.averaged_model.n_averaged.copy_(swa_n_averaged)
        swa_model.update(dummy_model)
        for c, s in zip(compute_params_gt, state_params_gt):
            c.detach().copy_(s.detach().to(c.dtype))

        # Fused AdamSWA, all at once.
        state_params_test = [torch.nn.Parameter(p.clone()) for p in state_params]
        compute_params_test = [p.clone().to(d) for d, p in zip(compute_dtypes, state_params)]
        swa_params_test = [p.clone() for p in state_params]
        fused_optimizer = FusedAdamSWA(
            params=state_params_test,
            compute_params=compute_params_test,
            swa_params=swa_params_test,
            swa_decay_rate=swa_decay_rate,
            lr=lr,
            bias_correction=bias_correction,
            betas=(beta1, beta2),
            eps=eps,
            adam_math_mode=adam_math_mode,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
        )
        moments_test, velocities_test = [], []
        for i, p in enumerate(fused_optimizer.param_groups[0]["params"]):
            s = fused_optimizer.state[p]
            self.assertTrue(moments[i].shape == velocities[i].shape == p.shape)
            s["exp_avg"] = moments[i].clone()
            s["exp_avg_sq"] = velocities[i].clone()
            moments_test.append(s["exp_avg"])
            velocities_test.append(s["exp_avg_sq"])
        for c, g in zip(compute_params_test, grads):
            c.grad = g.clone()
        fused_optimizer.param_groups[0]["step"] = adam_step
        fused_optimizer.swa_param_groups[0]["n_averaged"] = swa_n_averaged
        fused_optimizer.step()

        # Ensure parameters are actually updated.
        for i, (p_gt, p_test, p_origin) in enumerate(
            zip(state_params_gt, state_params_test, state_params)
        ):
            self.assertFalse(torch.allclose(p_gt, p_origin, rtol=rtol, atol=atol))
            self.assertFalse(torch.allclose(p_test, p_origin, rtol=rtol, atol=atol))
        # Ensure FusedAdamSWA correctness.
        self.assertEqual(
            swa_model.averaged_model.n_averaged.item(),
            fused_optimizer.swa_param_groups[0]["n_averaged"],
        )
        for i, (p_test, p_gt) in enumerate(
            zip(
                chain(state_params_test, compute_params_test, swa_params_test),
                chain(state_params_gt, compute_params_gt, swa_params_gt),
            )
        ):
            self.assertTrue(torch.allclose(p_test, p_gt, rtol=rtol, atol=atol))
        # Ensure moments are updated correctly.
        for i, (m, m_gt) in enumerate(
            zip(
                chain(moments_test, velocities_test),
                chain(moments_gt, velocities_gt),
            )
        ):
            self.assertTrue(torch.allclose(m, m_gt, rtol=rtol, atol=atol))


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/openfold_triton/test_openfold_mha.py
================================================
import math
import random
from typing import Optional
import torch
import unittest

SKIP_TEST = None
try:
    from apex.contrib.openfold_triton import AttnTri as openfold_attention_triton
except ImportError as e:
    SKIP_TEST = e


def openfold_attention_eager(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.Tensor,
    bias: Optional[torch.Tensor],
    inf: float,
) -> torch.Tensor:
    # query:  [*, num_heads, Q, c_hidden]
    # key:    [*, num_heads, K, c_hidden]
    # value:  [*, num_heads, V, c_hidden]
    # mask:   Logit mask tensor broadcastable to [*, num_heads, Q, K]
    # bias:   Optional logit bias tensor broadcastable to [*, num_heads, Q, K]
    # inf:    Safe infinity value.
    # assuming K == V

    key = torch.swapdims(key, -2, -1)
    # key: [*, num_heads, c_hidden, K]

    scaling = 1.0 / math.sqrt(query.size(-1))
    a = torch.matmul(query * scaling, key)
    # a: [*, num_heads, Q, K]

    a += (mask - 1.0) * inf
    # a: [*, num_heads, Q, K]

    if bias is not None:
        a += bias
    # a: [*, num_heads, Q, K]

    a = torch.softmax(a, dim=-1)
    # a: [*, num_heads, Q, K]

    a = torch.matmul(a, value)
    # a: [*, num_heads, Q, c_hidden]

    return a


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class OpenfoldMhaTest(unittest.TestCase):
    def setUp(self, seed=1234):
        super().setUp()
        random.seed(seed)
        torch.manual_seed(seed)

    # representative workload in openfold
    def test_openfold_triton_mha(self, Z=256, H=4, N_CTX=256, D_HEAD=32, dtype=torch.float16):
        One = 1
        q = (
            torch.empty((One, Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda")
            .normal_(mean=0.1, std=0.2)
            .requires_grad_()
        )
        k = (
            torch.empty((One, Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda")
            .normal_(mean=0.4, std=0.2)
            .requires_grad_()
        )
        v = (
            torch.empty((One, Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda")
            .normal_(mean=0.3, std=0.2)
            .requires_grad_()
        )
        bias = (
            torch.empty((One, One, H, N_CTX, N_CTX), dtype=dtype, device="cuda")
            .normal_(mean=0.2, std=0.2)
            .requires_grad_()
        )
        mask = (
            torch.empty((One, N_CTX, One, One, N_CTX), device="cuda").normal_(mean=0, std=0.5) > 0
        )
        mask = mask.to(device=torch.device("cuda"), dtype=dtype).requires_grad_(False)

        dout = torch.randn_like(q)
        inf = 1e9

        # reference implementation
        ref_out = openfold_attention_eager(q, k, v, mask, bias, inf)
        ref_out.backward(dout)

        ref_dv, v.grad = v.grad.clone(), None
        ref_dk, k.grad = k.grad.clone(), None
        ref_dq, q.grad = q.grad.clone(), None
        ref_dbias, bias.grad = bias.grad.clone(), None

        # triton implementation
        tri_out = openfold_attention_triton(q, k, v, mask, bias, inf, torch.is_grad_enabled())
        tri_out.backward(dout)

        tri_dv, v.grad = v.grad.clone(), None
        tri_dk, k.grad = k.grad.clone(), None
        tri_dq, q.grad = q.grad.clone(), None
        tri_dbias, bias.grad = bias.grad.clone(), None

        # check results
        torch.testing.assert_close(ref_out, tri_out, atol=1e-2, rtol=0)
        torch.testing.assert_close(ref_dv, tri_dv, atol=1e-2, rtol=0)
        torch.testing.assert_close(ref_dk, tri_dk, atol=1e-2, rtol=0)
        torch.testing.assert_close(ref_dq, tri_dq, atol=1e-2, rtol=0)
        torch.testing.assert_close(ref_dbias, tri_dbias, atol=1e-2, rtol=0)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/openfold_triton/test_sync_triton_auto_tune_cache_across_gpus.py
================================================
import os

import torch
import torch.distributed as dist
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_distributed import (
    MultiProcessTestCase,
    requires_nccl,
    skip_if_lt_x_gpu,
)

from apex.contrib.openfold_triton import (
    LayerNormSmallShapeOptImpl,
    sync_triton_auto_tune_cache_across_gpus,
    _tuneable_triton_kernels,
)


class SyncTritonAutoTuneCacheTest(MultiProcessTestCase):
    device_type = "cuda"

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    def tearDown(self) -> None:
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        super().tearDown()

    @property
    def world_size(self) -> int:
        return min(torch.cuda.device_count(), 2)

    @property
    def init_method(self):
        return f"{common_utils.FILE_SCHEMA}{self.file_name}"

    @property
    def destroy_pg_upon_exit(self) -> bool:
        return True

    def _create_process_group_nccl(self):
        def maybe_export(env, val):
            if not type(env) == str:
                raise ValueError(f"Type of type of env is expected to be str, but got {type(env)}")
            if not type(val) == str:
                raise ValueError(f"Type of type of val is expected to be str, but got {type(val)}")
            if os.getenv(env) is None:
                os.environ[env] = val

        maybe_export("MASTER_PORT", "29500")
        maybe_export("MASTER_ADDR", "localhost")

        # create nccl processgroup for two ranks
        dist.init_process_group(
            "nccl",
            world_size=self.world_size,
            rank=self.rank,
        )
        pg = dist.distributed_c10d._get_default_group()
        return pg

    @requires_nccl()
    @skip_if_lt_x_gpu(1)
    def test_sync_triton_auto_tune_cache_across_gpus(self):
        pg = self._create_process_group_nccl()
        device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}")
        torch.cuda.set_device(device)

        if self.rank == 0:
            eps = 1e-5
            normalized_shape = (
                128,
                64,
            )

            weight = torch.ones(normalized_shape, device=device, requires_grad=True)
            bias = torch.zeros(normalized_shape, device=device, requires_grad=True)

            x = torch.randn(
                (
                    2,
                    2,
                )
                + normalized_shape,
                device=device,
            )
            y = LayerNormSmallShapeOptImpl.apply(x, normalized_shape, weight, bias, eps)
            l = torch.sum(y)
            l.backward()

        sync_triton_auto_tune_cache_across_gpus(strict=False, verbose=True)

        caches_synced = 0
        for func_name, func in _tuneable_triton_kernels.items():
            if len(func.cache) > 0:
                caches_synced = caches_synced + 1
                print(
                    f"caches were synchronized for {func_name} at rank = {self.rank}:",
                    func.cache,
                )

        self.assertTrue(caches_synced > 0)


if __name__ == "__main__":
    run_tests()


================================================
FILE: apex/contrib/test/optimizers/__init__.py
================================================


================================================
FILE: apex/contrib/test/optimizers/test_dist_adam.py
================================================
from contextlib import contextmanager
import io
from typing import Callable, Optional
import unittest
import warnings
from contextlib import nullcontext

import torch
from torch.testing._internal import common_utils

SKIP_TEST = None
try:
    from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam
except ImportError as e:
    SKIP_TEST = e
from apex.distributed_testing.distributed_test_base import NcclDistributedTestBase


class SimpleModel(torch.nn.Module):
    def __init__(self, num_layers, size):
        super().__init__()
        self.params = torch.nn.ParameterList(
            [torch.nn.Parameter(torch.rand(1, size) + 1) for _ in range(num_layers)]
        )

    def forward(self, x):
        y = 0
        for i, param in enumerate(self.params):
            y += (i + 1) * param * x
        return y


def make_models(
    num_layers: int,
    size: int,
    *,
    lr: float = 0.1,
    adam_w_mode: bool = True,
    model_dtype: torch.dtype = torch.float32,
    optim_dtype: Optional[torch.dtype] = None,
    grad_sync_dtype: Optional[torch.dtype] = None,
    param_sync_dtype: Optional[torch.dtype] = None,
    device: torch.device = "cuda",
    process_group: Optional[torch.distributed.ProcessGroup] = None,
    average_grad_sync: bool = True,
    overlap_communication: bool = True,
    bucket_cap_mb: float = 71 / (4 * 1024 * 1024),
    contiguous_buffers: bool = False,
    store_params: bool = False,
    store_param_remainders: bool = False,
    with_scaled_states: bool = False,
    nccl_ub: bool = False,
    with_cuda_graph: bool = False,
):
    # Construct models with same parameters
    ref_model = SimpleModel(num_layers, size).to(dtype=model_dtype, device=device)
    dist_model = SimpleModel(num_layers, size).to(dtype=model_dtype, device=device)
    with torch.no_grad():
        for ref_param, dist_param in zip(dist_model.parameters(), ref_model.parameters()):
            dist_param.copy_(ref_param)

    # Initialize reference model with data-parallelism
    rank = torch.distributed.get_rank()
    ref_model = torch.nn.parallel.DistributedDataParallel(
        ref_model,
        device_ids=[rank] if device == "cuda" else None,
        output_device=rank if device == "cuda" else None,
        process_group=process_group,
    )

    # Construct optimizers with same hyperparameters
    if optim_dtype is None:
        optim_dtype = model_dtype
    optim_args = dict(lr=lr, betas=(0.1, 0.2), eps=0.25, weight_decay=0.1)
    ref_optim_class = torch.optim.AdamW if adam_w_mode else torch.optim.Adam
    ref_optim = ref_optim_class(
        [
            {"params": list(ref_model.parameters())[1::2], "lr": lr * 2},
            {"params": list(ref_model.parameters())[0::2]},
        ],
        **optim_args,
    )
    dist_optim = DistributedFusedAdam(
        [
            {"params": list(dist_model.parameters())[1::2], "lr": lr * 2},
            {"params": list(dist_model.parameters())[0::2]},
        ],
        adam_w_mode=adam_w_mode,
        overlap_grad_sync=overlap_communication,
        overlap_param_sync=overlap_communication,
        bucket_cap_mb=bucket_cap_mb,
        dtype=optim_dtype,
        grad_sync_dtype=grad_sync_dtype,
        param_sync_dtype=param_sync_dtype,
        process_group=process_group,
        average_grad_sync=average_grad_sync,
        contiguous_param_buffer=contiguous_buffers,
        contiguous_grad_buffer=contiguous_buffers,
        store_params=store_params,
        store_param_remainders=store_param_remainders,
        with_scaled_states=with_scaled_states,
        nccl_ub=nccl_ub,
        capturable=with_cuda_graph,
        **optim_args,
    )

    return ref_model, ref_optim, dist_model, dist_optim


@contextmanager
def dummy_context():
    try:
        yield
    finally:
        pass


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class TestDistributedFusedAdam(NcclDistributedTestBase):
    seed = 1234

    def test_matches_pytorch(
        self,
        rtol: Optional[float] = None,
        atol: Optional[float] = None,
        num_layers: int = 11,
        layer_size: int = 7,
        batch_size: int = 3,
        num_steps: int = 3,
        micro_batch_steps: int = 3,
        adam_w_mode: bool = True,
        overlap_communication: bool = True,
        use_nosync: bool = True,
        model_dtype: torch.dtype = torch.float32,
        optim_dtype: Optional[torch.dtype] = None,
        grad_sync_dtype: Optional[torch.dtype] = None,
        param_sync_dtype: Optional[torch.dtype] = None,
        device: torch.device = "cuda",
        bucket_cap_mb: float = 71 / (4 * 1024 * 1024),
        contiguous_buffers: bool = False,
        store_params: bool = False,
        store_param_remainders: bool = False,
        with_scaled_states: bool = False,
        nccl_ub: bool = False,
        init_optim_func: Optional[Callable[[DistributedFusedAdam], None]] = None,
        with_cuda_graph: bool = False,
    ):
        torch.manual_seed(self.seed + self.rank)

        # Identical models with data-parallel and ZeRO
        stream = torch.cuda.Stream()
        with torch.cuda.stream(stream):
            ref_model, ref_optim, dist_model, dist_optim = make_models(
                num_layers,
                layer_size,
                adam_w_mode=adam_w_mode,
                model_dtype=model_dtype,
                optim_dtype=optim_dtype,
                grad_sync_dtype=grad_sync_dtype,
                param_sync_dtype=param_sync_dtype,
                device=device,
                overlap_communication=overlap_communication,
                bucket_cap_mb=bucket_cap_mb,
                contiguous_buffers=contiguous_buffers,
                store_params=store_params,
                store_param_remainders=store_param_remainders,
                with_scaled_states=with_scaled_states,
                nccl_ub=nccl_ub,
                with_cuda_graph=with_cuda_graph,
            )

        # Initialize distributed optimizer
        if init_optim_func is not None:
            with torch.cuda.stream(stream):
                init_optim_func(dist_optim)

        # Static data
        static_xs, static_dys = [], []
        ys_ref, grad_xs_ref = [], []
        ys_dist, grad_xs_dist = [], []

        graph = torch.cuda.CUDAGraph() if with_cuda_graph else None
        CAPTURE_ITERATION = 11
        if with_cuda_graph:
            assert num_steps > CAPTURE_ITERATION + 3, "Not enough iterations for CUDA graph test."

        # Training loop
        with torch.cuda.stream(stream):
            for step in range(num_steps):
                # Synthetic data
                for micro_step in range(micro_batch_steps):
                    x = torch.rand(batch_size, layer_size) - 0.5
                    dy = torch.rand_like(x) - 0.5
                    x = x.to(dtype=model_dtype, device=device)
                    dy = dy.to(dtype=model_dtype, device=device)
                    if step == 0:
                        static_xs.append(x)
                        static_dys.append(dy)
                    else:
                        static_xs[micro_step].copy_(x)
                        static_dys[micro_step].copy_(dy)

                # Reference implementation
                ref_optim.zero_grad()
                for micro_step in range(micro_batch_steps):
                    x, dy = static_xs[micro_step], static_dys[micro_step]

                    x_ref = x.detach().clone().requires_grad_(True)
                    y_ref = ref_model(x_ref)
                    y_ref.backward(dy)

                    if step == 0:
                        ys_ref.append(y_ref)
                        grad_xs_ref.append(x_ref.grad)
                    else:
                        with torch.no_grad():
                            ys_ref[micro_step].copy_(y_ref)
                            grad_xs_ref[micro_step].copy_(x_ref.grad)
                ref_optim.step()

                # Distributed implementation
                if not with_cuda_graph or step <= CAPTURE_ITERATION:
                    if with_cuda_graph and step == CAPTURE_ITERATION:
                        ctx = torch.cuda.graph(graph)
                        torch.cuda.synchronize()
                    else:
                        ctx = nullcontext()

                    with ctx:
                        dist_optim.zero_grad()
                        for micro_step in range(micro_batch_steps):
                            x, dy = static_xs[micro_step], static_dys[micro_step]

                            x_dist = x.detach().clone().requires_grad_(True)
                            y_dist = dist_model(x_dist)
                            backward_context = dummy_context
                            if use_nosync and micro_step < micro_batch_steps - 1:
                                backward_context = dist_optim.no_sync
                            with backward_context():
                                y_dist.backward(dy)

                            if step == 0:
                                ys_dist.append(y_dist)
                                grad_xs_dist.append(x_dist.grad)
                            else:
                                with torch.no_grad():
                                    ys_dist[micro_step].copy_(y_dist)
                                    grad_xs_dist[micro_step].copy_(x_dist.grad)
                        dist_optim.step()

                    if with_cuda_graph and step == CAPTURE_ITERATION:
                        graph.replay()
                else:
                    graph.replay()

                # Check that data tensors match
                for mbs in range(micro_batch_steps):
                    torch.testing.assert_close(ys_dist[mbs], ys_ref[mbs], rtol=rtol, atol=atol)
                    torch.testing.assert_close(
                        grad_xs_dist[mbs], grad_xs_ref[mbs], rtol=rtol, atol=atol
                    )

                # Check that parameters match
                for ref_param, dist_param in zip(ref_model.parameters(), dist_model.parameters()):
                    torch.testing.assert_close(dist_param, ref_param, rtol=rtol, atol=atol)

    def test_matches_pytorch_l2_reg(self):
        self.test_matches_pytorch(adam_w_mode=False)

    def test_matches_pytorch_no_overlap(self):
        self.test_matches_pytorch(
            overlap_communication=False,
            use_nosync=False,
        )

    def test_matches_pytorch_sync_every_step(self):
        self.test_matches_pytorch(use_nosync=False)

    def test_matches_pytorch_contiguous_buffers(self):
        self.test_matches_pytorch(contiguous_buffers=True)

    def test_matches_pytorch_fp64(self):
        self.test_matches_pytorch(
            rtol=1.3e-6,
            atol=1e-5,
            model_dtype=torch.float64,
            optim_dtype=torch.float32,
        )

    def test_matches_pytorch_fp16(self):
        self.test_matches_pytorch(
            rtol=5e-3,
            atol=1e-5,
            micro_batch_steps=1,
            model_dtype=torch.float16,
            optim_dtype=torch.float16,
        )

    def test_matches_pytorch_bf16(self):
        self.test_matches_pytorch(
            rtol=5e-2,
            atol=1e-5,
            micro_batch_steps=1,
            model_dtype=torch.bfloat16,
            optim_dtype=torch.bfloat16,
        )

    def test_matches_pytorch_fp16_params(self):
        self.test_matches_pytorch(
            rtol=5e-3,
            atol=1e-5,
            micro_batch_steps=1,
            model_dtype=torch.float16,
            optim_dtype=torch.float32,
            param_sync_dtype=torch.float16,
            store_params=True,
        )

    def test_matches_pytorch_bf16_grads(self):
        self.test_matches_pytorch(
            rtol=5e-2,
            atol=1e-5,
            micro_batch_steps=1,
            model_dtype=torch.float32,
            optim_dtype=torch.float32,
            grad_sync_dtype=torch.bfloat16,
        )

    def test_matches_pytorch_bf16_param_remainders(self):
        self.test_matches_pytorch(
            rtol=5e-2,
            atol=1e-5,
            micro_batch_steps=1,
            model_dtype=torch.bfloat16,
            optim_dtype=torch.float32,
            param_sync_dtype=torch.bfloat16,
            store_params=False,
            store_param_remainders=True,
        )

    def test_matches_pytorch_multi_dtypes(self):
        def init_optim(optim: DistributedFusedAdam):
            params = list(optim.parameters())
            optim.init_params(params[0::3], grad_sync_dtype=torch.bfloat16)
            optim.init_params(params[1::3], param_sync_dtype=torch.bfloat16)

        self.test_matches_pytorch(
            rtol=5e-2,
            atol=1e-5,
            init_optim_func=init_optim,
        )

    def test_matches_pytorch_int64_param_sync(self):
        self.test_matches_pytorch(
            param_sync_dtype=torch.int64,
        )

    def test_matches_pytorch_int32_param_sync_contiguous_buffers(self):
        self.test_matches_pytorch(
            param_sync_dtype=torch.int32,
            contiguous_buffers=True,
        )

    def test_matches_pytorch_uint8_param_sync(self):
        self.test_matches_pytorch(
            rtol=0.5,
            atol=0.05,
            model_dtype=torch.float16,
            optim_dtype=torch.float16,
            micro_batch_steps=1,
            param_sync_dtype=torch.uint8,
        )

    def test_matches_pytorch_scaled_state(self):
        self.test_matches_pytorch(
            rtol=5e-2,
            atol=1e-5,
            micro_batch_steps=1,
            model_dtype=torch.bfloat16,
            optim_dtype=torch.float16,
            param_sync_dtype=torch.int,
            store_params=True,
            with_scaled_states=True,
        )

    def test_matches_pytorch_nccl_ub(self):
        self.test_matches_pytorch(
            contiguous_buffers=True,
            nccl_ub=True,
        )

    def test_raises_on_mismatch(self):
        torch.manual_seed(self.seed + self.rank)

        # Identical models with data-parallel and ZeRO
        num_layers = 11
        layer_size = 7
        ref_model, ref_optim, dist_model, dist_optim = make_models(
            num_layers,
            layer_size,
        )

        # Only perform training step with distributed model
        dist_optim.zero_grad()
        x = torch.rand(3, layer_size) - 0.5
        x = x.to(dtype=torch.float32, device="cuda")
        dy = torch.rand_like(x) - 0.5
        y = dist_model(x)
        y.backward(dy)
        dist_optim.step()

        # Check that parameters do not match
        for ref_param, dist_param in zip(ref_model.parameters(), dist_model.parameters()):
            self.assertRaises(
                AssertionError,
                torch.testing.assert_close,
                dist_param,
                ref_param,
            )

    def test_clip_grad_norm(self):
        torch.manual_seed(self.seed + self.rank)

        # Identical models with data-parallel and ZeRO
        ref_model, ref_optim, dist_model, dist_optim = make_models(1, 1)

        # Training steps with pre-determined gradients
        xs = [3, 1, 4, 1, 5, 9]
        dys = [1, -1, 1, -1, 1, -1]
        for x, dy in zip(xs, dys):
            x = torch.tensor([[x]], dtype=torch.float32, device="cuda")
            dy = torch.tensor([[dy]], dtype=torch.float32, device="cuda")

            # Reference implementation
            ref_optim.zero_grad()
            y_ref = ref_model(x.detach())
            y_ref.backward(dy.detach())
            ref_grad_norm = torch.nn.utils.clip_grad_norm_(ref_model.parameters(), 3.5)
            ref_optim.step()

            # Distributed implementation
            dist_optim.zero_grad()
            y_dist = dist_model(x.detach())
            y_dist.backward(dy.detach())
            dist_grad_norm = dist_optim.clip_grad_norm(3.5)
            dist_optim.step()

            # Check that parameters match
            torch.testing.assert_close(dist_grad_norm, ref_grad_norm)
            for ref_param, dist_param in zip(ref_model.parameters(), dist_model.parameters()):
                torch.testing.assert_close(dist_param, ref_param)

    def test_grad_scaler(self):
        torch.manual_seed(self.seed + self.rank)

        # Identical models with data-parallel and ZeRO
        ref_model, ref_optim, dist_model, dist_optim = make_models(1, 1)
        grad_scaler_args = dict(
            init_scale=3.21,
            growth_factor=1.23,
            backoff_factor=0.876,
            growth_interval=1,
        )
        ref_scaler = torch.amp.GradScaler("cuda", **grad_scaler_args)
        dist_scaler = torch.amp.GradScaler("cuda", **grad_scaler_args)

        # Training steps with pre-determined gradients
        xs = [3, 1, 4, 1, 5, 9]
        dys = [1, float("inf"), 1, 1, float("nan"), -1]
        for x, dy in zip(xs, dys):
            x = torch.tensor([[x]], dtype=torch.float32, device="cuda")
            dy = torch.tensor([[dy]], dtype=torch.float32, device="cuda")

            # Reference implementation
            ref_optim.zero_grad()
            y_ref = ref_model(x.detach())
            ref_scaler.scale(y_ref).backward(dy.detach())
            ref_scaler.step(ref_optim)
            ref_scaler.update()

            # Distributed implementation
            dist_optim.zero_grad()
            y_dist = dist_model(x.detach())
            dist_scaler.scale(y_dist).backward(dy.detach())
            dist_scaler.step(dist_optim)
            dist_scaler.update()

            # Check that parameters match
            for ref_param, dist_param in zip(ref_model.parameters(), dist_model.parameters()):
                torch.testing.assert_close(dist_param, ref_param)

    def test_checkpoint(
        self,
        rtol: Optional[float] = None,
        atol: Optional[float] = None,
        num_layers: int = 2,
        layer_size: int = 2,
        num_steps: int = 3,
        save_group_size: Optional[int] = None,
        load_group_size: Optional[int] = None,
        save_model_kwargs: Optional[dict] = None,
        load_model_kwargs: Optional[dict] = None,
    ):
        """Test state_dict and load_state_dict functions

        Two models are constructed, possibly on different process
        groups. One of the models is trained for a few steps, a
        checkpoint is saved, and the checkpoint is loaded on the other
        model. Both models are then trained for a few steps and
        checked to make sure that they produce identical results.

        Arguments:
            rtol (float): Relative tolerance for numerical checks (see
                torch.allclose).
            atol (float): Absolute tolerance for numerical checks (see
                torch.allclose).
            num_layers (int): Number of layers in test model.
            layer_size (int): Number of features in model layers.
            num_steps (int): Number of training steps to perform
                before and after checkpointing.
            save_group_size (int): Process group size for model that
                saves the checkpoint. Uses the default process group
                by default.
            load_group_size (int): Process group size for model that
                loads the checkpoint. Uses the default process group
                by default.
            save_model_kwargs (dict): keyword arguments passed to
                make_models when constructing the model that saves the
                checkpoint.
            load_model_kwargs (dict): keyword arguments passed to
                make_models when constructing the model that loads the
                checkpoint.

        """

        # Initialize process groups
        world_size = torch.distributed.get_world_size()
        if save_group_size is None:
            save_group_size = world_size
            save_group = None
        else:
            if save_group_size > world_size:
                self.skipTest(f"Requires {save_group_size} ranks, found {world_size}")
            save_ranks = list(range(save_group_size))
            save_group = torch.distributed.new_group(ranks=save_ranks)
        if load_group_size is None:
            load_group_size = world_size
            load_group = None
        else:
            if load_group_size > world_size:
                self.skipTest(f"Requires {load_group_size} ranks, found {world_size}")
            load_ranks = list(range(load_group_size))
            load_group = torch.distributed.new_group(ranks=load_ranks)

        # Construct two models with same config and different params
        torch.manual_seed(self.seed)
        if self.rank < save_group_size:
            if not save_model_kwargs:
                save_model_kwargs = {}
            _, _, model_save, optim_save = make_models(
                num_layers,
                layer_size,
                lr=0.1,
                process_group=save_group,
                average_grad_sync=False,
                overlap_communication=False,
                **save_model_kwargs,
            )
            optim_save.init_params(reversed(list(model_save.parameters())))
        torch.manual_seed(self.seed + 1)
        if self.rank < load_group_size:
            if not load_model_kwargs:
                load_model_kwargs = {}
            _, _, model_load, optim_load = make_models(
                num_layers,
                layer_size,
                lr=1234.0,
                process_group=load_group,
                average_grad_sync=False,
                overlap_communication=False,
                **load_model_kwargs,
            )
            optim_load.init_params(list(model_load.parameters()))

        batch_size = 2 * save_group_size * load_group_size

        def make_global_batch() -> torch.Tensor:
            """Generate random tensor on root rank and broadcast"""
            x = torch.empty(batch_size, layer_size, device="cuda")
            if self.rank == 0:
                torch.rand(x.size(), out=x)
                x -= 0.5
            torch.distributed.broadcast(x, src=0)
            return x

        def to_local_batch(
            global_batch: torch.Tensor,
            group: Optional[torch.distributed.ProcessGroup],
        ) -> Optional[torch.Tensor]:
            """Get local portion of tensor that is replicated across all ranks"""
            group_size = torch.distributed.get_world_size(group)
            if group_size < 0:
                return None
            local_batch_size = batch_size // group_size
            batch_start = self.rank * local_batch_size
            batch_end = (self.rank + 1) * local_batch_size
            return global_batch[batch_start:batch_end, ...]

        def to_global_batch(
            local_batch: torch.Tensor,
            group: Optional[torch.distributed.ProcessGroup],
        ) -> torch.Tensor:
            """Gather distributed tensor and broadcast to all ranks"""

            # Allocate buffer
            global_batch = torch.empty(batch_size, layer_size, device="cuda")

            # Gather data on root rank
            group_size = torch.distributed.get_world_size(group)
            if group_size > 0:
                local_batches = None
                if self.rank == 0:
                    local_batch_size = batch_size // group_size
                    local_batches = [
                        global_batch[rank * local_batch_size : (rank + 1) * local_batch_size, ...]
                        for rank in range(group_size)
                    ]
                torch.distributed.gather(
                    local_batch,
                    local_batches,
                    dst=0,
                    group=group,
                )

            # Broadcast data to all ranks
            torch.distributed.broadcast(global_batch, src=0)
            return global_batch

        # Train one of the models
        torch.manual_seed(self.seed + 2)
        for step in range(num_steps):
            if self.rank < save_group_size:
                optim_save.zero_grad()
            x = make_global_batch()
            dy = make_global_batch()
            if self.rank < save_group_size:
                x = to_local_batch(x, save_group)
                dy = to_local_batch(dy, save_group)
                y = model_save(x)
                y.backward(dy)
                optim_save.step()

        # Make sure models are different
        if self.rank < min(save_group_size, load_group_size):
            for param_save, param_load in zip(model_save.parameters(), model_load.parameters()):
                self.assertRaises(
                    AssertionError,
                    torch.testing.assert_close,
                    param_load,
                    param_save,
                    rtol=rtol,
                    atol=atol,
                )

        # Save state
        state_bytes = None
        if self.rank < save_group_size:
            state_dict = {
                "model": model_save.state_dict(),
                "optim": optim_save.state_dict(),
            }
            byte_stream = io.BytesIO()
            torch.save(state_dict, byte_stream)
            state_bytes = byte_stream.getvalue()

        # Broadcast state from root rank and load
        if self.rank < load_group_size:
            if load_group_size != save_group_size:
                if self.rank != 0:
                    state_bytes = None
                state_bytes = [state_bytes]
                torch.distributed.broadcast_object_list(
                    state_bytes,
                    src=0,
                    group=load_group,
                )
                state_bytes = state_bytes[0]
            state_dict = torch.load(io.BytesIO(state_bytes))
            model_load.load_state_dict(state_dict["model"])
            optim_load.load_state_dict(state_dict["optim"])

        # Make sure models are identical
        if self.rank < min(save_group_size, load_group_size):
            for param_save, param_load in zip(model_save.parameters(), model_load.parameters()):
                torch.testing.assert_close(param_load, param_save, rtol=rtol, atol=atol)

        # Train both models
        torch.manual_seed(self.seed + 3)
        for step in range(num_steps):
            # Reset grads
            if self.rank < save_group_size:
                optim_save.zero_grad()
            if self.rank < load_group_size:
                optim_load.zero_grad()

            # Synthetic data
            x = make_global_batch()
            dy = make_global_batch()

            # Training step for model that saved checkpoint
            y_save = None
            dx_save = None
            if self.rank < save_group_size:
                x_save = to_local_batch(x, save_group)
                x_save = x_save.detach().clone().requires_grad_(True)
                dy_save = to_local_batch(dy, save_group)
                y_save = model_save(x_save)
                y_save.backward(dy_save)
                dx_save = x_save.grad
            y_save = to_global_batch(y_save, save_group)
            dx_save = to_global_batch(dx_save, save_group)

            # Training step for model that loaded checkpoint
            y_load = None
            dx_load = None
            if self.rank < load_group_size:
                x_load = to_local_batch(x, load_group)
                x_load = x_load.detach().clone().requires_grad_(True)
                dy_load = to_local_batch(dy, load_group)
                y_load = model_load(x_load)
                y_load.backward(dy_load)
                dx_load = x_load.grad
            y_load = to_global_batch(y_load, load_group)
            dx_load = to_global_batch(dx_load, load_group)

            # Check that data tensors match
            torch.testing.assert_close(y_load, y_save, rtol=rtol, atol=atol)
            torch.testing.assert_close(dx_load, dx_save, rtol=rtol, atol=atol)

            # Optimizer step
            if self.rank < save_group_size:
                optim_save.step()
            if self.rank < load_group_size:
                optim_load.step()

            # Check that parameters match
            if self.rank < min(save_group_size, load_group_size):
                for param_save, param_load in zip(model_save.parameters(), model_load.parameters()):
                    torch.testing.assert_close(
                        param_load,
                        param_save,
                        rtol=rtol,
                        atol=atol,
                    )

    def test_checkpoint_save_1gpu(self):
        """Test loading checkpoint with one GPU"""
        self.test_checkpoint(save_group_size=1)

    def test_checkpoint_load_1gpu(self):
        """Test saving checkpoint with one GPU"""
        self.test_checkpoint(load_group_size=1)

    def test_checkpoint_bf16(self):
        """Test checkpoint with BF16 model"""
        self.test_checkpoint(
            rtol=5e-2,
            atol=1e-5,
            save_model_kwargs=dict(
                model_dtype=torch.bfloat16,
                optim_dtype=torch.float32,
                param_sync_dtype=torch.bfloat16,
                store_params=False,
                store_param_remainders=True,
            ),
            load_model_kwargs=dict(
                model_dtype=torch.bfloat16,
                optim_dtype=torch.float32,
                param_sync_dtype=torch.bfloat16,
                store_params=False,
                store_param_remainders=True,
            ),
        )

    def test_checkpoint_scaled_state(self):
        """Test checkpoint with scaled FP16 state"""
        self.test_checkpoint(
            rtol=5e-2,
            atol=1e-5,
            save_model_kwargs=dict(
                model_dtype=torch.bfloat16,
                optim_dtype=torch.float16,
                param_sync_dtype=torch.int,
                store_params=True,
                with_scaled_states=True,
            ),
            load_model_kwargs=dict(
                model_dtype=torch.bfloat16,
                optim_dtype=torch.float16,
                param_sync_dtype=torch.int,
                store_params=True,
                with_scaled_states=True,
            ),
        )

    def test_bucket_low_utilization_warning(self):
        """Test warning when bucket utilization is low"""
        layer_size = 2 * 1024 * 1024
        num_layers = 4
        fairish_bucket_cap_mb = 4 * num_layers * layer_size / (1024 * 1024)

        # Check that warning is raised when bucket utilization is low
        with self.assertWarnsRegex(Warning, ".*Consider decreasing the bucket_cap_mb argument."):
            self.test_matches_pytorch(
                num_layers=num_layers,
                layer_size=layer_size,
                overlap_communication=False,
                bucket_cap_mb=fairish_bucket_cap_mb * 2,
            )

        # Check that warning is not raised when bucket utilization is high
        with warnings.catch_warnings(record=True) as warns:
            self.test_matches_pytorch(
                num_layers=num_layers,
                layer_size=layer_size,
                overlap_communication=False,
                bucket_cap_mb=fairish_bucket_cap_mb,
            )
            for w in warns:
                self.assertNotRegex(
                    str(w.message), ".*Consider decreasing the bucket_cap_mb argument."
                )

    def test_cuda_graph(self):
        """Test distributed adam with CUDA graph"""
        if self.world_size <= 8:
            self.skipTest(f"{self.world_size=} is expected to be >= 8")
        self.test_matches_pytorch(
            rtol=5e-3,
            atol=1e-5,
            num_steps=15,
            micro_batch_steps=1,
            model_dtype=torch.float16,
            optim_dtype=torch.float16,
            contiguous_buffers=True,
            with_cuda_graph=True,
        )


if __name__ == "__main__":
    # Assume script has been run with torchrun
    common_utils.run_tests()


================================================
FILE: apex/contrib/test/optimizers/test_distributed_fused_lamb.py
================================================
import inspect

import torch
from torch.cuda.amp import GradScaler
from torch.testing._internal import common_utils
from torch.distributed.distributed_c10d import _coalescing_manager

from apex.contrib.optimizers.distributed_fused_lamb import DistributedFusedLAMB
from apex.distributed_testing.distributed_test_base import NcclDistributedTestBase


def flat_dist_call(param_list: list[torch.Tensor], op, args):
    with _coalescing_manager(async_ops=True) as cm:
        for p in param_list:
            op(p, *args)

    cm.wait()


def get_init_weights_func():
    @torch.no_grad()
    def init_weights(m):
        if isinstance(m, torch.nn.Linear):
            m.weight.fill_(1.0)

    return init_weights


class ModelFoo(torch.nn.Module):
    def __init__(self):
        super(ModelFoo, self).__init__()
        self.linear = torch.nn.Linear(128, 128, bias=False)
        self.loss = torch.nn.MSELoss()

    def forward(self, input_tensor, gt):
        y = self.linear(input_tensor)
        loss = self.loss(y, gt)
        return loss


# A test for distributed fused Lamb optimizer: run several iterations and see if loss decreases
# There are two instances of the same test because based on `world_size` the optimizer decides what collectives operation to use.
# If torch.distributed.get_world_size() == torch.cuda.device_count() it uses only `all_gather`.
# If torch.distributed.get_world_size() < torch.cuda.device_count() it uses both `all_gather` and `reduce_scatter`.
class NcclDistributedFusedLAMB(NcclDistributedTestBase):
    @property
    def world_size(self) -> int:
        return torch.cuda.device_count()

    @common_utils.parametrize("no_copy", [False, True])
    @common_utils.parametrize(
        "opt_kwargs",
        [
            dict(
                overlap_reductions=True,
                dwu_num_blocks=2,
                dwu_num_chunks=2,
                fused_norm=False,
                fuse_scale=False,
                clip_after_ar=True,
                full_ar=False,
            ),
            dict(
                overlap_reductions=False,
                dwu_num_blocks=1,
                dwu_num_chunks=1,
                fused_norm=True,
                fuse_scale=True,
                clip_after_ar=False,
            ),
        ],
    )
    def test_distributed_fused_lamb(self, no_copy, opt_kwargs):
        if (
            no_copy
            and "no_copy" not in inspect.getfullargspec(torch.distributed.reduce_scatter).args
        ):
            self.skipTest("does not support no_copy")
        if no_copy and "no_copy" not in inspect.getfullargspec(torch.distributed.all_gather).args:
            self.skipTest("does not support no_copy")

        assert torch.distributed.is_initialized()
        gpu_count = torch.distributed.get_world_size()

        init_scale = 100
        lr = torch.tensor(0.1).cuda()
        grad_scaler = GradScaler(init_scale=init_scale, growth_interval=1000)

        model = ModelFoo()
        model = model.cuda().half()
        model.apply(get_init_weights_func())

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "gamma", "beta", "LayerNorm"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        if "full_ar" not in opt_kwargs:
            opt_kwargs["full_ar"] = gpu_count == torch.cuda.device_count()

        # Aidyn-A: not sure what parameters are the best for testing purposes,
        # setting up whatever I think appropriate.
        optimizer = DistributedFusedLAMB(
            optimizer_grouped_parameters,
            lr=0.1,
            betas=(0.9, 0.9),
            eps=1e-6,
            max_grad_norm=1.0,
            dwu_group_size=gpu_count,
            dwu_num_rs_pg=1,
            dwu_num_ar_pg=1,
            dwu_num_ag_pg=1,
            use_nvlamb=False,
            set_param_views_to_flat_buffer=False,
            e5m2_allgather=False,
            **opt_kwargs,
        )
        optimizer.set_global_scale(init_scale)

        optimizer._reduce_scatter_no_copy = no_copy
        optimizer._all_gather_no_copy = no_copy

        flat_dist_call(
            [param.data for param in model.parameters()],
            torch.distributed.broadcast,
            (0,),
        )

        x = torch.randn(4096, 128, dtype=torch.float16).cuda()
        y = torch.randn(4096, 128, dtype=torch.float16).cuda()

        losses = []
        for _ in range(10):
            loss = model(x, y)
            optimizer._lazy_init_stage1()
            grad_scaler.scale(loss).backward()
            optimizer._lazy_init_stage2()
            optimizer._lr = lr
            optimizer.complete_reductions()
            optimizer.set_global_scale(grad_scaler._get_scale_async())
            grad_scaler.step(optimizer)
            grad_scaler.update()
            optimizer.zero_grad(set_to_none=True)

            losses.append(loss.item())

        self.assertTrue(losses == sorted(losses, reverse=True))


common_utils.instantiate_parametrized_tests(NcclDistributedFusedLAMB)


class NcclDistributedFusedLAMB_partial_ar(NcclDistributedFusedLAMB):
    @property
    def world_size(self) -> int:
        return max(torch.cuda.device_count() - 1, 1)


if __name__ == "__main__":
    common_utils.run_tests()


================================================
FILE: apex/contrib/test/peer_memory/__init__.py
================================================


================================================
FILE: apex/contrib/test/peer_memory/test_peer_halo_exchange_module.py
================================================
import unittest

import torch
from torch.testing._internal import common_utils

SKIP_TEST = None
from apex.distributed_testing.distributed_test_base import NcclDistributedTestBase

try:
    from apex.contrib.peer_memory import PeerMemoryPool, PeerHaloExchanger1d
except ImportError as e:
    SKIP_TEST = e

# How to run:
# python /path/to/test_peer_halo_exchange_module.py


# Output of this function is used as ground truth in module tests.
def nccl_halo_ex(peer_rank, peer_group_size, y, half_halo, explicit_nhwc, H_split):
    if explicit_nhwc:
        if H_split:
            _, Hp, _, _ = list(y.shape)
            H = Hp - 2 * half_halo
            top_out_halo = y[:, half_halo : 2 * half_halo, :, :]
            top_inp_halo = y[:, :half_halo, :, :]
            btm_out_halo = y[:, H : H + half_halo, :, :]
            btm_inp_halo = y[:, H + half_halo : H + 2 * half_halo, :, :]
        else:
            _, _, Wp, _ = list(y.shape)
            W = Wp - 2 * half_halo
            top_out_halo = y[:, :, half_halo : 2 * half_halo, :]
            top_inp_halo = y[:, :, :half_halo, :]
            btm_out_halo = y[:, :, W : W + half_halo, :]
            btm_inp_halo = y[:, :, W + half_halo : W + 2 * half_halo, :]
    else:
        if H_split:
            _, _, Hp, _ = list(y.shape)
            H = Hp - 2 * half_halo
            top_out_halo = y[:, :, half_halo : 2 * half_halo, :]
            top_inp_halo = y[:, :, :half_halo, :]
            btm_out_halo = y[:, :, H : H + half_halo, :]
            btm_inp_halo = y[:, :, H + half_halo : H + 2 * half_halo, :]
        else:
            _, _, _, Wp = list(y.shape)
            W = Wp - 2 * half_halo
            top_out_halo = y[:, :, :, half_halo : 2 * half_halo]
            top_inp_halo = y[:, :, :, :half_halo]
            btm_out_halo = y[:, :, :, W : W + half_halo]
            btm_inp_halo = y[:, :, :, W + half_halo : W + 2 * half_halo]

    mf = (
        torch.channels_last
        if y.is_contiguous(memory_format=torch.channels_last)
        else torch.contiguous_format
    )
    top_out_halo = top_out_halo.contiguous()
    btm_out_halo = btm_out_halo.contiguous()

    top_inp_halos = [torch.empty_like(top_out_halo) for _ in range(peer_group_size)]
    torch.distributed.all_gather(top_inp_halos, top_out_halo)
    btm_inp_halos = [torch.empty_like(btm_out_halo) for _ in range(peer_group_size)]
    torch.distributed.all_gather(btm_inp_halos, btm_out_halo)
    top_rank = (peer_rank + peer_group_size - 1) % peer_group_size
    btm_rank = (peer_rank + 1) % peer_group_size
    if peer_rank == 0:
        top_inp_halo.zero_()
    else:
        top_inp_halo.copy_(btm_inp_halos[top_rank].to(memory_format=mf))
    if peer_rank == peer_group_size - 1:
        btm_inp_halo.zero_()
    else:
        btm_inp_halo.copy_(top_inp_halos[btm_rank].to(memory_format=mf))


def single_test(
    peer_rank,
    peer_group_size,
    halo_ex,
    C,
    H,
    W,
    half_halo,
    dtype,
    memory_format,
    H_split,
    num_steps,
    numSM=1,
):
    if memory_format == 1:
        # 1 -> explicit nhwc
        explicit_nhwc = True
        if H_split:
            y = torch.randn([1, H + 2 * half_halo, W, C], dtype=dtype, device="cuda")
            ym = y[:, half_halo : H + half_halo, :, :]
        else:
            y = torch.randn([1, H, W + 2 * half_halo, C], dtype=dtype, device="cuda")
            ym = y[:, :, half_halo : W + half_halo, :]
    else:
        # 2 -> native nhwc
        # 3 -> nchw
        explicit_nhwc = False
        if H_split:
            y = torch.randn([1, C, H + 2 * half_halo, W], dtype=dtype, device="cuda")
            if memory_format == 2:
                y = y.to(memory_format=torch.channels_last)
            ym = y[:, :, half_halo : H + half_halo, :]
        else:
            y = torch.randn([1, C, H, W + 2 * half_halo], dtype=dtype, device="cuda")
            if memory_format == 2:
                y = y.to(memory_format=torch.channels_last)
            ym = y[:, :, :, half_halo : W + half_halo]
    y3 = y.clone()
    list_y = []
    for step in range(num_steps):
        halo_ex(y, H_split, explicit_nhwc, numSM)
        list_y.append(y.clone())
        y.copy_(y3)
        halo_ex.peer_pool.reset()
        torch.distributed.barrier()
    y2 = y3.clone()
    list_y2 = []
    for step in range(num_steps):
        nccl_halo_ex(peer_rank, peer_group_size, y2, half_halo, explicit_nhwc, H_split)
        list_y2.append(y2.clone())
        y2.copy_(y3)
    if memory_format == 1:
        memory_format_str = "explicit_nhwc"
    elif memory_format == 2:
        memory_format_str = "native nhwc"
    elif memory_format == 3:
        memory_format_str = "nchw"
    else:
        memory_format_str = "???"
    torch.testing.assert_close(list_y, list_y2, msg=memory_format_str)
    # is_equal = [torch.all(torch.eq(yy, yy2)) for yy, yy2 in zip(list_y, list_y2)]
    # is_equal = torch.tensor(is_equal, dtype=torch.bool)
    # is_equal = torch.all(is_equal)
    # if peer_rank == 0:
    #     if is_equal:
    #         print(
    #             "SUCCESS : N,C,H,W = 1,%d,%d,%d, half_halo=%d, %s, %s, %s"
    #             % (
    #                 C,
    #                 H,
    #                 W,
    #                 half_halo,
    #                 str(dtype),
    #                 memory_format_str,
    #                 "H-split" if H_split else "W-split",
    #             )
    #         )
    #     else:
    #         print(
    #             "FAILURE : N,C,H,W = 1,%d,%d,%d, half_halo=%d, %s, %s, %s"
    #             % (
    #                 C,
    #                 H,
    #                 W,
    #                 half_halo,
    #                 str(dtype),
    #                 memory_format_str,
    #                 "H-split" if H_split else "W-split",
    #             )
    #         )
    #
    # peer memory flag sync relies on there being at least one barrier per step
    # torch.distributed.barrier()


def H_split_tests(N, C, H, W, half_halo, rank, world_size, halo_ex, num_steps):
    Hr = 8 * world_size
    Hp = ((H + Hr - 1) // Hr) * 8

    for i in range(4):
        div = int(pow(2, i))
        single_test(
            rank,
            world_size,
            halo_ex,
            C * div,
            Hp // div,
            W // div,
            half_halo,
            torch.float16,
            1,
            True,
            num_steps,
        )
        single_test(
            rank,
            world_size,
            halo_ex,
            C * div,
            Hp // div,
            W // div,
            half_halo,
            torch.float16,
            2,
            True,
            num_steps,
        )
        single_test(
            rank,
            world_size,
            halo_ex,
            C * div,
            Hp // div,
            W // div,
            half_halo,
            torch.float16,
            3,
            True,
            num_steps,
        )


def W_split_tests(N, C, H, W, half_halo, rank, world_size, halo_ex, num_steps):
    Wr = 8 * world_size
    Wp = ((W + Wr - 1) // Wr) * 8

    for i in range(4):
        div = int(pow(2, i))
        single_test(
            rank,
            world_size,
            halo_ex,
            C * div,
            H // div,
            Wp // div,
            half_halo,
            torch.float16,
            1,
            False,
            num_steps,
        )
        single_test(
            rank,
            world_size,
            halo_ex,
            C * div,
            H // div,
            Wp // div,
            half_halo,
            torch.float16,
            2,
            False,
            num_steps,
        )
        single_test(
            rank,
            world_size,
            halo_ex,
            C * div,
            H // div,
            Wp // div,
            half_halo,
            torch.float16,
            3,
            False,
            num_steps,
        )


def main():
    # for this trivial example peer_rank == rank and peer_group_size == world_size

    torch.distributed.init_process_group("nccl")
    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()
    torch.cuda.set_device(rank)
    peer_ranks = [i for i in range(world_size)]
    pool = PeerMemoryPool(0, 2 * 1024 * 1024, peer_ranks)

    num_steps = 100

    half_halo = 1
    halo_ex = PeerHaloExchanger1d(peer_ranks, rank, pool, half_halo)

    H_split_tests(1, 64, 336, 200, half_halo, rank, world_size, halo_ex, num_steps)
    W_split_tests(1, 64, 200, 336, half_halo, rank, world_size, halo_ex, num_steps)


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class TestPeerMemory(NcclDistributedTestBase):
    HALF_HALO = 1
    NUM_STEPS = 100

    @property
    def world_size(self) -> int:
        return min(torch.cuda.device_count(), 2)

    # TODO(crcrpar): Check if `world_size` being multiple of 2 is must.
    def _check_world_size_and_may_skip(self) -> None:
        if not (self.world_size >= 2 and self.world_size % 2 == 0):
            self.skipTest(f"world_size is expected to be a multiple of 2 but, {self.world_size}")

    def get_halo_excnahger_1d(self):
        peer_ranks = [i for i in range(self.world_size)]
        pool = PeerMemoryPool(64 * 1024, 2 * 1024 * 1024, peer_ranks)
        halo_exchanger_1d = PeerHaloExchanger1d(
            peer_ranks, self.rank, pool, TestPeerMemory.HALF_HALO
        )
        return halo_exchanger_1d

    def test_height_split(self):
        self._check_world_size_and_may_skip()
        H_split_tests(
            1,
            64,
            336,
            200,
            TestPeerMemory.HALF_HALO,
            self.rank,
            self.world_size,
            self.get_halo_excnahger_1d(),
            TestPeerMemory.NUM_STEPS,
        )

    def test_width_split(self):
        self._check_world_size_and_may_skip()
        W_split_tests(
            1,
            64,
            200,
            336,
            TestPeerMemory.HALF_HALO,
            self.rank,
            self.world_size,
            self.get_halo_excnahger_1d(),
            TestPeerMemory.NUM_STEPS,
        )


if __name__ == "__main__":
    common_utils.run_tests()


================================================
FILE: apex/contrib/test/transducer/__init__.py
================================================


================================================
FILE: apex/contrib/test/transducer/test_transducer_joint.py
================================================
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.transducer import TransducerJoint
    from apex.contrib.transducer import _transducer_ref as transducer_ref
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class TransducerJointTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)

    def gen_input(self, for_vector_kernel):
        self.B = 4
        T_min = 51
        T_max = 101
        U_min = 12
        U_max = 25
        if for_vector_kernel:
            H = 512
        else:
            H = 509
        dtype = torch.float16
        device = "cuda"

        self.f_tst = torch.randn((self.B, T_max, H), dtype=dtype, requires_grad=True, device=device)
        self.g_tst = torch.randn((self.B, U_max, H), dtype=dtype, requires_grad=True, device=device)
        self.h_grad = torch.randn(self.B, T_max, U_max, H, dtype=dtype, device=device)
        self.f_len = torch.randint(T_min, T_max + 1, (self.B,), dtype=torch.int, device=device)
        self.g_len = torch.randint(U_min, U_max + 1, (self.B,), dtype=torch.int, device=device)
        self.f_len[torch.randint(0, self.B, (1,)).item()] = T_max
        self.g_len[torch.randint(0, self.B, (1,)).item()] = U_max
        self.dropout_prob = 0.5

        # Make sure gradients from out-of-bound locations are zero. This should be guaranteed by
        # the loss function
        for b in range(self.B):
            self.h_grad[b, self.f_len[b] :, :, :] = 0
            self.h_grad[b, :, self.g_len[b] :, :] = 0
        self.h_grad_packed = self._pack(self.h_grad, self.f_len, self.g_len)

    def _pack(self, x, f_len, g_len):
        B = x.size(0)
        list_x = []
        for b in range(B):
            list_x_row = [x[b, t, : g_len[b]] for t in range(f_len[b])]
            x_row = torch.cat(list_x_row)
            list_x.append(x_row)
        x_packed = torch.cat(list_x).data.clone()
        x_packed.requires_grad = True
        batch_offset = torch.cumsum(f_len * g_len, dim=0)
        return x_packed

    def _unpack(self, x, f_len, g_len):
        batch_offset = torch.cumsum(f_len * g_len, dim=0)
        x_unpacked = torch.zeros_like(self.h_grad, dtype=torch.uint8)
        B = self.h_grad.size(0)
        H = self.h_grad.size(-1)
        for b in range(B):
            my_batch_offset = 0 if b == 0 else batch_offset[b - 1]
            my_f_len = f_len[b]
            my_g_len = g_len[b]
            for t in range(my_f_len):
                x_unpacked[b, t, :my_g_len] = x[
                    my_batch_offset + t * my_g_len : my_batch_offset + t * my_g_len + my_g_len
                ]
        return x_unpacked

    def run_transducer_joint(self, for_vector_kernel, pack_output, relu, dropout):
        self.gen_input(for_vector_kernel=for_vector_kernel)
        # Generate reference
        f_ref = self.f_tst.data.clone()
        g_ref = self.g_tst.data.clone()
        f_ref.requires_grad = True
        g_ref.requires_grad = True

        my_joint = TransducerJoint(
            pack_output=pack_output,
            relu=relu,
            dropout=dropout,
            dropout_prob=self.dropout_prob,
            probe_mask=True,
        )
        if not pack_output:
            h_tst = my_joint(f=self.f_tst, g=self.g_tst, f_len=self.f_len, g_len=self.g_len)
            h_tst.backward(self.h_grad)
            if dropout:
                mask = my_joint.mask_probe[0]
        else:
            batch_offset = torch.cumsum(self.f_len * self.g_len, dim=0)
            h_tst = my_joint(
                f=self.f_tst,
                g=self.g_tst,
                f_len=self.f_len,
                g_len=self.g_len,
                batch_offset=batch_offset,
                packed_batch=batch_offset[-1],
            )
            h_tst.backward(self.h_grad_packed)
            if dropout:
                mask_packed = my_joint.mask_probe[0]
                mask = self._unpack(mask_packed, self.f_len, self.g_len)

        # reference
        h_ref, f_grad_ref, g_grad_ref = transducer_ref.transducer_joint_reference(
            f=f_ref,
            g=g_ref,
            h_grad=self.h_grad,
            f_len=self.f_len,
            g_len=self.g_len,
            pack_output=pack_output,
            relu=relu,
            dropout=dropout,
            dropout_prob=self.dropout_prob,
            mask=mask if dropout else None,
        )

        f_grad_tst = self.f_tst.grad
        g_grad_tst = self.g_tst.grad

        torch.testing.assert_close(h_ref, h_tst, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(f_grad_ref, f_grad_tst, atol=5e-5, rtol=1e-3)
        torch.testing.assert_close(g_grad_ref, g_grad_tst, atol=1e-3, rtol=1e-3)

    def test_transducer_joint(self):
        self.run_transducer_joint(
            for_vector_kernel=True, pack_output=True, relu=False, dropout=False
        )

    def test_transducer_joint_vec(self):
        self.run_transducer_joint(
            for_vector_kernel=True, pack_output=False, relu=False, dropout=False
        )

    def test_transducer_joint_pack(self):
        self.run_transducer_joint(
            for_vector_kernel=False, pack_output=True, relu=False, dropout=False
        )

    def test_transducer_joint_vec_pack(self):
        self.run_transducer_joint(
            for_vector_kernel=True, pack_output=True, relu=False, dropout=False
        )

    def test_transducer_joint_relu(self):
        self.run_transducer_joint(
            for_vector_kernel=True, pack_output=True, relu=True, dropout=False
        )

    def test_transducer_joint_vec_relu(self):
        self.run_transducer_joint(
            for_vector_kernel=True, pack_output=False, relu=True, dropout=False
        )

    def test_transducer_joint_pack_relu(self):
        self.run_transducer_joint(
            for_vector_kernel=False, pack_output=True, relu=True, dropout=False
        )

    def test_transducer_joint_vec_pack_relu(self):
        self.run_transducer_joint(
            for_vector_kernel=True, pack_output=True, relu=True, dropout=False
        )

    @unittest.expectedFailure
    def test_transducer_joint_relu_dropout(self):
        self.run_transducer_joint(for_vector_kernel=True, pack_output=True, relu=True, dropout=True)

    @unittest.expectedFailure
    def test_transducer_joint_vec_relu_dropout(self):
        self.run_transducer_joint(
            for_vector_kernel=True, pack_output=False, relu=True, dropout=True
        )

    @unittest.expectedFailure
    def test_transducer_joint_pack_relu_dropout(self):
        self.run_transducer_joint(
            for_vector_kernel=False, pack_output=True, relu=True, dropout=True
        )

    @unittest.expectedFailure
    def test_transducer_joint_vec_pack_relu_dropout(self):
        self.run_transducer_joint(for_vector_kernel=True, pack_output=True, relu=True, dropout=True)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/transducer/test_transducer_loss.py
================================================
import unittest

import torch

SKIP_TEST = None
try:
    from apex.contrib.transducer import TransducerLoss
    from apex.contrib.transducer import _transducer_ref as transducer_ref
except ImportError as e:
    SKIP_TEST = e


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class TransducerLossTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)

    def gen_input(self, scalar_t, for_vector_kernel):
        self.B = 5
        T_min = 23
        T_max = 51
        U_min = 12
        U_max = 25
        V = 16 if for_vector_kernel else 14
        self.blank_idx = V - 1
        device = "cuda"

        self.x_tst = torch.randn(
            (self.B, T_max, U_max, V), dtype=scalar_t, requires_grad=True, device=device
        )
        self.y = torch.randint(
            0, self.blank_idx, (self.B, U_max - 1), dtype=torch.int, device=device
        )
        self.f_len = torch.randint(T_min, T_max + 1, (self.B,), dtype=torch.int, device=device)
        self.y_len = torch.randint(U_min - 1, U_max, (self.B,), dtype=torch.int, device=device)
        self.f_len[torch.randint(0, self.B, (1,)).item()] = T_max
        self.y_len[torch.randint(0, self.B, (1,)).item()] = U_max - 1
        self.x_tst_packed, self.batch_offset = self._pack(self.x_tst)
        # Generate reference
        x_ref = self.x_tst.data.clone()
        x_ref.requires_grad = True
        loss_grad = torch.ones(x_ref.size(0), dtype=x_ref.dtype, device=x_ref.device) / x_ref.size(
            0
        )
        _, _, self.grad_ref, self.loss_ref = transducer_ref.transducer_loss_reference(
            x=x_ref,
            label=self.y,
            f_len=self.f_len,
            y_len=self.y_len,
            blank_idx=self.blank_idx,
            loss_grad=loss_grad,
        )

    def _pack(self, x):
        list_x = []
        for b in range(self.B):
            list_x_row = [x[b, t, : self.y_len[b] + 1] for t in range(self.f_len[b])]
            x_row = torch.cat(list_x_row)
            list_x.append(x_row)
        x_packed = torch.cat(list_x).data.clone()
        x_packed.requires_grad = True
        batch_offset = torch.cumsum(self.f_len * (self.y_len + 1), dim=0)
        return x_packed, batch_offset

    def _unpack(self, x):
        x_unpacked = torch.zeros(
            self.B,
            self.f_len.max(),
            self.y_len.max() + 1,
            x.size(-1),
            dtype=x.dtype,
            device=x.device,
        )
        for b in range(self.B):
            my_batch_offset = 0 if b == 0 else self.batch_offset[b - 1]
            my_f_len = self.f_len[b]
            my_g_len = self.y_len[b] + 1
            for t in range(my_f_len):
                for u in range(my_g_len):
                    x_unpacked[b, t, u] = x[my_batch_offset + t * my_g_len + u]
        return x_unpacked

    def run_transducer_loss(self, scalar_t, fuse_softmax_backward, packed_input, for_vector_kernel):
        self.gen_input(scalar_t, for_vector_kernel)
        my_loss = TransducerLoss(
            fuse_softmax_backward=fuse_softmax_backward, packed_input=packed_input
        )
        if not packed_input:
            loss_tst = my_loss(
                x=self.x_tst,
                label=self.y,
                f_len=self.f_len,
                y_len=self.y_len,
                blank_idx=self.blank_idx,
            )
            loss_tst.mean().backward()
            grad_tst = self.x_tst.grad
        else:
            loss_tst = my_loss(
                x=self.x_tst_packed,
                label=self.y,
                f_len=self.f_len,
                y_len=self.y_len,
                blank_idx=self.blank_idx,
                batch_offset=self.batch_offset,
                max_f_len=max(self.f_len),
            )
            loss_tst.mean().backward()
            grad_tst_packed = self.x_tst_packed.grad
            grad_tst = self._unpack(grad_tst_packed)

        return loss_tst, grad_tst

    def test_transducer_loss_fp32(self):
        loss_tst, grad_tst = self.run_transducer_loss(
            scalar_t=torch.float32,
            fuse_softmax_backward=False,
            packed_input=False,
            for_vector_kernel=False,
        )
        torch.testing.assert_close(self.loss_ref, loss_tst, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.grad_ref, grad_tst, atol=1e-5, rtol=1e-5)

    def test_transducer_loss_fp16(self):
        loss_tst, grad_tst = self.run_transducer_loss(
            scalar_t=torch.float16,
            fuse_softmax_backward=False,
            packed_input=False,
            for_vector_kernel=False,
        )
        torch.testing.assert_close(self.loss_ref, loss_tst, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.grad_ref, grad_tst, atol=1e-4, rtol=1e-3)

    def test_transducer_loss_fp16_backward_fusion(self):
        loss_tst, grad_tst = self.run_transducer_loss(
            scalar_t=torch.float16,
            fuse_softmax_backward=True,
            packed_input=False,
            for_vector_kernel=False,
        )
        torch.testing.assert_close(self.loss_ref, loss_tst, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.grad_ref, grad_tst, atol=1e-4, rtol=1e-3)

    def test_transducer_loss_fp16_backward_fusion_packed(self):
        loss_tst, grad_tst = self.run_transducer_loss(
            scalar_t=torch.float16,
            fuse_softmax_backward=True,
            packed_input=True,
            for_vector_kernel=False,
        )
        torch.testing.assert_close(self.loss_ref, loss_tst, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.grad_ref, grad_tst, atol=1e-4, rtol=1e-3)

    def test_transducer_loss_fp16_backward_fusion_packed_vec(self):
        loss_tst, grad_tst = self.run_transducer_loss(
            scalar_t=torch.float16,
            fuse_softmax_backward=True,
            packed_input=True,
            for_vector_kernel=True,
        )
        torch.testing.assert_close(self.loss_ref, loss_tst, atol=1e-5, rtol=1e-5)
        torch.testing.assert_close(self.grad_ref, grad_tst, atol=1e-4, rtol=1e-3)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/test/xentropy/__init__.py
================================================


================================================
FILE: apex/contrib/test/xentropy/test_label_smoothing.py
================================================
import unittest
import random
import time

import numpy as np

import torch

SKIP_TEST = None
try:
    from apex.contrib import xentropy as label_smoothing
except ImportError as e:
    SKIP_TEST = e


def label_smoothing_raw(x, target, padding_idx, smoothing):
    logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)

    non_pad_mask = target != padding_idx
    nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
    nll_loss = nll_loss.squeeze(1)[non_pad_mask]
    smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask]
    loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
    return loss


def label_smoothing_opt_1(x, target, padding_idx, smoothing):
    logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)

    pad_mask = target == padding_idx
    ll_loss = logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
    smooth_loss = logprobs.mean(dim=-1)
    loss = (smoothing - 1.0) * ll_loss - smoothing * smooth_loss
    loss.masked_fill_(pad_mask, 0)
    return loss


@unittest.skipIf(SKIP_TEST, f"{SKIP_TEST}")
class LabelSmoothingTest(unittest.TestCase):
    def setUp(self, seed=1234):
        super().setUp()
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # Set pytorch print precision
        torch.set_printoptions(precision=10)

    def gen_test_inputs(self, N, T, H, smoothing, padding_idx, dtype=torch.half):
        logits = torch.randn((N * T, H), dtype=dtype, device="cuda", requires_grad=True)
        labels = torch.randint(0, H, [N * T], device="cuda")
        for i in random.sample(range(N * T), N * T // 6):
            labels[i] = padding_idx
        half_to_float = logits.dtype == torch.half

        return logits, labels, half_to_float

    def print_max_diff_elem(self, ref, tst):
        ref, tst = ref.flatten(), tst.flatten()
        diff = (ref - tst).abs().max()
        idx = (ref - tst).abs().argmax()
        print(
            "Max atol idx: {}, diff: {:.6f}, ref: {:.6f}, tst: {:.6f}".format(
                idx, diff, ref[idx], tst[idx]
            )
        )

    def _test_label_smoothing_function(self, dtype):
        # Set label smoothing configuration
        smoothing, padding_idx = 0.1, 0
        N, T, H = 128, 74, 32320
        iters = 10
        loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply

        for i in range(iters):
            logits, labels, half_to_float = self.gen_test_inputs(N, T, H, smoothing, padding_idx)

            # Run original softmax cross entropy with label smoothing
            logits.grad = None
            losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
            loss = losses.sum()
            loss.backward()

            ref_loss = loss.clone().detach()
            ref_grad = logits.grad.clone().detach()

            # Run optimized softmax cross entropy with label smoothing
            logits.grad = None
            losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
            loss = losses.sum()
            loss.backward()

            val_loss = loss.clone().detach()
            val_grad = logits.grad.clone().detach()

            # Validate
            self.print_max_diff_elem(ref_grad, val_grad)
            torch.testing.assert_close(val_loss, ref_loss)
            torch.testing.assert_close(val_grad, ref_grad)

    def test_label_smoothing_function_fp16(self):
        self._test_label_smoothing_function(torch.half)

    def test_label_smoothing_function_bf16(self):
        self._test_label_smoothing_function(torch.bfloat16)

    def test_label_smoothing_perf(self):
        # Set label smoothing configuration
        smoothing, padding_idx = 0.1, 0
        N, T, H = 128, 74, 32320
        iters = 1000
        loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply
        print()

        logits, labels, half_to_float = self.gen_test_inputs(N, T, H, smoothing, padding_idx)

        # Run original softmax cross entropy with label smoothing
        torch.cuda.synchronize()
        ts = time.time()
        for i in range(iters):
            logits.grad = None
            losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
            loss = losses.sum() / N
            loss.backward()
        torch.cuda.synchronize()
        print(
            "Raw time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
                time.time() - ts, iters, logits.grad.norm()
            )
        )

        # Run optimized softmax cross entropy with label smoothing
        torch.cuda.synchronize()
        ts = time.time()
        for i in range(iters):
            logits.grad = None
            losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
            loss = losses.sum() / N
            loss.backward()
        torch.cuda.synchronize()
        print(
            "Opt time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
                time.time() - ts, iters, logits.grad.norm()
            )
        )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: apex/contrib/torchsched/__init__.py
================================================
"""Graph scheduler package."""

from __future__ import annotations

from typing import TYPE_CHECKING

import torch
import torch._inductor
from torch._dynamo import list_backends
from torch._dynamo import register_backend
from torch._inductor.compile_fx import compile_fx_inner

from .backend import get_backend

if TYPE_CHECKING:
    from collections.abc import Callable
    from typing import Any

    from torch._ops import OpOverload

__all__ = ["get_backend", "set_default_backend"]

# Register custom operators
torch.ops.import_module("apex.contrib.torchsched.ops")


# Register torch-sched backend
# Same API as torch._inductor.compile_fx
@register_backend
def torchsched(
    model_: torch.fx.GraphModule,
    example_inputs_: list[torch.Tensor],
    inner_compile: Callable[..., Any] = compile_fx_inner,
    config_patches: dict[str, Any] | None = None,
    decompositions: dict[OpOverload, Callable[..., Any]] | None = None,
) -> Callable:
    backend = get_backend(backend="torchsched", scheme="dwb")
    return backend(model_, example_inputs_, inner_compile, config_patches, decompositions)


_SUPPORTED_BACKENDS = list_backends()
_DEFAULT_BACKEND = "inductor"
__torch_compile__ = torch.compile


def set_default_backend(backend: str) -> None:
    """
    Set the default backend for torch.compile.

    Parameters:
        backend (str): The backend to use as the default for torch.compile.
    """
    global _DEFAULT_BACKEND
    assert backend in _SUPPORTED_BACKENDS, f"Unknown backend {backend}"
    _DEFAULT_BACKEND = backend


def torchsched_compile(
    *args: object,
    backend: str | Callable | None = None,
    **kwargs: object,
) -> object:
    """
    Wrap around the original torch.compile to support default backend.

    Parameters:
        *args (object): Positional arguments for torch.compile.
        backend (Union[str, Callable, None]): The backend to use.
            If None, the default backend is used.
        **kwargs (object): Additional keyword arguments for torch.compile.

    Returns:
        object: Compiler or compiled model.
    """
    if backend is None:
        backend = _DEFAULT_BACKEND
    return __torch_compile__(*args, backend=backend, **kwargs)


# Monkey patch torch.compile to set default backend
torch.compile = torchsched_compile


================================================
FILE: apex/contrib/torchsched/backend.py
================================================
"""Graph scheduler backend."""

from __future__ import annotations

import functools
from copy import copy
from typing import TYPE_CHECKING
from typing import ParamSpec
from typing import TypeVar

if TYPE_CHECKING:
    from collections.abc import Callable
    from types import NotImplementedType

import torch
from torch import Tensor
from torch import _TorchCompileInductorWrapper
from torch._dynamo import lookup_backend
from torch._inductor.compile_fx import compile_fx
from torch._inductor.compile_fx import compile_fx_inner
from torch._inductor.decomposition import select_decomp_table

import apex.contrib.torchsched.config as config
from apex.contrib.torchsched.inductor import patch_graph_lowering
from apex.contrib.torchsched.passes import pre_grad_custom_pass

aten = torch.ops.aten
prims = torch.ops.prims

__all__ = ["get_backend"]


P = ParamSpec("P")
R = TypeVar("R")


def enable_multi_stream_scheduling(compile_fn: Callable[P, R]) -> Callable[P, R]:
    assert callable(compile_fn)

    @functools.wraps(compile_fn)
    def _compile_wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        patch_graph_lowering(patch=True)
        compile_results = compile_fn(*args, **kwargs)
        patch_graph_lowering(patch=False)
        return compile_results

    return _compile_wrapper


# Refer: https://github.com/pytorch/pytorch/blob/v2.6.0/torch/_inductor/decomposition.py#L213
def convolution_backward_decomp_dwb(
    grad_output: Tensor,
    input: Tensor,
    weight: Tensor,
    bias_sizes: tuple[int, ...],
    stride: tuple[int, ...],
    padding: tuple[int, ...],
    dilation: tuple[int, ...],
    transposed: bool,
    output_padding: tuple[int, ...],
    groups: int,
    output_mask: tuple[bool, bool, bool],
) -> tuple[Tensor, Tensor, Tensor] | NotImplementedType:
    """Decomposite convolution bprop using the dgrad/wgrad/bgrad scheme.

    Args:
        grad_output (Tensor): The gradient w.r.t output.
        input (Tensor): The input tensor.
        weight (Tensor): The weight tensor.
        bias_sizes (Tuple[int, ...]): The sizes of the bias tensor.
        stride (Tuple[int, ...]): The stride of the convolution.
        padding (Tuple[int, ...]): The padding of the convolution.
        dilation (Tuple[int, ...]): The dilation of the convolution.
        transposed (bool): Whether the convolution is transposed.
        output_padding (Tuple[int, ...]): The output padding for the transposed convolution.
        groups (int): The number of groups for the convolution.
        output_mask (Tuple[bool, bool, bool]): A mask indicating which gradients to compute.

    Returns:
        Union[Tuple[Tensor, Tensor, Tensor], NotImplemented]: A tuple containing the
            gradients of the input, weight, and bias, or NotImplemented if the
            conditions are not met.
    """
    if not output_mask[2] or grad_output.device.type != "cuda":
        return NotImplemented
    grad_inp, _, _ = aten.convolution_backward(
        grad_output,
        input,
        weight,
        bias_sizes,
        stride,
        padding,
        dilation,
        transposed,
        output_padding,
        groups,
        [output_mask[0], False, False],
    )
    _, grad_weight, _ = aten.convolution_backward(
        grad_output,
        input,
        weight,
        bias_sizes,
        stride,
        padding,
        dilation,
        transposed,
        output_padding,
        groups,
        [False, output_mask[1], False],
    )
    grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim())))
    return (grad_inp, grad_weight, grad_bias)


def convolution_backward_decomp_wbd(
    grad_output: Tensor,
    input: Tensor,
    weight: Tensor,
    bias_sizes: tuple[int, ...],
    stride: tuple[int, ...],
    padding: tuple[int, ...],
    dilation: tuple[int, ...],
    transposed: bool,
    output_padding: tuple[int, ...],
    groups: int,
    output_mask: tuple[bool, bool, bool],
) -> tuple[Tensor, Tensor, Tensor] | NotImplementedType:
    """Decomposite convolution bprop using the wgrad/bgrad/dgrad scheme.

    Args:
        grad_output (Tensor): The gradient w.r.t output.
        input (Tensor): The input tensor.
        weight (Tensor): The weight tensor.
        bias_sizes (Tuple[int, ...]): The sizes of the bias tensor.
        stride (Tuple[int, ...]): The stride of the convolution.
        padding (Tuple[int, ...]): The padding of the convolution.
        dilation (Tuple[int, ...]): The dilation of the convolution.
        transposed (bool): Whether the convolution is transposed.
        output_padding (Tuple[int, ...]): The output padding for the transposed convolution.
        groups (int): The number of groups for the convolution.
        output_mask (Tuple[bool, bool, bool]): A mask indicating which gradients to compute.

    Returns:
        Union[Tuple[Tensor, Tensor, Tensor], NotImplemented]: A tuple containing the
            gradients of the input, weight, and bias, or NotImplemented if the
            conditions are not met.
    """
    if not output_mask[2] or grad_output.device.type != "cuda":
        return NotImplemented
    _, grad_weight, _ = aten.convolution_backward(
        grad_output,
        input,
        weight,
        bias_sizes,
        stride,
        padding,
        dilation,
        transposed,
        output_padding,
        groups,
        [False, output_mask[1], False],
    )
    grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim())))
    grad_inp, _, _ = aten.convolution_backward(
        grad_output,
        input,
        weight,
        bias_sizes,
        stride,
        padding,
        dilation,
        transposed,
        output_padding,
        groups,
        [output_mask[0], False, False],
    )
    return (grad_inp, grad_weight, grad_bias)


class DecompositionsWrapper(_TorchCompileInductorWrapper):
    """A wrapper class for handling decompositions in model compilation.

    This class extends the `_TorchCompileInductorWrapper` to include additional
    decompositions for model compilation.

    Args:
        mode (str): The mode for the wrapper.
        options (Optional[Dict]): Additional options for the wrapper.
        dynamic (bool): Whether the wrapper is dynamic.
        decompositions (Dict): A dictionary of decompositions to use.

    Attributes:
        decompositions (Dict): The decompositions used by the wrapper.
    """

    def __init__(
        self,
        mode: str,
        options: dict | None,
        dynamic: bool,
        decompositions: dict,
    ) -> None:
        """Initialize the DecompositionsWrapper."""
        super().__init__(mode, options, dynamic)
        self.decompositions = decompositions
        # Force skip the type checking in self.apply_options() since default values are None type.
        self.config.update(
            {
                "pre_grad_custom_pass": (
                    pre_grad_custom_pass if config.enable_pre_grad_pass else None
                ),
            },
        )

    def __eq__(self, rhs: object) -> bool:
        """Check equality with another DecompositionsWrapper.

        Args:
            rhs (object): The other object to compare with.

        Returns:
            bool: True if the wrappers are equal, False otherwise.
        """
        eq = (
            isinstance(rhs, DecompositionsWrapper)
            and super().__eq__(rhs)
            and rhs.decompositions == self.decompositions
        )
        return eq

    def __call__(
        self,
        model_: torch.nn.Module,
        inputs_: list,
        *args: object,
        **kwargs: object,
    ) -> Callable:
        """Compiles the model with the given inputs and decompositions.

        Args:
            model_ (torch.nn.Module): The model to compile.
            inputs_ (list): The inputs to the model.
            args (object): Positional argument.
            kwargs (object): Keyword argument.

        Returns:
            Callable: The compiled model.
        """
        # Modifications to compilation process should be isolated between each compilations.
        decompositions = copy(select_decomp_table())
        decompositions.update(self.decompositions)
        return compile_fx(
            model_,
            inputs_,
            inner_compile=enable_multi_stream_scheduling(compile_fx_inner),
            config_patches=self.config,
            decompositions=decompositions,
        )


def get_backend(
    backend: str = "torch",
    scheme: str = "dwb",
) -> Callable | DecompositionsWrapper:
    """Get the graph scheduler backend for model compilation.

    This function returns the appropriate backend for model compilation based on
    the specified parameters.

    Args:
        backend (str, optional): The backend to use. Defaults to "torch".
        scheme (str, optional): The decomposition scheme to use. Defaults to "dwb".

    Returns:
        Union[Callable, DecompositionsWrapper]: The backend for model compilation.

    Raises:
        Exception: If an unknown scheme is specified.
    """
    if backend not in ("torch", "torchsched"):
        raise ValueError(f"Unknown compilation {backend=}")
    if scheme not in ("dwb", "wbd"):
        raise ValueError(f"Invalid {scheme=}, use scheme=dwb or wbd instead")

    if backend == "torch":
        return lookup_backend("inductor")

    # [NOTE] Disable buffer reuse and inplace buffers to avoid inter-stream conflicts.
    #
    # In PyTorch Inductor, the safety of buffer reuse and in-place buffer update is ensured by the
    # program's single-stream, serial execution. That is, if op2 is launched only after op1 has
    # completed execution, then these cases are safe:
    #
    #   Case 1: Safe to reuse buffer `workspace1` as `op2`'s workspace.
    #
    #         op1   ->   op2              op1   ->   op2
    #          ↕          ↕       ⇒        ↕          ↑
    #     workspace1 workspace2       workspace1 ←----┘
    #
    #   Case 2: Safe to inpalace `op1`'s output to `buf1` then send to `op2` as input.
    #
    #     buf1 -> op1 -> buf2 -> op2  ⇒  buf1 ↔	op1
    #                                     └-------> op2
    #
    # However, if operators are dispatched to distinct CUDA Streams and execute in parallel, above
    # cases are not safe any more:
    #
    #   Counter example 1: Case 1 is not safe if op1 and op2 are in parallel.
    #
    #        op1
    #         ↕
    #     workspace1 (Buffer modified concurrently by op1 and op2.)
    #         ↕
    #        op2
    #
    #   Counter example 2: Case 2 is not safe if op1 and op2 are in parallel.
    #
    #     buf1 <-->	op1
    #      └------> op2 (Op2 could read op1's input data.)
    #
    # Thus currently we disable both buffer reuse and inplace buffer update to ensure multi-stream
    # correctness.
    #
    # TODO(@davidli): Add cross-stream dependency to Inductor scheduling's dependency system so we
    # can safely reuse and inplace update buffers even in multi-stream scenario.

    if scheme == "dwb":
        return DecompositionsWrapper(
            mode="default",
            options={"allow_buffer_reuse": False, "inplace_buffers": False},
            dynamic=False,
            decompositions={
                aten.convolution_backward.default: convolution_backward_decomp_dwb,
            },
        )
    elif scheme == "wbd":
        return DecompositionsWrapper(
            mode="default",
            options={"allow_buffer_reuse": False, "inplace_buffers": False},
            dynamic=False,
            decompositions={
                aten.convolution_backward.default: convolution_backward_decomp_wbd,
            },
        )
    else:
        # To please mypy
        raise ValueError(f"Invalid {scheme=}, use scheme=dwb or wbd instead")


================================================
FILE: apex/contrib/torchsched/config.py
================================================
"""Configurations for graph scheduler."""

import functools
import os
import re
import sys

# Debug info and dump grpahs
debug = os.getenv("TORCH_SCHED_DEBUG", "0") == "1"

# Toggle pre_grad_pass for various pattern matches
enable_pre_grad_pass = False

# Pre grad pass patterns
pre_grad_pass_options: list[str] = ["cudnn_layer_norm"]

# Number of CUDA streams used for multi-stream scheduling.
# The first stream will be critical path stream, operators on non-critical path will be
# scheduled to other streams in a round-robin way.
num_streams = int(os.getenv("TORCH_SCHED_NUM_STREAMS", "8"))


def _get_skip_post_grad_graph_ids() -> set[int]:
    if ids := os.environ.get("TORCH_SCHED_SKIP_GRAPH_IDS"):
        result: set[int] = set()
        for part in ids.split(","):
            if "-" in part:
                start, end = map(int, part.split("-"))
                result.update(range(start, end + 1))
            else:
                result.add(int(part))
        return result
    else:
        return set()


# IDs of post AOT-autograd graphs that should be skipped for multi-stream scheduling. Can be
# specified via TORCH_SCHED_SKIP_GRAPH_IDS environment variable in a SLURM-like scheme, e.g.,
# TORCH_SCHED_SKIP_GRAPH_IDS=1,2,3-5,7-10
skip_post_grad_graph_ids: set[int] = _get_skip_post_grad_graph_ids()

# Reduce the number of allocated CUDA Events in the generated program by:
# 1. Track reference count of each CUDA Event in the scheduling phase. Skip generating CUDA Events
#    that have no reference counts, i.e., have not been waited by other streams;
# 2. Reuse allocated CUDA Events when feasible.
# This option is enable by default.
reuse_cuda_event: bool = os.getenv("TORCH_SCHED_REUSE_CUDA_EVENT", "1") == "1"


@functools.lru_cache
def __get_dump_code_backends_and_dir(
    dump_code: str | None,
) -> tuple[list[str], str | None]:
    pattern = r"(?:\+(?P<backend>\w+),)?(?P<dir>[\w\/\.\-\s@#~]+)"
    backends, dir = ["torchsched"], None
    if dump_code and (match := re.match(pattern, dump_code)):
        if backend := match.group("backend"):
            backends.append(backend)
        dir = os.path.abspath(match.group("dir"))
    return backends, dir


# Specify dump code backend types and output directory by::
#
#   TORCH_SCHED_DUMP_CODE='+inductor,/dir/to/save/code'
#
# Where `+inductor` enables dump both Inductor and torchsched code. If omitted, only dump
# torchsched code. `/dir/to/save/code` specifies a directory to dump code to.
(
    dump_code_backends,
    dump_code_dir,
) = __get_dump_code_backends_and_dir(os.getenv("TORCH_SCHED_DUMP_CODE"))

from torch.utils._config_module import install_config_module  # noqa: E402

# adds patch, save_config, etc
install_config_module(sys.modules[__name__])


================================================
FILE: apex/contrib/torchsched/inductor/__init__.py
================================================
"""Scheduling abstractions on PyTorch Inductor level."""

from apex.contrib.torchsched.inductor.graph import patch_graph_lowering

__all__ = ["patch_graph_lowering"]


================================================
FILE: apex/contrib/torchsched/inductor/_utils.py
================================================
from __future__ import annotations

import functools
import queue
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from types import TracebackType

import torch

__all__ = [
    "DEFAULT_STREAM",
    "DEFAULT_STREAM_IDX",
    "ENTRANCE_EVENT",
    "EVENT_NAME_TEMPLATE",
    "STREAM_NAME_TEMPLATE",
    "CUDAStreamPool",
    "get_cuda_stream_pool",
]

DEFAULT_STREAM: str = "default_stream"
DEFAULT_STREAM_IDX: int = 0
ENTRANCE_EVENT: str = "event0"
EVENT_NAME_TEMPLATE: str = "event{event_idx:d}"
STREAM_NAME_TEMPLATE: str = "stream{stream_idx:d}"


@functools.lru_cache
def get_stream_name(stream_idx: int) -> str:
    """Generate CUDA Stream name from stream index number.

    Args:
        stream_idx: Non-negative index number. 0 refers to the default stream, others refer to side
            streams.
    """
    if stream_idx == 0:
        return DEFAULT_STREAM
    else:
        return STREAM_NAME_TEMPLATE.format(stream_idx=stream_idx)


class CUDAStreamPool:
    """A pool managing reusable CUDA streams to optimize GPU operations.

    Attributes:
        pool_size (int): The maximum number of CUDA streams managed by the pool.
        stream_queue (queue.Queue): Queue holding the available CUDA streams.
    """

    def __init__(self, device: int | None = None, pool_size: int = 8) -> None:
        """Initializesthe CUDAStreamPool instance.

        Args:
            device (Optional[int], optional): The CUDA device ID.
                Defaults to None (current device).
            pool_size (int, optional): The maximum number of CUDA streams in the pool.
                Defaults to 8.
        """
        self.pool_size: int = pool_size
        self.stream_queue: queue.Queue[torch.cuda.Stream] = queue.Queue(maxsize=pool_size)

        for _ in range(pool_size):
            stream = torch.cuda.Stream(device=device)
            self.stream_queue.put(stream)

    def acquire(self) -> torch.cuda.Stream:
        """Acquire a CUDA stream from the pool.

        Returns:
            torch.cuda.Stream: A CUDA stream object from the pool.
        """
        return self.stream_queue.get()

    def release(self, stream: torch.cuda.Stream | None) -> None:
        """Return a CUDA stream back to the pool.

        Args:
            stream (Optional[torch.cuda.Stream]): The CUDA stream to return to the pool.
        """
        if stream is not None:
            self.stream_queue.put(stream)

    def __enter__(self) -> torch.cuda.Stream:
        """Enters the runtime context and acquires a CUDA stream.

        Returns:
            torch.cuda.Stream: The acquired CUDA stream.
        """
        self.stream = self.acquire()
        self.stream.__enter__()
        return self.stream

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Exit the runtime context and releases the acquired CUDA stream.

        Args:
            exc_type (type[BaseException] | None): Exception type, if raised.
            exc_val (BaseException | None): Exception instance, if raised.
            exc_tb (TracebackType | None): Traceback object, if raised.
        """
        self.stream.__exit__(exc_type, exc_val, exc_tb)
        self.release(self.stream)


_cuda_stream_pool: CUDAStreamPool | None = None


def get_cuda_stream_pool(device: int | None = None, pool_size: int = 32) -> CUDAStreamPool:
    """Retrieve a global CUDA stream pool, creating it if necessary.

    This function ensures that only one CUDAStreamPool instance exists globally.

    Args:
        device (Optional[int], optional): The CUDA device ID to initialize the pool on.
            Defaults to None (current device).
        pool_size (int, optional): The number of streams in the pool. Defaults to 32.

    Returns:
        CUDAStreamPool: The global CUDA stream pool instance.
    """
    global _cuda_stream_pool
    if _cuda_stream_pool is None:
        _cuda_stream_pool = CUDAStreamPool(device=device, pool_size=pool_size)
    return _cuda_stream_pool


================================================
FILE: apex/contrib/torchsched/inductor/event.py
================================================
"""CUDA Event abstractions used in Inductor multi-stream scheduling.

Attributes:
    ENTRANCE_EVENT: Name of the first event on the default CUDA Stream that got recorded before all
        kernels.
    EVENT_NAME_TEMPLATE: Python string template to generate event names. Can be used as:

            idx: int = ...
            event = EVENT_NAME_TEMPLATE.format(event_idx=idx)
"""

from __future__ import annotations

import dataclasses
import functools
import itertools

from torch._inductor.codegen.wrapper import IndentedBuffer
from torch._inductor.codegen.wrapper import WrapperLine

import apex.contrib.torchsched.config as torchsched_config
from apex.contrib.torchsched.inductor._utils import DEFAULT_STREAM_IDX
from apex.contrib.torchsched.inductor._utils import ENTRANCE_EVENT
from apex.contrib.torchsched.inductor._utils import EVENT_NAME_TEMPLATE
from apex.contrib.torchsched.inductor._utils import get_stream_name


@functools.total_ordering
@dataclasses.dataclass
class CudaEventSym:
    """Symbolic representation of CUDA Events in the Inductor scheduling phase.

    Args:
        factory: The CUDAEventFactory that generate this event.
        idx: Indexing number assigned in chronological order during scheduling.
        originate_stream_idx: The index of the CUDA stream that this event originated from.
        ref_count: Reference count of this event instance.
        materialized_event: The actual CUDA Event name that will be used in the final PyTorch
            program. Only symbolic event with reference count larger than one will be materialized.

    Note:
        In most cases this class should not be used standalone. Use
        `CUDAEventFactory.get_sym_event()` to instantiate one.
    """

    factory: CudaEventFactory
    idx: int
    originate_stream_idx: int
    ref_count: int = 0
    materialized_event: str | None = None

    def __lt__(self, rhs: CudaEventSym) -> bool:
        """Whether the current event is generated before the rhs event."""
        if self.factory is not rhs.factory:
            return NotImplemented
        return (self.idx, self.originate_stream_idx) < (
            rhs.idx,
            rhs.originate_stream_idx,
        )

    def __eq__(self, rhs: object) -> bool:
        """Whether the current event is identical to the rhs event."""
        if not isinstance(rhs, CudaEventSym):
            return NotImplemented
        return (
            self.idx == rhs.idx
            and self.originate_stream_idx == rhs.originate_stream_idx
            and self.factory is rhs.factory
        )

    def __str__(self) -> str:
        """Represent this symbolic event in string."""
        ret = f"{self.__class__.__name__} (idx={self.idx}"
        ret += f", originate_stream_idx={self.originate_stream_idx}"
        if self.ref_count:
            ret += f", ref_count={self.ref_count}"
        if self.materialized_event:
            ret += f", materialized to `{self.materialized_event}`"
        ret += ")"
        return ret

    def __hash__(self) -> int:
        """Hash this symbolic event."""
        return hash((id(self.factory), self.idx, self.originate_stream_idx))

    def record(self, stream_idx: int) -> _CudaEventRecordLine:
        """Record this event on a given stream.

        Args:
            stream_idx: The index of the stream that this event will record on.

        Returns:
            An internal data structure that depicts stream <-> event dependency.

        Note:
            This method doesn't necessarily generate a event recording in the final program.
            Instead it records the dependence between the stream and the current event. Whether
            or not this event recording show up in the final program depends on the reference
            count of the current event. I.e., if this event is never waited for by the later
            code, this event recording will not be code-generated.
        """
        stream = get_stream_name(stream_idx)
        return _CudaEventRecordLine(self, stream)

    def wait(self, stream_idx: int) -> _CudaEventWaitLine:
        """Wait for this event to complete by a given stream.

        Args:
            stream_idx: The index of the stream that will be waiting for this event to complete.

        Returns:
            An internal data structure that depicts stream <-> event dependency.

        Note:
            This method doesn't necessarily generate a event waiting in the final program. Instead
            it records the dependence between the stream and the current event and also increase
            the reference count of this event. If an event object has called this method, it is
            guaranteed to be generated in the final program.
        """
        assert stream_idx != self.originate_stream_idx
        self.ref_count += 1
        stream = get_stream_name(stream_idx)
        return _CudaEventWaitLine(self, stream)


@dataclasses.dataclass
class _CudaEventRecordLine(WrapperLine):
    event: CudaEventSym
    stream: str
    _reuse_cuda_event: bool = torchsched_config.reuse_cuda_event

    def codegen(self, code: IndentedBuffer) -> None:
        assert 0 <= self.event.ref_count
        assert self.event.materialized_event is None
        if self.event.ref_count or not self._reuse_cuda_event:
            self.event.materialized_event = self.event.factory.get_materialized_event(code)
            code.writeline(f"{self.event.materialized_event}.record({self.stream})")


@dataclasses.dataclass
class _CudaEventWaitLine(WrapperLine):
    event: CudaEventSym
    stream: str

    def codegen(self, code: IndentedBuffer) -> None:
        assert 0 < self.event.ref_count
        assert self.event.materialized_event is not None
        code_line = f"{self.event.materialized_event}.wait({self.stream})"
        self.event.ref_count -= 1
        if self.event.ref_count == 0:
            self.event.factory.deposit_materialized_event(self.event.materialized_event)
            self.event.materialized_event = None
            code_line += f"  # End lifecycle of {self.event}"
        code.writeline(code_line)


class CudaEventFactory:
    """A factory that managements CUDA event creations and materializations.

    This factory maintains internal states to ensure that created cuda events get monotonically
    increasing indices as compilation goes along. It also maintains a pool of materialized cuda
    events that symbolic events can reuse.
    """

    def __init__(self) -> None:
        """Initialize a event factory."""
        self.symbolic_event_idx: itertools.count = itertools.count(start=1)
        self.materialized_event_idx: itertools.count = itertools.count(start=1)
        self.available_materialized_events: set[str] = set()
        self._entrance_event: CudaEventSym | None = None
        self._reuse_cuda_event: bool = torchsched_config.reuse_cuda_event

    def get_entrance_event(self) -> CudaEventSym:
        """Return the cuda event that corresponding to compute graph entering."""
        if self._entrance_event is None:
            self._entrance_event = CudaEventSym(
                factory=self,
                idx=0,
                originate_stream_idx=DEFAULT_STREAM_IDX,
            )
            # Code-gen for entrance event is almost hard-coded in device guard enter so the
            # materialization is slightly different here.
            self._entrance_event.materialized_event = ENTRANCE_EVENT
        return self._entrance_event

    def get_sym_event(self, originate_stream_idx: int) -> CudaEventSym:
        """Allocate a symbolic cuda event."""
        return CudaEventSym(
            factory=self,
            idx=next(self.symbolic_event_idx),
            originate_stream_idx=originate_stream_idx,
        )

    def get_materialized_event(self, code: IndentedBuffer) -> str:
        """Allocate or reuse a materialized cuda event."""
        if self._reuse_cuda_event and self.available_materialized_events:
            return self.available_materialized_events.pop()
        else:
            event = EVENT_NAME_TEMPLATE.format(event_idx=next(self.materialized_event_idx))
            code.writeline(f"{event} = torch.cuda.Event()")
            return event

    def deposit_materialized_event(self, event: str) -> None:
        """Give back a materialized cuda event when the corresponding sym event ends lifecycle."""
        assert event not in self.available_materialized_events
        self.available_materialized_events.add(event)


================================================
FILE: apex/contrib/torchsched/inductor/graph.py
================================================
"""Scheduling abstractions on PyTorch Inductor GraphLowering level."""

from __future__ import annotations

import functools
from pathlib import Path
from typing import TYPE_CHECKING

import torch
from torch._inductor.codegen.common import get_scheduling_for_device
from torch._inductor.codegen.common import get_wrapper_codegen_for_device
from torch._inductor.codegen.common import register_backend_for_device
from torch._inductor.codegen.wrapper import PythonWrapperCodegen
from torch._inductor.graph import GraphLowering
from torch._inductor.scheduler import Scheduler
from torch._inductor.virtualized import V

if TYPE_CHECKING:
    from torch._inductor.utils import ValueWithLineMap

from apex.contrib.torchsched import config as torchsched_config
from apex.contrib.torchsched.inductor.scheduler import MultiCudaStreamScheduler
from apex.contrib.torchsched.inductor.wrapper import MultiStreamWrapperCodegen

_inductor_codegen = GraphLowering.codegen
patching_device_type = "cuda"
schedule_log = torch._logging.getArtifactLogger(__name__, "schedule")


@functools.wraps(GraphLowering.codegen)
def _torchsched_codegen(
    graph: GraphLowering,
) -> tuple[ValueWithLineMap, ValueWithLineMap]:
    # Move patching logic here as post_grad_graph_id was not available until now.
    cpp_wrapper_cls = get_wrapper_codegen_for_device(patching_device_type, cpp_wrapper=True)
    only_cpu = len(graph.device_types - {"cpu", "meta"}) == 0
    scheduling_cls = get_scheduling_for_device(patching_device_type)
    wrapper_cls = get_wrapper_codegen_for_device(patching_device_type)
    write_get_raw_stream = PythonWrapperCodegen.write_get_raw_stream
    if not only_cpu and graph.post_grad_graph_id not in torchsched_config.skip_post_grad_graph_ids:
        patched_scheduler_cls = MultiCudaStreamScheduler
        patched_wrapper_cls = MultiStreamWrapperCodegen
        # torch.compile explicitly calls `write_get_raw_stream` via wrapper's class method in its
        # lowering process to walk around the wrapper-stream LRU cache mechanism. To be compatible
        # with this, we got to patch wrapper's class method as well.
        PythonWrapperCodegen.write_get_raw_stream = MultiStreamWrapperCodegen._write_get_raw_stream
    else:
        patched_scheduler_cls = Scheduler
        patched_wrapper_cls = PythonWrapperCodegen
    register_backend_for_device(
        device=patching_device_type,
        device_scheduling=scheduling_cls,
        device_wrapper_codegen=patched_wrapper_cls,
        device_cpp_wrapper_codegen=cpp_wrapper_cls,
    )

    graph.init_wrapper_code()
    graph.scheduler = patched_scheduler_cls(graph.operations)
    V.debug.draw_orig_fx_graph(graph.orig_gm, graph.scheduler.nodes)
    graph.wrapper_code.push_codegened_graph(graph)
    graph.scheduler.codegen()
    result = graph.wrapper_code.generate(graph.is_inference)
    graph.wrapper_code.pop_codegened_graph()

    PythonWrapperCodegen.write_get_raw_stream = write_get_raw_stream
    register_backend_for_device(
        device=patching_device_type,
        device_scheduling=scheduling_cls,
        device_wrapper_codegen=wrapper_cls,
        device_cpp_wrapper_codegen=cpp_wrapper_cls,
    )

    return result


@functools.wraps(GraphLowering.codegen)
def _mixed_codegen(graph: GraphLowering) -> tuple[ValueWithLineMap, ValueWithLineMap]:
    assert torchsched_config.dump_code_dir
    output_code_per_backend: dict[str, tuple[ValueWithLineMap, ValueWithLineMap]] = {}

    for backend in torchsched_config.dump_code_backends:
        if backend == "torchsched":
            codegen = _torchsched_codegen
        elif backend == "inductor":
            codegen = _inductor_codegen
        else:
            raise ValueError(f"Unknown {backend=} from {torchsched_config.dump_code_backends=}")
        wrapper_code, kernel_code = codegen(graph)
        output_code_per_backend[backend] = (wrapper_code, kernel_code)

    for backend, (wrapper_code, kernel_code) in output_code_per_backend.items():
        backend_dir = Path(torchsched_config.dump_code_dir) / backend
        backend_dir.mkdir(parents=True, exist_ok=True)
        graph_id = graph.post_grad_graph_id
        (backend_dir / f"graph_{graph_id}_wrapper_code.py").write_text(wrapper_code.value)
        if kernel_code.value.strip():
            # Kernel_code is only available in AOTInductor mode.
            (backend_dir / f"graph_{graph_id}_kernel_code.py").write_text(kernel_code.value)

    return output_code_per_backend["torchsched"]


def patch_graph_lowering(patch: bool = True) -> None:
    """Patch PyTorch Inductor lowerings with multi-stream scheduling.

    This function patches the `torch.compile` stack on the GraphLowering level,
    i.e., the compute graph has been captured by Dynamo and it has undergone
    post-auto-gradient passes, including pattern-matching optimizations and
    preliminary operator fusions. At that point, most nodes in the graph are
    either fused Triton templates, or function calls to external libraries. The
    multi-stream scheduler then finds the longest critical path in this graph,
    and schedule other nodes to side streams to exploit the inherent parallelism
    of the given compute graph.

    Args:
        patch: Whether to patch Inductor `GraphLowering` with multi-stream
            scheduler. Set to `False` to restore the default `torch.compile`
            behavior. (default: `True`)
    """
    if patch and torchsched_config.dump_code_dir:
        GraphLowering.codegen = _mixed_codegen
    elif patch:
        GraphLowering.codegen = _torchsched_codegen
    else:
        GraphLowering.codegen = _inductor_codegen


================================================
FILE: apex/contrib/torchsched/inductor/scheduler.py
================================================
"""Scheduling abstractions on PyTorch Inductor Scheduler level."""

from __future__ import annotations

import collections
import itertools
import re
from typing import TYPE_CHECKING
from typing import cast

import torch
import torch._inductor.config as inductor_config
from torch._inductor import ir
from torch._inductor.dependencies import WeakDep
from torch._inductor.scheduler import BaseSchedulerNode
from torch._inductor.scheduler import ExternKernelSchedulerNode
from torch._inductor.scheduler import ForeachKernelSchedulerNode
from torch._inductor.scheduler import FusedSchedulerNode
from torch._inductor.scheduler import NopKernelSchedulerNode
from torch._inductor.scheduler import Scheduler
from torch._inductor.scheduler import SchedulerNode
from torch._inductor.utils import device_need_guard
from torch._inductor.virtualized import V

from apex.contrib.torchsched import config
from apex.contrib.torchsched.inductor._utils import DEFAULT_STREAM_IDX
from apex.contrib.torchsched.inductor._utils import get_stream_name
from apex.contrib.torchsched.inductor.event import CudaEventFactory
from apex.contrib.torchsched.inductor.event import CudaEventSym
from apex.contrib.torchsched.inductor.wrapper import EnterCudaStreamContextLine

if TYPE_CHECKING:
    from apex.contrib.torchsched.inductor.wrapper import MultiStreamWrapperCodegen


schedule_log = torch._logging.getArtifactLogger(__name__, "schedule")


class MultiCudaStreamScheduler(Scheduler):
    """Scheduling post-fusion graph with multi-stream awareness.

    This class introduced a new optimization pass on top of the Inductor :class:`Scheduler`. I.e.,
    it firstly searches for the longest critical path in the given compute graph, currently using
    the path depth as a proxy of execution cost. Then it executes the non-critical computations in
    parallel with the critical path computations by launching them to side CUDA Streams, with the
    goal of scheduling critical path computations back-to-back while saturating GPU resources at
    runtime.

    Args:
        operations: A list of Inductor IR nodes representing fused computations.
    """

    def __init__(self, operations: list[ir.Operation]) -> None:
        """Construct a scheduler object from a list of Inductor IR nodes.

        Refer to :class:`MultiCudaStreamScheduler` doc for argument specification.
        """
        super().__init__(operations)
        self.event_factory = CudaEventFactory()
        self.buff_to_event: dict[str, CudaEventSym] = collections.defaultdict(
            lambda: self.event_factory.get_sym_event(
                originate_stream_idx=self.current_stream_idx,  # type: ignore[arg-type]
            ),
        )
        self.unjoined_events: dict[int, set[CudaEventSym]] = collections.defaultdict(set)
        self.buffers_requiring_device_check: set[str] = set()
        # The only source of which stream context are we currently in at the scheduling phase.
        self._current_stream_ctx: EnterCudaStreamContextLine | None = None
        self.schedule_multi_cuda_streams()

    @property
    def current_stream_idx(self) -> int | None:
        """CUDA Stream index that current scheduler node assigned to."""
        if self._current_stream_ctx is not None:
            return self._current_stream_ctx.stream_idx
        else:
            return None

    @property
    def current_stream_name(self) -> str | None:
        """CUDA Stream name that current scheduler node assigned to."""
        if (stream_idx := self.current_stream_idx) is not None:
            return get_stream_name(stream_idx)
        else:
            return None

    @property
    def buffers_recorded_on_current_stream(self) -> set[str]:
        """Buffer names that have been recorded on the current stream context."""
        assert self._current_stream_ctx is not None
        return self._current_stream_ctx.buffers_recorded_on_this_stream

    @buffers_recorded_on_current_stream.setter
    def buffers_recorded_on_current_stream(self, buffs: set[str]) -> None:
        """Set buffer names that have been recorded on the current stream context.

        Note:
            The name of buffers recorded on the current stream context should be a superset of the
            buffers recorded on the previous stream context.
        """
        assert self._current_stream_ctx is not None
        assert buffs.issuperset(self._current_stream_ctx.buffers_recorded_on_this_stream)
        self._current_stream_ctx.buffers_recorded_on_this_stream = buffs

    def debug_str_short(self, node: BaseSchedulerNode) -> str:
        """Generate short string representing scheduler node's calling function or indices."""
        if node.is_extern() and isinstance(node.node, ir.MultiOutput):
            kernel_str = node.node.codegen_list_tuple_access(
                basename="getitem",
                indices=node.node.indices,
            )
            return f"{node.get_name()} ({kernel_str})"
        elif node.is_extern():
            kernel_name = node.node.get_kernel_name() or str(node.node.op_overload)
            return f"{node.get_name()} ({kernel_name})"
        else:
            return node.get_name()

    def get_last_event(self, events: set[CudaEventSym]) -> CudaEventSym:
        """Identify the latest generated CUDA event among all given events."""
        return sorted(events, reverse=True)[0]  # CudaEventSym is total-ordering.

    def schedule_multi_cuda_streams(self) -> None:
        """Assign each fused Inductor IR nodes with the CUDA Stream to be launched to."""
        if not self.nodes:
            # Empty graphs are sent to compiler in very rare circumstances. Just Skip scheduling.
            return

        buf_originate: dict[str, BaseSchedulerNode] = {}
        node_users: dict[BaseSchedulerNode, set[BaseSchedulerNode]] = collections.defaultdict(set)
        for node in self.nodes:
            for n in node.get_buffer_names():
                buf_originate[n] = node
        for node in self.nodes:
            for d in node.unmet_dependencies:
                assert d.name in buf_originate
                node_users[buf_originate[d.name]].add(node)

        critical_path_per_depth: dict[int, set[BaseSchedulerNode]] = collections.defaultdict(set)
        node_depth: dict[BaseSchedulerNode, int] = collections.defaultdict(lambda: -1)

        def visit(node: BaseSchedulerNode, depth: int, prev: set[BaseSchedulerNode]) -> None:
            if node_depth[node] < depth:
                node_depth[node] = depth
                path = prev | {node}
                if len(critical_path_per_depth[depth]) < len(path):
                    critical_path_per_depth[depth] = path
                for user in node_users[node]:
                    visit(user, depth + 1, path)

        graph_entries = [n for n in self.nodes if not n.unmet_dependencies]
        for entry in graph_entries:
            visit(entry, depth=1, prev=set())

        max_depth, longest_critical_path = sorted(critical_path_per_depth.items(), reverse=True)[0]

        # Allocate CUDA Streams for each fused node:
        # - Critical path nodes go to the default stream
        # - Nodes without GPU operations (currently only covered getitem nodes) go to their
        #   producer's stream
        # - Other nodes go to a set of pre-defined number of side-streams in a round-robin manner
        num_streams = config.num_streams
        if num_streams == 1:
            node_to_stream = {node: DEFAULT_STREAM_IDX for node in self.nodes}
        else:
            node_to_stream = {}
            side_stream_indices = itertools.cycle(range(1, num_streams))
            for node in self.nodes:
                if node in longest_critical_path:
                    node_to_stream[node] = DEFAULT_STREAM_IDX
                elif node.is_extern() and isinstance(node.node, ir.MultiOutput):
                    assert len(node.unmet_dependencies) == 1
                    producer = buf_originate[next(iter(node.unmet_dependencies)).name]
                    node_to_stream[node] = node_to_stream[producer]
                else:
                    node_to_stream[node] = next(side_stream_indices)
        self.node_to_stream = node_to_stream

        # Also remember buffer originate streams.
        buff_to_stream = {}
        for node, stream_idx in node_to_stream.items():
            for buf_name in node.get_buffer_names():
                buff_to_stream[buf_name] = stream_idx
        self.buff_to_stream = buff_to_stream

        schedule_log.debug(f"{' Multi-CUDA-Stream scheduling results ':=^79}")
        schedule_log.debug("Post-fusion graph depth: %d", max_depth)
        schedule_log.debug("Total number of allocated CUDA Streams: %d", num_streams)
        schedule_log.debug(f"{' Critical path ':-^79}")
        for node in self.nodes:
            if node in longest_critical_path:
                schedule_log.debug("- %s", self.debug_str_short(node))
        schedule_log.debug(f"{' Stream assignments of other nodes ':-^79}")
        for node, stream_idx in node_to_stream.items():
            if node not in longest_critical_path:
                schedule_log.debug("- %s -> Stream %d", self.debug_str_short(node), stream_idx)

    def get_final_events_to_sync(self) -> set[CudaEventSym]:
        """Return the CUDA Events that need to be synced at the end of the program.

        Raises:
            ValueError: If there is hanging event on the default stream. This usually means the
                user didn't properly use :meth:`add_unjointed_event` to register hanging events.
        """
        if self.unjoined_events.get(DEFAULT_STREAM_IDX):
            raise ValueError(
                f"Unexpected {self.unjoined_events[DEFAULT_STREAM_IDX]=} on default stream",
            )
        events_to_sync = set()
        for stream, events in self.unjoined_events.items():
            if len(events) == 0:
                schedule_log.debug(f"All events on stream{stream} have been consumed")
                continue
            last_event = self.get_last_event(events)
            if 1 < len(events):
                schedule_log.debug(
                    f"Seeing multiple hanging {events=} on stream{stream}, scheduling the "
                    f"{last_event=} to sync",
                )
            else:
                schedule_log.debug(
                    f"Scheduling the {last_event=} on stream{stream} to sync",
                )
            events_to_sync.add(last_event)
        return events_to_sync

    def clear_unjoined_events(self) -> None:
        """Clear handing event syncs registered by :meth:`add_unjointed_event`."""
        self.unjoined_events.clear()

    def register_downstream_event(
        self,
        node: BaseSchedulerNode,
    ) -> CudaEventSym:
        """Register one CUDA event indicating node execution complete.

        For ordinary Inductor IR nodes, the completion event is newly created using an internal
        event counter. For Inductor no-op nodes, the last event corresponding to its inputs will be
        used instead.

        Args:
            node: The Inductor IR node to generate completion event for.

        Returns:
            The name of the completion event.

        Raises:
            ValueError: If this function is called out side of any stream context.
        """
        if isinstance(node, NopKernelSchedulerNode) and node.unmet_dependencies:
            upstream_events = set()
            for dep in node.unmet_dependencies:
                assert dep.name in self.buff_to_event
                upstream_events.add(self.buff_to_event[dep.name])
            assert 1 <= len(upstream_events)
            downstream_event = self.get_last_event(upstream_events)
            for buff in node.get_buffer_names():
                self.buff_to_event[buff] = downstream_event
        else:
            for i, buff in enumerate(sorted(node.get_buffer_names())):
                if i == 0:
                    downstream_event = self.buff_to_event[buff]
                    assert downstream_event.originate_stream_idx == self.current_stream_idx
                else:
                    self.buff_to_event[buff] = downstream_event
            if (node_stream := self.node_to_stream[node]) != DEFAULT_STREAM_IDX:
                self.unjoined_events[node_stream].add(downstream_event)
            V.graph.wrapper_code.writeline(downstream_event.record(node_stream))
        return downstream_event

    def get_cross_stream_dependencies(
        self,
        node: BaseSchedulerNode,
    ) -> tuple[set[CudaEventSym], set[str]]:
        """Get CUDA Event and buffer dependencies of an IR node.

        Args:
            node: The Inductor IR node to generate code for.

        Returns:
            upstream_events: A set of CUDA Event symbols, these events need to be synced before
                executing `node`'s code.
            buffer_from_other_streams: A set of buffer names, these buffers need to be recorded on
                the CUDA Stream that `node` is running on.
        """
        assert node in self.node_to_stream

        # Process cross-cuda-stream dependencies.
        node_stream = self.node_to_stream[node]
        events_on_stream: dict[int, set[CudaEventSym]] = collections.defaultdict(set)
        buffers_from_other_streams = set()
        if not node.unmet_dependencies and node_stream != DEFAULT_STREAM_IDX:
            # Graph entries on side streams should wait upon the main stream entrance.
            entrance_event = self.event_factory.get_entrance_event()
            events_on_stream[DEFAULT_STREAM_IDX].add(entrance_event)
        for dep in node.read_writes.reads:
            buff = dep.name  # To track stream number and cuda events.
            buff_real = self.mutation_real_name.get(buff, buff)  # The real name in code.
            if dep not in node.unmet_dependencies and not isinstance(dep, WeakDep):
                # Materialized dependencies should be recorded on this stream.
                buffers_from_other_streams.add(buff_real)
                # The scalar tensor argument `dropout_p` of SDPA backward kernels might be on CUDA
                # or CPU devices depending on execution scenario. To ensure program correctness we
                # add a runtime check for it.
                #
                # TODO (@davidli): Remove this ad-hoc checking once PyTorch fix SDPA and
                # MultiOutputLayout issues.
                if node.is_extern() and re.match(
                    r"aten._scaled_dot_product_.*_attention_backward",
                    str(node.node.op_overload),
                ):
                    self.buffers_requiring_device_check.add(buff_real)
                continue
            elif isinstance(dep, WeakDep):
                # Skip unmaterialized dependencies.
                continue
            assert buff in self.buff_to_event
            assert buff in self.buff_to_stream
            buff_event = self.buff_to_event[buff]
            buff_stream = self.buff_to_stream[buff]
            events_on_stream[buff_stream].add(buff_event)
            if buff_stream != node_stream:
                if node.is_extern() and isinstance(node.node, ir.MultiOutput):
                    assert len(node.read_writes.reads) == 1
                    buff_real = node.node.codegen_list_tuple_access(
                        basename=buff_real,
                        indices=node.node.indices,
                    )
                    self.buffers_requiring_device_check |= {
                        buff_real,
                        node.node.get_name(),
                    }
                buffers_from_other_streams.add(buff_real)

        # Should only wait for the latest event from each stream.
        upstream_events = set()
        for stream, events in events_on_stream.items():
            if stream != node_stream:
                last_event = self.get_last_event(events)
                upstream_events.add(last_event)

        return upstream_events, buffers_from_other_streams

    def generate_stream_ctx_enter(self, node: BaseSchedulerNode) -> None:
        """Code-gen to enter the Stream context assigned to node."""
        assert not isinstance(node, NopKernelSchedulerNode)
        wrapper_code = cast("MultiStreamWrapperCodegen", V.graph.wrapper_code)
        upstream_events, buffers_from_other_streams = self.get_cross_stream_dependencies(node)
        node_stream = self.node_to_stream[node]
        self._current_stream_ctx = wrapper_code.codegen_cuda_stream_enter(
            stream_idx=node_stream,
            upstream_events=upstream_events,
            buffers_from_other_streams=buffers_from_other_streams,
            buffers_requiring_device_check=self.buffers_requiring_device_check,
        )

    def generate_stream_ctx_exit(self) -> None:
        """Code-gen to exit from the current Stream context."""
        assert self._current_stream_ctx is not None
        wrapper_code = cast("MultiStreamWrapperCodegen", V.graph.wrapper_code)
        wrapper_code.codegen_cuda_stream_exit()
        self._current_stream_ctx = None

    def propagate_cross_stream_dependencies(self, node: BaseSchedulerNode) -> None:
        """Move input node's dependencies to the entrance of current CUDA Stream context.

        If node is scheduled in the middle of a stream context, its dependencies should be properly
        synced before entering this context. This function extracts `node`'s dependencies and move
        them to the data structure that represents the entrance of current stream context.

        Args:
            node: The Inductor IR node to generate code for. This node must have an assigned stream
                in :meth:`schedule_multi_cuda_streams`.
        """
        assert self.current_stream_idx is not None
        wrapper_code = cast("MultiStreamWrapperCodegen", V.graph.wrapper_code)
        upstream_events, buffers_from_other_streams = self.get_cross_stream_dependencies(node)
        buffers_from_other_streams -= self.buffers_recorded_on_current_stream
        wrapper_code.codegen_buffers_record_stream(
            buffers=buffers_from_other_streams,
            stream_idx=self.current_stream_idx,
            buffers_requiring_device_check=self.buffers_requiring_device_check,
        )
        wrapper_code.codegen_events_wait_stream(
            events=upstream_events,
            stream_idx=self.current_stream_idx,
        )
        self.buffers_recorded_on_current_stream |= buffers_from_other_streams

    def generate_stream_ctx_switching(self, node: BaseSchedulerNode) -> None:
        """Generate stream entering and exiting to properly run node in a multi-stream scenario.

        Stream context switching is only generated if `node`'s assigned stream is different from
        the previous node's stream. If the node is a no-op, its code will be generated in the same
        context of previous node.
        """
        assert node in self.node_to_stream
        stream = None if isinstance(node, NopKernelSchedulerNode) else self.node_to_stream[node]
        if self.current_stream_idx == stream:
            if stream is not None:
                self.propagate_cross_stream_dependencies(node)
            return
        elif self.current_stream_idx is not None and stream is None:
            # Don't generate ctx switching. Memory plaining code (e.g., delete buffers) on current
            # node goes to previous stream ctx.
            return
        elif self.current_stream_idx is None and stream is not None:
            # Enter new ctx, update current stream status.
            self.generate_stream_ctx_enter(node)
        else:
            # Switching from previous stream ctx to the new stream ctx.
            self.generate_stream_ctx_exit()
            self.generate_stream_ctx_enter(node)

    def codegen(self) -> None:
        """Generate Python code for each of the Scheduler IR nodes.

        Note:
            The overall `torch.compile` code-gen is a multi-pass process, which means that this
            method doesn't necessarily generate final program strings for every IR nodes. For
            certain types of IRs, e.g., those involve memory allocation/deletion and CUDA Stream
            switching, this method only generates respective data structures, and the final
            code-gen is delegated to :meth:`WrapperCodeGen.codegen` using information form these
            data structures.

        Raises:
            AssertionError: If any of the conditions met
                * A node need to switch device context but it didn't include device information;
                * A node contains at least one non-weak dependence that was not seen in the
                  :meth:`schedule_multi_cuda_streams` pass;
                * A node contains at least one non-weak cross-stream dependence that the
                  corresponding event was not generated before that point;
                * The fused compute graph contains :class:`ForeachKernelSchedulerNode` but the
                  target backend doesn't support SIMD scheme.
        """
        wrapper_code = cast("MultiStreamWrapperCodegen", V.graph.wrapper_code)
        wrapper_code.codegen_graph_nvtx_range_push(V.graph.post_grad_graph_id)
        for node in self.nodes:
            try:
                schedule_log.debug(
                    "Generating code for node %s with estimated runtime %f",
                    node.get_name(),
                    node.get_estimated_runtime(),
                )
            except Exception:
                schedule_log.debug(
                    "Generating code for node %s with estimated runtime 0.0",
                    node.get_name(),
                )

            self.enter_context(node)

            if not isinstance(node, NopKernelSchedulerNode) and (device := node.get_device()):
                if device != self.current_device or node.is_extern() or node.is_template():
                    self.flush()
                if device != self.current_device:
                    if self.current_device and device_need_guard(
                        self.current_device.type,
                    ):
                        wrapper_code.codegen_device_guard_exit()
                    if device_need_guard(device.type):
                        assert device.index is not None, "device should have an index"
                        wrapper_code.codegen_device_guard_enter(device.index)
                    self.current_device: torch.device | None = device

            self.generate_stream_ctx_switching(node)
            self.buffer_names_to_free.update(node.last_usage)

            if node.is_template():
                node, *epilogue = node.get_nodes()
                self.get_backend(device).codegen_template(node, epilogue)
            elif node.is_extern():
                node = cast("ExternKernelSchedulerNode", node)
                self.codegen_extern_call(node)
            elif node.is_foreach():
                node = cast("ForeachKernelSchedulerNode", node)
                backend_ = self.get_backend(device)
                from torch._inductor.codegen.cuda_combined_scheduling import (
                    CUDACombinedScheduling,
                )
                from torch._inductor.codegen.simd import SIMDScheduling

                if isinstance(backend_, (SIMDScheduling, CUDACombinedScheduling)):
                    backend = backend_
                else:
                    raise AssertionError(f"{type(self)=}")
                backend.codegen_combo_kernel(node)
            elif isinstance(node, (FusedSchedulerNode, SchedulerNode)):
                self.get_backend(device).codegen_node(node)
            else:
                assert isinstance(node, NopKernelSchedulerNode)
                node.mark_run()

            if inductor_config.triton.debug_sync_kernel:
                self.get_backend(device).codegen_sync()

            self.available_buffer_names.update(node.get_buffer_names())
            self.completed_operations.update(node.get_operation_names())
            self.register_downstream_event(node)

            if not isinstance(node, NopKernelSchedulerNode):
                device = node.get_device()
                if device is not None and self.get_backend(device).ready_to_flush():
                    self.flush()

        if self.current_device and device_need_guard(self.current_device.type):
            # Exit the last stream context.
            if self._current_stream_ctx is not None:
                self.generate_stream_ctx_exit()
            # Record the default stream on buffers from other streams.
            side_stream_buffers = set()
            for output in V.graph.get_output_names():
                if self.buff_to_stream.get(output, DEFAULT_STREAM_IDX) != DEFAULT_STREAM_IDX:
                    side_stream_buffers.add(output)
            wrapper_code.codegen_buffers_record_stream(
                buffers=side_stream_buffers,
                stream_idx=DEFAULT_STREAM_IDX,
                buffers_requiring_device_check=self.buffers_requiring_device_check,
            )
            # Sync hanging events from other streams.
            if events_to_sync := self.get_final_events_to_sync():
                wrapper_code.codegen_events_wait_stream(
                    events=events_to_sync,
                    stream_idx=DEFAULT_STREAM_IDX,
                )
            # exit the outermost CUDA device guard. this is
            # important for nested indentation codegen-ing.
            wrapper_code.codegen_device_guard_exit()

        wrapper_code.codegen_graph_nvtx_range_pop()
        self.flush()


================================================
FILE: apex/contrib/torchsched/inductor/wrapper.py
================================================
"""Scheduling abstractions on PyTorch Inductor WrapperCodeGen level.

Attributes:
    DEFAULT_STREAM: Name of the default CUDA Stream on the final generated Python code.
    DEFAULT_STREAM_IDX: Index number of the default CUDA Stream in `torchsched` internal passes.
    STREAM_NAME_TEMPLATE: Python string template to generate stream names. Can be used as:

            idx: int = ...
            stream = STREAM_NAME_TEMPLATE.format(stream_idx=idx)
"""

from __future__ import annotations

import dataclasses
from typing import TYPE_CHECKING

from torch._inductor.codegen.wrapper import EnterDeviceContextManagerLine
from torch._inductor.codegen.wrapper import ExitDeviceContextManagerLine
from torch._inductor.codegen.wrapper import IndentedBuffer
from torch._inductor.codegen.wrapper import PythonWrapperCodegen
from torch._inductor.codegen.wrapper import SubgraphPythonWrapperCodegen
from torch._inductor.codegen.wrapper import WrapperLine
from torch._inductor.virtualized import V

import apex.contrib.torchsched.config as config
from apex.contrib.torchsched.inductor._utils import DEFAULT_STREAM
from apex.contrib.torchsched.inductor._utils import ENTRANCE_EVENT
from apex.contrib.torchsched.inductor._utils import STREAM_NAME_TEMPLATE
from apex.contrib.torchsched.inductor._utils import get_stream_name

if TYPE_CHECKING:
    from torch._inductor.graph import GraphLowering
    from torch._inductor.ir import GraphPartitionSignature

    from apex.contrib.torchsched.inductor.event import CudaEventSym


@dataclasses.dataclass
class EnterDeviceContextManagerWithStreamInfoLine(EnterDeviceContextManagerLine):
    """Enter a CUDA device context and allocate required side streams.

    Note:
        - The number of allocated streams is controlled by :attr:`torchsched.config.num_streams`;
    """

    def codegen(self, code: IndentedBuffer) -> None:
        """Generate context switching and stream allocation code."""
        if V.graph.cpp_wrapper:
            super().codegen(code)
        else:
            super().codegen(code)
            code.writeline(f"{DEFAULT_STREAM} = torch.cuda.current_stream()")
            code.writeline(f"{ENTRANCE_EVENT} = {DEFAULT_STREAM}.record_event()")

            code.writeline(
                "from apex.contrib.torchsched.inductor._utils import get_cuda_stream_pool"
            )
            code.writeline(
                f"cuda_stream_pool = get_cuda_stream_pool(device={self.device_idx}, "
                f"pool_size={config.num_streams})",
            )

            for i in range(1, config.num_streams):
                code.writeline(
                    f"{STREAM_NAME_TEMPLATE.format(stream_idx=i)} = cuda_stream_pool.acquire()",
                )


@dataclasses.dataclass
class ExitDeviceContextManagerWithStreamInfoLine(ExitDeviceContextManagerLine):
    """Exit a CUDA device context and release allocated streams."""

    def codegen(self, code: IndentedBuffer) -> None:
        """Generate context switching and stream release code."""
        for i in range(1, config.num_streams):
            code.writeline(
                f"cuda_stream_pool.release({STREAM_NAME_TEMPLATE.format(stream_idx=i)})",
            )
        if not V.graph.cpp_wrapper:
            code.do_unindent()


@dataclasses.dataclass
class EnterCudaStreamContextLine(WrapperLine):
    """Enter a context executed by respective CUDA Stream and insert necessary syncs.

    Attributes:
        wrapper: The code-gen wrapper of the current compilation phase.
        stream_idx: The index number corresponds to the entering CUDA Stream context.
        upstream_events: Names of CUDA Events that the current stream should be waiting for before
            the stream switching.
        buffers_from_other_streams: Name of buffers produced by other CUDA Streams. Those buffers
            should be recorded to the current stream to avoid accidental memory free.
        buffers_requiring_device_check: Name of buffers that might not be on CUDA devices and
            require runtime device checking before recording stream to them.
    """

    stream_idx: int

    def __post_init__(self) -> None:
        """Track buffers have been recorded on this stream to reduce duplicate recording."""
        self.buffers_recorded_on_this_stream: set[str] = set()

    def codegen(self, code: IndentedBuffer) -> None:
        """Generate stream switching and buffer recording code."""
        code.writeline(f"with torch.cuda.stream({get_stream_name(self.stream_idx)}):")
        code.do_indent()

        # [NOTE] The 3-indent-level assertion
        #
        #     Indent level 1: Inductor wrapper call indent
        #         Indent level 2: Device guard context indent
        #             Indent level 3: CUDA Stream context indent
        #
        # Over or under indenting usually means that :meth:`MultiCudaStreamScheduler.codegen`
        # introduced bugs on stream context switching. This check also applies to stream context
        # exiting, as in :meth:`ExitCudaStreamContextLine.codegen`.
        assert code._indent == 3


@dataclasses.dataclass
class ExitCudaStreamContextLine(WrapperLine):
    """Generate code to exit the current stream context.

    Note:
        Most attributes and checking logics of this class have been moved to
        :meth:`MultiStreamWrapperCodeGen.codegen_cuda_stream_exit`. We preserve this data structure
        because the checking and unindent should be generated in the latter phase of code-gen.
    """

    def codegen(self, code: IndentedBuffer) -> None:
        """Check indentation level and exit the current stream context."""
        assert code._indent == 3  # See :note:`The 3-indent-level assertion` above.
        code.do_unindent()


class MultiStreamWrapperCodegen(PythonWrapperCodegen):
    """Wrapper code generator for graph scheduling."""

    def __init__(self) -> None:
        """Construct a code-gen wrapper and disable raw stream caching.

        Note:
            The :meth:`write_get_raw_stream` method processed in this constructor is invoked from
            literally everywhere throughout the Inductor stack, but the current
            :meth:`PythonWrapperCodegen.write_get_raw_stream` is LRU-cached and always returns a
            const raw stream name. This is not what we wanted in a multi-stream environment. Thus
            we need to re-patch this function in instance initialization.
        """
        super().__init__()
        self.write_get_raw_stream = self._write_get_raw_stream

    @staticmethod
    def create(
        is_subgraph: bool,
        subgraph_name: str,
        parent_wrapper: MultiStreamWrapperCodegen,
        partition_signatures: GraphPartitionSignature | None = None,
    ) -> MultiStreamWrapperCodegen | SubgraphPythonWrapperCodegen:
        """Instantiate a wrapper codegen for an Inductor graph or a subgraph."""
        if is_subgraph:
            assert subgraph_name is not None
            assert parent_wrapper is not None
            return SubgraphPythonWrapperCodegen(
                subgraph_name,
                parent_wrapper,
                partition_signatures,
            )
        return MultiStreamWrapperCodegen()

    def _write_get_raw_stream(self, device_idx: int, graph: GraphLowering | None = None) -> str:
        self.write_triton_header_once()
        if (current_stream_name := V.graph.scheduler.current_stream_name) is not None:
            name = f"{current_stream_name}_raw"
            self.writeline(f"{name} = {current_stream_name}.cuda_stream")
        else:
            name = f"stream{device_idx}"
            self.writeline(f"{name} = get_raw_stream({device_idx})")
        return name

    def codegen_graph_nvtx_range_push(self, post_grad_graph_id: int) -> None:
        """Generate NVTX range push for graph."""
        self.writeline(f"torch.cuda.nvtx.range_push('graph {post_grad_graph_id}')")

    def codegen_graph_nvtx_range_pop(self) -> None:
        """Generate NVTX range pop for graph."""
        self.writeline("torch.cuda.nvtx.range_pop()")

    def codegen_device_guard_enter(self, device_idx: int) -> None:
        """Generate data structure for device guard context.

        Note:
            Refer to :class:`EnterDeviceContextManagerWithStreamInfoLine` doc for more details.
        """
        self.writeline(
            EnterDeviceContextManagerWithStreamInfoLine(
                device_idx,
                self.last_seen_device_guard_index,
            ),
        )
        self.last_seen_device_guard_index: int = device_idx

    def codegen_device_guard_exit(self) -> None:
        """Generate data structure for exiting device guard context."""
        self.writeline(ExitDeviceContextManagerWithStreamInfoLine())

    def codegen_cuda_stream_enter(
        self,
        stream_idx: int,
        upstream_events: set[CudaEventSym],
        buffers_from_other_streams: set[str],
        buffers_requiring_device_check: set[str] | None = None,
    ) -> EnterCudaStreamContextLine:
        """Generate data structure for entering a CUDA Stream context.

        Args:
            stream_idx: The index number of the entering CUDA Stream context.
            upstream_events: Names of CUDA Events that the current stream should be waiting for
                before the stream switching. This is usually the events that are generated by the
                previous stream context.
            buffers_from_other_streams: Name of buffers produced by other CUDA Streams. Those
                buffers should be recorded to the current stream to avoid accidental memory free.
            buffers_requiring_device_check: Name of buffers that might not be on CUDA devices and
                require runtime device checking before recording stream to them.

        Note:
            - Refer to :class:`EnterCudaStreamContextLine` for argument specifications;
            - Once entered a context, the stream associated with this context will also be recorded
              such that kernels in subsequent code-gen can get the correct stream index.

        Raises:
            ValueError: If this function is called while the previous stream context isn't exited.
        """
        if (current_stream_name := V.graph.scheduler.current_stream_name) is not None:
            raise ValueError(
                f"Nested stream context switching: {current_stream_name} -> "
                f"{get_stream_name(stream_idx)}",
            )
        ctx_entrance = EnterCudaStreamContextLine(stream_idx=stream_idx)
        self.writeline(ctx_entrance)
        self.codegen_buffers_record_stream(
            buffers=buffers_from_other_streams,
            stream_idx=stream_idx,
            buffers_requiring_device_check=buffers_requiring_device_check,
        )
        ctx_entrance.buffers_recorded_on_this_stream |= buffers_from_other_streams
        self.codegen_events_wait_stream(
            events=upstream_events,
            stream_idx=stream_idx,
        )
        return ctx_entrance

    def codegen_cuda_stream_exit(self) -> None:
        """Generate data structure for exiting a CUDA Stream context."""
        self.writeline(ExitCudaStreamContextLine())

    def codegen_events_wait_stream(self, events: set[CudaEventSym], stream_idx: int) -> None:
        """Generate data structure for syncing hanging CUDA Events with certain stream.

        Args:
            events: Symbols of the events that need to be synchronized with the given stream.
            stream_idx: Index of the CUDA stream to synchronize the events with.
        """
        for event in events:
            self.writeline(event.wait(stream_idx))

    def codegen_buffers_record_stream(
        self,
        buffers: set[str],
        stream_idx: int,
        buffers_requiring_device_check: set[str] | None = None,
    ) -> None:
        """Generate data structure for recording steam on return tensors before program exit.

        Args:
            buffers: Names of buffers that need to be recorded on the given stream.
            stream_idx: Index of the CUDA stream to record the buffers to.
            buffers_requiring_device_check: Name of buffers that might not be on CUDA devices and
                require runtime device checking before recording stream to them. If not provided,
                buffers will be recorded to the given stream without runtime device checking.
        """
        for buff in buffers:
            prefix = (
                f"if {buff}.is_cuda: "
                if buffers_requiring_device_check and buff in buffers_requiring_device_check
                else ""
            )
            self.writeline(f"{prefix}{buff}.record_stream({get_stream_name(stream_idx)})")


================================================
FILE: apex/contrib/torchsched/ops/__init__.py
================================================
"""Custom PyTorch operators."""

import torch

__all__: list[str] = []

# Register custom operators
torch.ops.import_module("apex.contrib.torchsched.ops.layer_norm")


================================================
FILE: apex/contrib/torchsched/ops/layer_norm.py
================================================
"""Customized CuDNN frontend layer norm.

Please refer to:

* https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/python/20_layernorm.ipynb
"""

from __future__ import annotations

import math

import cudnn
import torch

__all__ = ["get_cudnn_manager"]


class CuDNNManager:
    """CuDNN fronted context manager.

    Notice: CuDNN handle must be created after distributed process group initialization.
    """

    def __init__(self) -> None:
        self._handle = cudnn.create_handle()
        self._cudnn_stream = torch.cuda.Stream()
        self.reset_stream()

    def __del__(self) -> None:
        if cudnn is not None and hasattr(cudnn, "destroy_handle"):
            cudnn.destroy_handle(self._handle)

    def __enter__(self) -> CuDNNManager:
        self._torch_stream = torch.cuda.current_stream()
        self._cudnn_stream.wait_stream(self._torch_stream)
        torch.cuda.set_stream(self._cudnn_stream)
        return self

    def __exit__(
        self,
        exc_type: type | None,
        exc_val: Exception | None,
        exc_tb: object | None,
    ) -> None:
        self._torch_stream.wait_stream(self._cudnn_stream)
        torch.cuda.set_stream(self._torch_stream)
        del self._torch_stream

    def set_stream(self, stream: torch.cuda.Stream) -> None:
        cudnn.set_stream(stream=stream.cuda_stream, handle=self._handle)

    def reset_stream(self) -> None:
        cudnn.set_stream(stream=self._cudnn_stream.cuda_stream, handle=self._handle)

    @property
    def handle(self) -> int:
        return self._handle

    @property
    def stream(self) -> torch.cuda.Stream:
        return self._cudnn_stream


_global_cudnn_manager: CuDNNManager | None = None


def get_cudnn_manager() -> CuDNNManager:
    """Get the CuDNN front-end context manager.

    Returns:
        CuDNNManager: Global CuDNN manager.
    """
    global _global_cudnn_manager
    if _global_cudnn_manager is None:
        _global_cudnn_manager = CuDNNManager()
    return _global_cudnn_manager


class LayerNormGraphFactory:
    """cuDNN front-end layer norm graph factory.

    cuDNN layer norm constraints:

    * All tensors are 4-dimensional;
    * `x` and `y` have the same layout in the graph;
    """

    _graphs: dict = {}
    _symbols: dict = {}
    _TORCH2CUDNN: dict = {
        torch.bool: cudnn.data_type.BOOLEAN,
        torch.bfloat16: cudnn.data_type.BFLOAT16,
        torch.float16: cudnn.data_type.HALF,
        torch.float32: cudnn.data_type.FLOAT,
        torch.uint8: cudnn.data_type.UINT8,
    }

    @classmethod
    def get_forward_graph(
        cls: type[LayerNormGraphFactory],
        m: int,
        n: int,
        xdtype: torch.dtype,
        wdtype: torch.dtype,
    ) -> tuple[cudnn._compiled_module.pygraph, tuple]:
        key = m, n, xdtype, wdtype, "FORWARD"
        if key in cls._graphs:
            if key not in cls._symbols:
                raise RuntimeError(
                    f"Symbolic tensor was not constructed for layer-norm forward graph with input "
                    f"shape {(m, n)} and data type {(xdtype, wdtype)}",
                )
            return cls._graphs[key], cls._symbols[key]

        cudnn_manager: CuDNNManager = get_cudnn_manager()
        graph = cudnn.pygraph(
            intermediate_data_type=cudnn.data_type.FLOAT,
            compute_data_type=cudnn.data_type.FLOAT,
            handle=cudnn_manager.handle,
        )
        x_sym = graph.tensor(
            name="x_sym",
            dim=(m, n, 1, 1),
            stride=(n, 1, n, n),  # Simulate the channel-last format.
            data_type=cls._TORCH2CUDNN[xdtype],
        )
        scale_sym = graph.tensor(
            name="scale_sym",
            dim=(1, n, 1, 1),
            stride=(n, 1, n, n),
            data_type=cls._TORCH2CUDNN[wdtype],
        )
        bias_sym = graph.tensor(
            name="bias_sym",
            dim=(1, n, 1, 1),
            stride=(n, 1, n, n),
            data_type=cls._TORCH2CUDNN[wdtype],
        )
        eps_sym = graph.tensor(
            name="eps_sym",
            dim=(1, 1, 1, 1),
            stride=(1, 1, 1, 1),
            is_pass_by_value=True,
            data_type=cudnn.data_type.FLOAT,
        )

        y_sym, x_mean_sym, x_invstd_sym = graph.layernorm(
            name=f"layer-norm-forward-{key}",
            norm_forward_phase=cudnn.norm_forward_phase.TRAINING,
            input=x_sym,
            scale=scale_sym,
            bias=bias_sym,
            epsilon=eps_sym,
        )

        y_sym.set_output(True).set_data_type(cls._TORCH2CUDNN[xdtype])
        x_mean_sym.set_output(True).set_data_type(cls._TORCH2CUDNN[torch.float32])
        x_invstd_sym.set_output(True).set_data_type(cls._TORCH2CUDNN[torch.float32])

        graph.validate()
        graph.build_operation_graph()
        graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])
        graph.check_support()
        graph.build_plans(cudnn.build_plan_policy.HEURISTICS_CHOICE)  # ALL
        symbols = (
            x_sym,
            scale_sym,
            bias_sym,
            eps_sym,
            y_sym,
            x_mean_sym,
            x_invstd_sym,
        )

        cls._graphs[key] = graph
        cls._symbols[key] = symbols

        return graph, symbols

    @classmethod
    def get_backward_graph(
        cls: type[LayerNormGraphFactory],
        m: int,
        n: int,
        xdtype: torch.dtype,
        wdtype: torch.dtype,
    ) -> tuple[cudnn._compiled_module.pygraph, tuple]:
        key = m, n, xdtype, wdtype, "BACKWARD"
        if key in cls._graphs:
            if key not in cls._symbols:
                raise RuntimeError(
                    f"Symbolic tensor was not constructed for layer-norm backward "
                    f"graph with input shape {(m, n)} and data type {(xdtype, wdtype)}",
                )
            return cls._graphs[key], cls._symbols[key]

        cudnn_manager: CuDNNManager = get_cudnn_manager()
        graph = cudnn.pygraph(
            intermediate_data_type=cudnn.data_type.FLOAT,
            compute_data_type=cudnn.data_type.FLOAT,
            handle=cudnn_manager.handle,
        )
        x_sym = graph.tensor(
            name="x_sym",
            dim=(m, n, 1, 1),
            stride=(n, 1, n, n),  # Simulate the channel-last format.
            data_type=cls._TORCH2CUDNN[xdtype],
        )
        d_y_sym = graph.tensor(
            name="d_y_sym",
            dim=(m, n, 1, 1),
            stride=(n, 1, n, n),  # Simulate the channel-last format.
            data_type=cls._TORCH2CUDNN[xdtype],
        )
        scale_sym = graph.tensor(
            name="scale_sym",
            dim=(1, n, 1, 1),
            stride=(n, 1, n, n),
            data_type=cls._TORCH2CUDNN[wdtype],
        )
        x_mean_sym = graph.tensor(
            name="x_mean_sym",
            dim=(m, 1, 1, 1),
            stride=(1, 1, 1, 1),
            data_type=cudnn.data_type.FLOAT,
        )
        x_invstd_sym = graph.tensor(
            name="x_invstd_sym",
            dim=(m, 1, 1, 1),
            stride=(1, 1, 1, 1),
            data_type=cudnn.data_type.FLOAT,
        )
        d_x_sym, d_scale_sym, d_bias_sym = graph.layernorm_backward(
            name=f"layer-norm-backward-{key}",
            grad=d_y_sym,
            input=x_sym,
            scale=scale_sym,
            mean=x_mean_sym,
            inv_variance=x_invstd_sym,
        )

        d_x_sym.set_output(True).set_data_type(cls._TORCH2CUDNN[xdtype])
        d_scale_sym.set_output(True).set_data_type(cls._TORCH2CUDNN[wdtype])
        d_bias_sym.set_output(True).set_data_type(cls._TORCH2CUDNN[wdtype])

        graph.validate()
        graph.build_operation_graph()
        graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])
        graph.check_support()
        graph.build_plans(cudnn.build_plan_policy.HEURISTICS_CHOICE)  # ALL
        symbols = (
            x_sym,
            d_y_sym,
            scale_sym,
            x_mean_sym,
            x_invstd_sym,
            d_x_sym,
            d_scale_sym,
            d_bias_sym,
        )

        cls._graphs[key] = graph
        cls._symbols[key] = symbols

        return graph, symbols


@torch.library.custom_op("cudnn::layer_norm", mutates_args=(), device_types="cuda")
def layer_norm(
    x: torch.Tensor,
    normalized_shape: list[int],
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-05,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    # PyTorch LayerNorm:
    #   * Shape (N, S, H), normalized_shape (H,);
    #   * Shape (N, C, H, W), normalized_shape (C, H, W);
    # cuDNN LayerNorm expects shape (M, N, 1, 1) and normalized_shape (1, N, 1, 1)
    if tuple(x.shape[-len(normalized_shape) :]) != tuple(normalized_shape):  # noqa: E203
        raise ValueError(
            f"CuDNN LayerNorm expects `x.shape[{-len(normalized_shape)}:]` equals to "
            f"`normalized_shape`, but got:\n    {x.shape=}, {normalized_shape=}",
        )
    assert weight.dtype == bias.dtype
    assert x.is_contiguous()

    stream = torch.cuda.current_stream()
    cudnn_manager: CuDNNManager = get_cudnn_manager()
    cudnn_manager.set_stream(stream)

    xdtype, wdtype, device = x.dtype, weight.dtype, x.device
    m, n = math.prod(x.shape[: -len(normalized_shape)]), math.prod(normalized_shape)
    (
        forward_graph,
        (
            x_sym,
            scale_sym,
            bias_sym,
            eps_sym,
            y_sym,
            x_mean_sym,
            x_invstd_sym,
        ),
    ) = LayerNormGraphFactory.get_forward_graph(m, n, xdtype, wdtype)

    x_contiguous = x.reshape(m, n, 1, 1)  # NOTE: x could be noncontiguous.
    weight = weight.view(1, n, 1, 1)
    bias = bias.view(1, n, 1, 1)
    eps_cpu = torch.full((1, 1, 1, 1), eps, dtype=torch.float32, device="cpu")

    y = torch.empty_like(x_contiguous)
    x_mean = torch.empty(m, 1, 1, 1, dtype=torch.float32, device=device)
    x_invstd = torch.empty(m, 1, 1, 1, dtype=torch.float32, device=device)
    workspace = torch.empty(
        forward_graph.get_workspace_size(),
        dtype=torch.uint8,
        device=device,
    )

    forward_graph.execute(
        {
            x_sym: x_contiguous.detach(),
            scale_sym: weight.detach(),
            bias_sym: bias.detach(),
            eps_sym: eps_cpu.detach(),
            y_sym: y.detach(),
            x_mean_sym: x_mean.detach(),
            x_invstd_sym: x_invstd.detach(),
        },
        workspace,
    )
    y = y.view(x.shape)

    return y, x_mean, x_invstd


@layer_norm.register_fake
def layer_norm_fake(
    x: torch.Tensor,
    normalized_shape: list[int],
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-05,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    m = math.prod(x.shape[: -len(normalized_shape)])

    y = torch.empty_like(x)
    x_mean = torch.empty(m, 1, 1, 1, dtype=torch.float32, device=x.device)
    x_invstd = torch.empty(m, 1, 1, 1, dtype=torch.float32, device=x.device)
    return y, x_mean, x_invstd


@torch.library.custom_op(
    "cudnn::layer_norm_backward",
    mutates_args=(),
    device_types="cuda",
)
def layer_norm_backward(
    d_y: torch.Tensor,
    x_mean: torch.Tensor,
    x_invstd: torch.Tensor,
    x: torch.Tensor,
    normalized_shape: list[int],
    weight: torch.Tensor,
    bias: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    xdtype, wdtype, device = d_y.dtype, weight.dtype, d_y.device
    m, n = math.prod(x.shape[: -len(normalized_shape)]), math.prod(normalized_shape)

    stream = torch.cuda.current_stream()
    cudnn_manager: CuDNNManager = get_cudnn_manager()
    cudnn_manager.set_stream(stream)

    (
        backward_graph,
        (
            x_sym,
            d_y_sym,
            scale_sym,
            x_mean_sym,
            x_invstd_sym,
            d_x_sym,
            d_scale_sym,
            d_bias_sym,
        ),
    ) = LayerNormGraphFactory.get_backward_graph(m, n, xdtype, wdtype)

    d_y_contiguous = d_y.reshape(m, n, 1, 1)  # NOTE: d_y could also be noncontiguous.
    d_x = torch.empty_like(x)
    d_weight = torch.empty_like(weight)
    d_bias = torch.empty_like(bias)
    workspace = torch.empty(
        backward_graph.get_workspace_size(),
        dtype=torch.uint8,
        device=device,
    )

    backward_graph.execute(
        {
            x_sym: x.detach(),
            d_y_sym: d_y_contiguous.detach(),
            scale_sym: weight.detach(),
            x_mean_sym: x_mean.detach(),
            x_invstd_sym: x_invstd.detach(),
            d_x_sym: d_x.detach(),
            d_scale_sym: d_weight.detach(),
            d_bias_sym: d_bias.detach(),
        },
        workspace,
    )

    return d_x, d_weight, d_bias


@layer_norm_backward.register_fake
def layer_norm_backward_fake(
    d_y: torch.Tensor,
    x_mean: torch.Tensor,
    x_invstd: torch.Tensor,
    x: torch.Tensor,
    normalized_shape: list[int],
    weight: torch.Tensor,
    bias: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    d_x = torch.empty_like(x)
    d_weight = torch.empty_like(weight)
    d_bias = torch.empty_like(bias)
    return d_x, d_weight, d_bias


def layer_norm_setup_context(
    ctx: torch.autograd.FunctionCtx,
    inputs: tuple,
    output: tuple,
) -> torch.Tensor:
    x, normalized_shape, weight, bias, eps = inputs
    y, x_mean, x_invstd = output

    ctx.save_for_backward(x, weight, bias, x_mean, x_invstd)
    ctx.normalized_shape = normalized_shape

    return y


def layer_norm_backward_wrapper(
    ctx: torch.autograd.FunctionCtx,
    d_y: torch.Tensor,
    d_x_mean: torch.Tensor,
    d_x_invstd: torch.Tensor,
) -> tuple[torch.Tensor, None, torch.Tensor, torch.Tensor, None]:
    x, weight, bias, x_mean, x_invstd = ctx.saved_tensors
    normalized_shape = ctx.normalized_shape

    d_x, d_weight, d_bias = layer_norm_backward(
        d_y,
        x_mean,
        x_invstd,
        x,
        normalized_shape,
        weight,
        bias,
    )

    return d_x, None, d_weight, d_bias, None


torch.library.register_autograd(
    "cudnn::layer_norm",
    layer_norm_backward_wrapper,
    setup_context=layer_norm_setup_context,
)


================================================
FILE: apex/contrib/torchsched/passes/__init__.py
================================================
"""Customized compiler passes."""

from __future__ import annotations

from apex.contrib.torchsched.passes.pre_grad_passes import pre_grad_custom_pass

__all__ = ["pre_grad_custom_pass"]


================================================
FILE: apex/contrib/torchsched/passes/pre_grad_passes.py
================================================
"""Customized Inductor passes."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import torch
from torch._dynamo.utils import counters
from torch.fx import replace_pattern

if TYPE_CHECKING:
    from collections.abc import Callable
    from collections.abc import Sequence

from apex.contrib.torchsched import config

__all__ = ["pre_grad_custom_pass"]

# pass name to (pattern replacement) mapping
PRE_GRAD_PASS_PATTERNS: dict[str, tuple[Callable, Callable]] = {}


def register_pattern(name: str, pattern: Callable, replacement: Callable) -> None:
    assert name not in PRE_GRAD_PASS_PATTERNS
    PRE_GRAD_PASS_PATTERNS[name] = pattern, replacement


def replace_layer_norm(
    x: torch.Tensor,
    normalized_shape: Sequence[int],
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float,
) -> torch.Tensor:
    y, x_mean, x_invstd = torch.ops.cudnn.layer_norm(
        x,
        normalized_shape,
        weight,
        bias,
        eps,
    )
    return y


register_pattern(
    "cudnn_layer_norm",
    torch.nn.functional.layer_norm,
    replace_layer_norm,
)


def run_pre_grad_pass(
    name: str,
    graph: torch.fx.Graph,
    pattern: Callable,
    replacement: Callable,
) -> int:
    """Run a pre-gradient pass on the given graph.

    Args:
        name (str): A string identifier for the pass.
        graph (torch.fx.Graph): The graph to be transformed.
        pattern (Callable): A callable that defines the pattern to match in the graph.
        replacement (Callable): A callable that defines the replacement for matched patterns.

    Returns:
        An integer representing the number of transformations applied.

    Note:
        These two doesn't match because of kwargs (Inductor vs. torch.fx.symbolic_trace):

            %layer_norm : [num_users=1] = call_function[target=torch.nn.functional.layer_norm](
                args = (%l_args_0_, (320,), %l_fn_parameters_weight_, %l_fn_parameters_bias_,
                1e-05), kwargs = {})
            %layer_norm : [num_users=1] = call_function[target=torch.nn.functional.layer_norm](
                args = (%input_1, %normalized_shape), kwargs = {weight: %weight, bias: %bias,
                eps: %eps})
    """
    # Manually trace the graph and move kwargs to args
    pattern_graph = torch.fx.symbolic_trace(pattern).graph
    for node in pattern_graph.nodes:
        if node.op == "call_function" and node.target == pattern:
            node.args = node.args + tuple(node.kwargs.values())
            node.kwargs = {}
    pattern_graph.owning_module.recompile()

    matched = replace_pattern(graph.owning_module, pattern_graph, replacement)
    graph.owning_module.recompile()
    graph.lint()

    return len(matched)


def pre_grad_custom_pass(graph: torch.fx.Graph) -> None:
    """Run customized pre-grad passes.

    Args:
        graph (torch.fx.Graph): The FX graph to be optimized.
    """
    passes = config.pre_grad_pass_options
    for pass_name in passes:
        assert pass_name in PRE_GRAD_PASS_PATTERNS, f"Unknown pre_grad pass: {pass_name}"
        pattern, replacement = PRE_GRAD_PASS_PATTERNS[pass_name]
        replaced = run_pre_grad_pass(pass_name, graph, pattern, replacement)
        counters["torchsched"][f"pre_grad_{pass_name}"] += replaced
        logging.debug("Pre grad pass %s replaced %d sub-graphs", pass_name, replaced)


================================================
FILE: apex/contrib/transducer/__init__.py
================================================
from .transducer import TransducerJoint
from .transducer import TransducerLoss
from . import _transducer_ref


================================================
FILE: apex/contrib/transducer/_transducer_ref.py
================================================
import torch


def transducer_loss_reference(x, label, f_len, y_len, blank_idx, loss_grad):
    def log_sum_exp(a, b):
        if a >= b:
            return a + torch.log(1 + torch.exp(b - a))
        else:
            return b + torch.log(1 + torch.exp(a - b))

    def forward_alpha(x, label, f_len, y_len, blank_idx):
        B, T, U, V = x.size()
        acc_t = torch.float32 if x.dtype in [torch.float16, torch.float32] else x.dtype
        alpha = torch.zeros((B, T, U), dtype=acc_t, device=x.device)
        for b in range(B):
            alpha[b, 0, 0] = 0
            for t in range(1, f_len[b]):
                alpha[b, t, 0] = alpha[b, t - 1, 0] + x[b, t - 1, 0, blank_idx]
            for u in range(1, y_len[b] + 1):
                alpha[b, 0, u] = alpha[b, 0, u - 1] + x[b, 0, u - 1, label[b, u - 1]]
            for t in range(1, f_len[b]):
                for u in range(1, y_len[b] + 1):
                    curr_ = alpha[b, t - 1, u] + x[b, t - 1, u, blank_idx]
                    next_ = alpha[b, t, u - 1] + x[b, t, u - 1, label[b, u - 1]]
                    alpha[b, t, u] = log_sum_exp(curr_, next_)
        return alpha

    def forward_beta(x, label, f_len, y_len, blank_idx):
        B, T, U, V = x.shape
        acc_t = torch.float32 if x.dtype in [torch.float16, torch.float32] else x.dtype
        beta = torch.zeros((B, T, U), dtype=acc_t, device=x.device)
        for b in range(B):
            beta[b, f_len[b] - 1, y_len[b]] = x[b, f_len[b] - 1, y_len[b], blank_idx]
            for t in range(f_len[b] - 2, -1, -1):
                beta[b, t, y_len[b]] = beta[b, t + 1, y_len[b]] + x[b, t, y_len[b], blank_idx]
            for u in range(y_len[b] - 1, -1, -1):
                beta[b, f_len[b] - 1, u] = (
                    beta[b, f_len[b] - 1, u + 1] + x[b, f_len[b] - 1, u, label[b, u]]
                )
            for t in range(f_len[b] - 2, -1, -1):
                for u in range(y_len[b] - 1, -1, -1):
                    curr_ = beta[b, t + 1, u] + x[b, t, u, blank_idx]
                    next_ = beta[b, t, u + 1] + x[b, t, u, label[b, u]]
                    beta[b, t, u] = log_sum_exp(curr_, next_)
        return beta

    def backward(x, label, f_len, y_len, alpha, beta, loss_grad, blank_idx):
        grad = torch.zeros_like(x)
        B, T, U, V = x.size()
        for b in range(B):
            common_factor = torch.log(loss_grad[b]) + alpha - beta[b, 0, 0]
            # next
            for u in range(y_len[b]):
                grad[b, : f_len[b], u, label[b, u]] = -torch.exp(
                    common_factor[b, : f_len[b], u]
                    + beta[b, : f_len[b], u + 1]
                    + x[b, : f_len[b], u, label[b, u]]
                )

            # current
            grad[b, : f_len[b] - 1, : y_len[b] + 1, blank_idx] = -torch.exp(
                common_factor[b, : f_len[b] - 1, : y_len[b] + 1]
                + beta[b, 1 : f_len[b], : y_len[b] + 1]
                + x[b, : f_len[b] - 1, : y_len[b] + 1, blank_idx]
            )

            grad[b, f_len[b] - 1, y_len[b], blank_idx] = -torch.exp(
                common_factor[b, f_len[b] - 1, y_len[b]] + x[b, f_len[b] - 1, y_len[b], blank_idx]
            )

        return grad

    x_log = torch.nn.functional.log_softmax(x, dim=-1)
    alpha = forward_alpha(x_log, label, f_len, y_len, blank_idx)
    beta = forward_beta(x_log, label, f_len, y_len, blank_idx)
    grad = backward(x_log, label, f_len, y_len, alpha, beta, loss_grad, blank_idx)
    x_log.backward(grad)
    loss = -beta[:, 0, 0]
    loss = loss.to(x.dtype)
    return alpha, beta, x.grad, loss


def transducer_joint_reference(
    f, g, h_grad, f_len, g_len, pack_output, relu, dropout, dropout_prob=0, mask=None
):
    if dropout and mask == None:
        raise NotImplementedError("mask needs to supplied to test dropout.")
    B, T, H = f.size()
    U = g.size(1)
    f_expand = f.unsqueeze(dim=2)
    g_expand = g.unsqueeze(dim=1)
    h = f_expand + g_expand
    if relu:
        h = torch.nn.functional.relu(h)
    if dropout:
        h *= mask
        scale = 1 / (1 - dropout_prob)
        h *= scale
    h.backward(h_grad)

    if pack_output == False:
        # intentionally set don't-care region to -1 to test if transducer joint
        # write these regions to avoid NaN and inf
        for b in range(B):
            h[b, f_len[b] :] = -1
            h[b, :, g_len[b] :] = -1

        return h, f.grad, g.grad

    # packing
    list_to_pack = []
    for b in range(B):
        list_to_pack.append(h[b, : f_len[b], : g_len[b], :].reshape(-1, H))
    h_packed = torch.cat(list_to_pack)
    return h_packed, f.grad, g.grad


================================================
FILE: apex/contrib/transducer/transducer.py
================================================
import torch
import transducer_loss_cuda
import transducer_joint_cuda


class TransducerJoint(torch.nn.Module):
    """Transducer joint
    Detail of this loss function can be found in: Sequence Transduction with Recurrent Neural
    Networks

    Arguments:
        pack_output (bool, optional): whether to pack the output in a compact form with don't-care
        data being removed. (default: False)
        relu (bool, optional): apply ReLU to the output of the joint operation. Requires opt=1
        (default: False)
        dropout (bool, optional): apply dropout to the output of the joint operation. Requires opt=1
        (default: False)
        opt (int, optional): pick the optimization level in [0, 1]. opt=1 picks a tiled algorithm.
            (default: 1)
        fwd_tile_size (int, optional): tile size used in forward operation. This argument will be
        ignored if opt != 1. (default: 4)
        dropout_prob (float, optional): dropout probability. (default: 0.0)
        probe_mask (bool, optional): a flag used to probe the mask generated by ReLU and/or dropout
        operation. When this argument is set to True, the mask can be accessed through
        self.mask_probe. (default: false)
    """

    def __init__(
        self,
        pack_output=False,
        relu=False,
        dropout=False,
        opt=1,
        fwd_tile_size=4,
        dropout_prob=0,
        probe_mask=False,
    ):
        super(TransducerJoint, self).__init__()
        self.pack_output = pack_output
        self.relu = relu
        self.dropout = dropout
        self.dropout_prob = dropout_prob
        self.opt = opt
        self.fwd_tile_size = fwd_tile_size
        self.dummy_batch_offset = torch.empty(0)
        masked = self.relu or self.dropout
        self.mask_probe = [] if masked and probe_mask else None
        if masked and opt != 1:
            raise NotImplementedError("ReLU and dropout fusion is only supported with opt=1")

    def forward(self, f, g, f_len, g_len, batch_offset=None, packed_batch=0):
        """Forward operation of transducer joint

        Arguments:
            f (tensor): transcription vector from encode block of shape (B, T, H).
            g (tensor): prediction vector form predict block of shape (B, U, H).
            f_len (tensor): length of transcription vector for each batch.
            g_len (tensor): length of prediction vector minus 1 for each batch.
            batch_offset (tensor, optional): tensor containing the offset of each batch
                in the results. For example, batch offset can be obtained from:
                batch_offset = torch.cumsum(f_len*g_len, dim=0)
                This argument is required if pack_output == True, and is ignored if
                pack_output == False. (default: None)
            packed_batch (int, optional): the batch size after packing. This argument is
                ignored if pack_output == False. (default: 0)
        """
        my_batch_offset = batch_offset if self.pack_output else self.dummy_batch_offset
        if self.pack_output and (batch_offset is None or packed_batch == 0):
            raise Exception("Please specify batch_offset and packed_batch when packing is enabled")
        dropout = self.dropout and self.training  # only dropout for training
        return TransducerJointFunc.apply(
            f,
            g,
            f_len,
            g_len,
            self.pack_output,
            self.relu,
            dropout,
            my_batch_offset,
            packed_batch,
            self.opt,
            self.fwd_tile_size,
            self.dropout_prob,
            self.mask_probe,
        )


class TransducerLoss(torch.nn.Module):
    """Transducer loss
    Detail of this loss function can be found in: Sequence Transduction with Recurrent Neural
    Networks

    Arguments:
        fuse_softmax_backward (bool, optional) whether to fuse the backward of transducer loss with
            softmax. (default: True)
        opt (int, optional): pick the optimization level in [0, 1]. opt=1 picks a more optimized
            algorithm. In some cases, opt=1 might fall back to opt=0. (default: 1)
        packed_input (bool, optional): whether to pack the output in a compact form with don't-care
        data being removed. (default: False)
    """

    def __init__(self, fuse_softmax_backward=True, opt=1, packed_input=False):
        super(TransducerLoss, self).__init__()
        self.fuse_softmax_backward = fuse_softmax_backward
        self.opt = opt
        self.packed_input = packed_input
        self.dummy_batch_offset = torch.empty(0)

    def forward(
        self,
        x,
        label,
        f_len,
        y_len,
        blank_idx,
        batch_offset=None,
        max_f_len=None,
        debug_list=None,
    ):
        """Forward operation of transducer joint

        Arguments:
            x (tensor): input tensor to the loss function with a shape of (B, T, U, H).
            label (tensor): labels for the input data.
            f_len (tensor): lengths of the inputs in the time dimension for each batch.
            y_len (tensor): lengths of the labels for each batch.
            blank_idx (int): index for the null symbol.
            batch_offset (tensor, optional): tensor containing the offset of each batch
                in the input. For example, batch offset can be obtained from:
                batch_offset = torch.cumsum(f_len*(y_len+1), dim=0)
                This argument is required if packed_input == True, and is ignored if
                packed_input == False. (default: None)
            max_f_len (int, optional): maximum length of the input in the time dimension.
                For example, it can be obtained as
                max_f_len = max(f_len)
                This argument is required if packed_input == True, and is ignored if
                packed_input == False. (default: None)
                (default: None)
            debug_list (list, optional): when an empty list is supplied, Alpha and Beta generated
                in the forward operation will be attached to this list for debug purpose.
                (default: None)
        """
        if self.packed_input:
            if batch_offset is None or max_f_len is None:
                raise Exception(
                    "Please specify batch_offset and max_f_len when packing is \
                                    enabled"
                )
            my_batch_offset = batch_offset
            my_max_f_len = max_f_len
        else:
            my_batch_offset = self.dummy_batch_offset
            my_max_f_len = x.size(1)
        return TransducerLossFunc.apply(
            x,
            label,
            f_len,
            y_len,
            my_batch_offset,
            my_max_f_len,
            blank_idx,
            self.fuse_softmax_backward,
            debug_list,
            self.opt,
            self.packed_input,
        )


class TransducerLossFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        x,
        label,
        f_len,
        y_len,
        batch_offset,
        max_f_len,
        blank_idx,
        fuse_softmax_backward,
        debug_list,
        opt,
        packed_input,
    ):
        if fuse_softmax_backward == False:
            with torch.enable_grad():
                x = torch.nn.functional.log_softmax(x, dim=-1)
        else:
            x = torch.nn.functional.log_softmax(x, dim=-1)
        alpha, beta, loss = transducer_loss_cuda.forward(
            x,
            label,
            f_len,
            y_len,
            batch_offset,
            max_f_len,
            blank_idx,
            opt,
            packed_input,
        )
        if debug_list == []:
            debug_list += [alpha, beta]
        ctx.save_for_backward(x, alpha, beta, f_len, y_len, label, batch_offset)
        ctx.blank_idx = blank_idx
        ctx.fuse_softmax_backward = fuse_softmax_backward
        ctx.opt = opt
        ctx.packed_input = packed_input
        ctx.max_f_len = max_f_len
        return loss

    @staticmethod
    def backward(ctx, loss_grad):
        x, alpha, beta, f_len, y_len, label, batch_offset = ctx.saved_tensors
        x_grad = transducer_loss_cuda.backward(
            x,
            loss_grad,
            alpha,
            beta,
            f_len,
            y_len,
            label,
            batch_offset,
            ctx.max_f_len,
            ctx.blank_idx,
            ctx.opt,
            ctx.fuse_softmax_backward,
            ctx.packed_input,
        )
        if ctx.fuse_softmax_backward == False:
            x_grad = x.backward(x_grad)
        return x_grad, None, None, None, None, None, None, None, None, None, None


class TransducerJointFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        f,
        g,
        f_len,
        g_len,
        pack_output,
        relu,
        dropout,
        batch_offset,
        packed_batch,
        opt,
        fwd_tile_size,
        dropout_prob,
        mask_probe,
    ):
        h = transducer_joint_cuda.forward(
            f,
            g,
            f_len,
            g_len,
            batch_offset,
            packed_batch,
            opt,
            pack_output,
            relu,
            dropout,
            dropout_prob,
            fwd_tile_size,
        )
        masked = relu or dropout
        if masked:
            ctx.save_for_backward(h[1], f_len, g_len, batch_offset)
            if mask_probe is not None:
                mask_probe.append(h[1])
        else:
            ctx.save_for_backward(f_len, g_len, batch_offset)

        ctx.pack_output = pack_output
        ctx.masked = relu or dropout
        ctx.max_f_len = f.size(1)
        ctx.max_g_len = g.size(1)
        ctx.scale = 1 / (1 - dropout_prob) if dropout and dropout_prob != 1 else 1
        return h[0]

    @staticmethod
    def backward(ctx, loss_grad):
        if ctx.masked:
            mask, f_len, g_len, batch_offset = ctx.saved_tensors
            inp = [loss_grad, mask]
        else:
            f_len, g_len, batch_offset = ctx.saved_tensors
            inp = [loss_grad]

        f_grad, g_grad = transducer_joint_cuda.backward(
            inp,
            f_len,
            g_len,
            batch_offset,
            ctx.max_f_len,
            ctx.max_g_len,
            ctx.pack_output,
            ctx.scale,
        )

        return (
            f_grad,
            g_grad,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )


================================================
FILE: apex/contrib/xentropy/__init__.py
================================================
from .softmax_xentropy import SoftmaxCrossEntropyLoss


__all__ = [
    "SoftmaxCrossEntropyLoss",
]


================================================
FILE: apex/contrib/xentropy/softmax_xentropy.py
================================================
import torch

import xentropy_cuda


class SoftmaxCrossEntropyLoss(torch.autograd.Function):
    @staticmethod
    def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False):
        losses, max_log_sum_exp = xentropy_cuda.forward(logits, labels, smoothing, half_to_float)
        losses.masked_fill_(labels == padding_idx, 0)

        ctx.save_for_backward(
            logits,
            max_log_sum_exp,
            labels,
            torch.FloatTensor([smoothing]),
            torch.LongTensor([padding_idx]),
        )

        return losses

    @staticmethod
    def backward(ctx, grad_loss):
        logits, max_log_sum_exp, labels, smoothing, padding_idx = ctx.saved_tensors

        if not grad_loss.is_contiguous():
            grad_loss = grad_loss.contiguous()
        grad_loss.masked_fill_(labels == padding_idx.item(), 0)
        grad_logits = xentropy_cuda.backward(
            grad_loss.contiguous(), logits, max_log_sum_exp, labels, smoothing.item()
        )

        return grad_logits, None, None, None, None


================================================
FILE: apex/distributed_testing/__init__.py
================================================
"""Distributed testing utilities."""

from apex.distributed_testing.distributed_test_base import (
    DistributedTestBase,
    NcclDistributedTestBase,
    UccDistributedTestBase,
)

__all__ = [
    "DistributedTestBase",
    "NcclDistributedTestBase",
    "UccDistributedTestBase",
]


================================================
FILE: apex/distributed_testing/_ucc_util.py
================================================
from torch import distributed as dist

HAS_UCC = hasattr(dist, "is_ucc_available") and dist.is_ucc_available()
if not HAS_UCC:
    try:
        import torch_ucc

        HAS_UCC = True
    except ImportError:
        HAS_UCC = False


================================================
FILE: apex/distributed_testing/distributed_test_base.py
================================================
import os
import sys
import unittest
from packaging.version import Version, parse

import torch
from torch import distributed as dist
from torch.utils import collect_env
from torch.testing._internal import common_utils
from torch.testing._internal import common_distributed

from apex.distributed_testing._ucc_util import HAS_UCC

# NOTE(mkozuki): Version guard for ucc. ref: https://github.com/openucx/ucc/issues/496
_TORCH_UCC_COMPAT_NVIDIA_DRIVER_VERSION = Version("470.42.01")
_driver_version = None
if torch.cuda.is_available():
    _driver_version = parse(collect_env.get_nvidia_driver_version(collect_env.run))
HAS_TORCH_UCC_COMPAT_NVIDIA_DRIVER = (
    _driver_version is not None and _driver_version >= _TORCH_UCC_COMPAT_NVIDIA_DRIVER_VERSION
)


class DistributedTestBase(common_distributed.MultiProcessTestCase):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def setUp(self) -> None:
        super().setUp()
        self._setup_pre_spawn()
        self._spawn_processes()

    def tearDown(self) -> None:
        torch.cuda.empty_cache()
        super().tearDown()

    @property
    def world_size(self) -> int:
        return min(torch.cuda.device_count(), 4)

    @property
    def init_method(self):
        return f"{common_utils.FILE_SCHEMA}{self.file_name}"

    @property
    def destroy_pg_upon_exit(self) -> bool:
        # Overriding base test class: do not auto destroy PG upon exit.
        return False

    @classmethod
    def _run(cls, rank, test_name, file_name, pipe, **kwargs):
        self = cls(test_name)
        self.assertTrue(torch.cuda.is_available())
        self.assertTrue(hasattr(self, "DISTRIBUTED_BACKEND"))
        self.rank = rank
        self.file_name = file_name

        print(f"[dist init] rank = {self.rank}, world_size = {self.world_size}")

        try:
            dist.init_process_group(
                init_method=self.init_method,
                backend=self.DISTRIBUTED_BACKEND,
                world_size=int(self.world_size),
                rank=self.rank,
            )
        except RuntimeError as e:
            if "recompile" in e.args[0]:
                print(f"Backend of {self.DISTRIBUTED_BACKEND} not available")
                sys.exit(0)
            raise

        torch.cuda.set_device(self.rank % torch.cuda.device_count())

        dist.barrier()
        self.run_test(test_name, pipe)
        dist.barrier()

        dist.destroy_process_group()
        sys.exit(0)

    def _setup_pre_spawn(self):
        pass


class NcclDistributedTestBase(DistributedTestBase):
    DISTRIBUTED_BACKEND = "nccl"


@unittest.skipUnless(
    HAS_UCC,
    "Requires either torch ucc or pytorch build from source with native ucc installed and enabled",
)
@unittest.skipUnless(
    HAS_TORCH_UCC_COMPAT_NVIDIA_DRIVER,
    f"`torch_ucc` requires NVIDIA driver >= {_TORCH_UCC_COMPAT_NVIDIA_DRIVER_VERSION} but {_driver_version} found. "
    "See https://github.com/openucx/ucc/issues/496",
)
class UccDistributedTestBase(DistributedTestBase):
    DISTRIBUTED_BACKEND = "ucc"

    def _setup_pre_spawn(self) -> None:
        self.master_addr = "localhost"
        os.environ["MASTER_ADDR"] = "localhost"
        self._has_master_port = "MASTER_PORT" in os.environ
        if self._has_master_port:
            self.master_port = os.environ["MASTER_PORT"]
        else:
            try:
                from caffe2.torch.fb.common.utils import get_free_port

                self.master_port = str(get_free_port())
            except ImportError:
                self.master_port = "12375"
            os.environ["MASTER_PORT"] = self.master_port

        self._has_ucx_tls = "UCX_TLS" in os.environ
        if not self._has_ucx_tls:
            os.environ["UCX_TLS"] = "tcp,cuda"
        print('os.environ["UCX_TLS"] = {}'.format(os.environ["UCX_TLS"]))

    def tearDown(self) -> None:
        super().tearDown()
        if not self._has_master_port:
            del os.environ["MASTER_PORT"]
        if not self._has_ucx_tls:
            del os.environ["UCX_TLS"]

    @property
    def init_method(self):
        return "tcp://localhost:" + os.environ["MASTER_PORT"]


================================================
FILE: apex/fused_dense/__init__.py
================================================
from .fused_dense import *


================================================
FILE: apex/fused_dense/fused_dense.py
================================================
import torch
from torch import nn
import fused_dense_cuda
from apex._autocast_utils import _cast_if_autocast_enabled


# implements fused GEMM+bias in forward pass using mlp_cuda from apex
class FusedDenseFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight, bias):
        ctx.save_for_backward(input, weight)
        output = fused_dense_cuda.linear_bias_forward(input, weight, bias)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, weight = ctx.saved_tensors
        grad_input, grad_weight, grad_bias = fused_dense_cuda.linear_bias_backward(
            input, weight, grad_output
        )
        return grad_input, grad_weight, grad_bias


class DenseNoBiasFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight):
        ctx.save_for_backward(input, weight)
        output = torch.matmul(input, weight.t())
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, weight = ctx.saved_tensors
        grad_input = grad_output.mm(weight)
        grad_weight = grad_output.t().mm(input)
        return grad_input, grad_weight


class FusedDenseGeluDenseFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight1, bias1, weight2, bias2):
        ctx.save_for_backward(input, weight1, weight2)
        output1, output2, gelu_in = fused_dense_cuda.linear_gelu_linear_forward(
            input, weight1, bias1, weight2, bias2
        )
        ctx.save_for_backward(input, weight1, weight2, gelu_in, output1)
        return output2

    @staticmethod
    def backward(ctx, grad_output):
        input, weight1, weight2, gelu_in, output1 = ctx.saved_tensors
        grad_input, grad_weight1, grad_bias1, grad_weight2, grad_bias2 = (
            fused_dense_cuda.linear_gelu_linear_backward(
                input, gelu_in, output1, weight1, weight2, grad_output
            )
        )
        return grad_input, grad_weight1, grad_bias1, grad_weight2, grad_bias2


def _fused_dense(input, weight, bias):
    args = _cast_if_autocast_enabled(input, weight, bias)
    with torch.amp.autocast("cuda", enabled=False):
        return FusedDenseFunc.apply(*args)


def _dense_no_bias(input, weight):
    args = _cast_if_autocast_enabled(input, weight)
    with torch.amp.autocast("cuda", enabled=False):
        return DenseNoBiasFunc.apply(*args)


def _fused_dense_gelu_dense(input, weight1, bias1, weight2, bias2):
    args = _cast_if_autocast_enabled(input, weight1, bias1, weight2, bias2)
    with torch.amp.autocast("cuda", enabled=False):
        return FusedDenseGeluDenseFunc.apply(*args)


class FusedDense(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(FusedDense, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            # assert False, "no-bias option not added yet"
            self.register_parameter("bias", None)

    def forward(self, input):
        if self.bias is not None:
            return _fused_dense(input, self.weight, self.bias)
        else:
            return _dense_no_bias(input, self.weight)


class FusedDenseGeluDense(nn.Module):
    def __init__(self, in_features, intermediate_features, out_features, bias=True):
        super(FusedDenseGeluDense, self).__init__()
        assert bias == True, "DenseGeluDense module without bias is currently not supported"
        self.in_features = in_features
        self.intermediate_features = intermediate_features
        self.out_features = out_features
        self.weight1 = nn.Parameter(torch.empty(intermediate_features, in_features))
        self.bias1 = nn.Parameter(torch.empty(intermediate_features))
        self.weight2 = nn.Parameter(torch.empty(out_features, intermediate_features))
        self.bias2 = nn.Parameter(torch.empty(out_features))

    def forward(self, input):
        return _fused_dense_gelu_dense(input, self.weight1, self.bias1, self.weight2, self.bias2)


================================================
FILE: apex/mlp/__init__.py
================================================
from .mlp import *


================================================
FILE: apex/mlp/mlp.py
================================================
from copy import copy
import math

import torch
from torch import nn

from apex._autocast_utils import _cast_if_autocast_enabled
import mlp_cuda


class MlpFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, bias, activation, *args):
        output = mlp_cuda.forward(bias, activation, args)
        ctx.save_for_backward(*args)
        ctx.outputs = output
        ctx.bias = bias
        ctx.activation = activation
        return output[0]

    @staticmethod
    def backward(ctx, grad_o):
        grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
        del ctx.outputs
        return (None, None, *grads)


def mlp_function(bias, activation, *args):
    autocast_args = _cast_if_autocast_enabled(bias, activation, *args)
    return MlpFunction.apply(*autocast_args)


class MLP(torch.nn.Module):
    """Launch MLP in C++

    Args:
        mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024
        bias (bool): Default True:
        relu (bool): Default True
    """

    def __init__(self, mlp_sizes, bias=True, activation="relu"):
        super().__init__()
        self.num_layers = len(mlp_sizes) - 1
        self.mlp_sizes = copy(mlp_sizes)
        self.bias = 1 if bias else 0

        if activation == "none":
            self.activation = 0
        elif activation == "relu":
            self.activation = 1
        elif activation == "sigmoid":
            self.activation = 2
        else:
            raise TypeError("activation must be relu or none.")

        self.weights = []
        self.biases = []
        for i in range(self.num_layers):
            w = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1], mlp_sizes[i]))
            self.weights.append(w)
            name = "weight_{}".format(i)
            setattr(self, name, w)
            if self.bias:
                b = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1]))
                self.biases.append(b)
                name = "bias_{}".format(i)
                setattr(self, name, b)

        self.reset_parameters()

    def reset_parameters(self):
        for weight in self.weights:
            dimsum = weight.size(0) + weight.size(1)
            std = math.sqrt(2.0 / float(dimsum))
            nn.init.normal_(weight, 0.0, std)
        if self.bias:
            for bias in self.biases:
                std = math.sqrt(1.0 / float(bias.size(0)))
                nn.init.normal_(bias, 0.0, std)

    def forward(self, input):
        return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)

    def extra_repr(self):
        s = f"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
        return s


================================================
FILE: apex/multi_tensor_apply/__init__.py
================================================
from .multi_tensor_apply import MultiTensorApply

multi_tensor_applier = MultiTensorApply(2048 * 32)


================================================
FILE: apex/multi_tensor_apply/multi_tensor_apply.py
================================================
class MultiTensorApply(object):
    available = False
    warned = False

    def __init__(self, chunk_size):
        try:
            import amp_C

            MultiTensorApply.available = True
            self.chunk_size = chunk_size
        except ImportError as err:
            MultiTensorApply.available = False
            MultiTensorApply.import_err = err

    def check_avail(self):
        if MultiTensorApply.available == False:
            raise RuntimeError(
                "Attempted to call MultiTensorApply method, but MultiTensorApply "
                "is not available, possibly because Apex was installed without "
                "--cpp_ext --cuda_ext.  Original import error message:",
                MultiTensorApply.import_err,
            )

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        self.check_avail()

        return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)


================================================
FILE: apex/normalization/__init__.py
================================================
from .fused_layer_norm import (
    FusedLayerNorm,
    MixedFusedLayerNorm,
    FusedRMSNorm,
    MixedFusedRMSNorm,
)


================================================
FILE: apex/normalization/fused_layer_norm.py
================================================
import importlib
import numbers

import torch
from torch.nn.parameter import Parameter
from torch.nn import init
from torch.nn import functional as F
from typing import List, Tuple

from apex._autocast_utils import _cast_if_autocast_enabled

global fused_layer_norm_cuda
fused_layer_norm_cuda = None


# PyTorch supports `torch.library.custom_op` since 2.4.0.
def supports_custom_op() -> bool:
    return hasattr(torch.library, "custom_op")


# Reference implementation from Huggingface
def manual_rms_norm(input, normalized_shape, weight, eps):
    # layer norm should always be calculated in float32
    dims = tuple(i for i in range(-1, -len(normalized_shape) - 1, -1))
    variance = input.to(torch.float32).pow(2).mean(dims, keepdim=True)
    input = input * torch.rsqrt(variance + eps)

    if weight is None:
        return input

    # convert into half-precision if necessary
    if weight.dtype in [torch.float16, torch.bfloat16]:
        input = input.to(weight.dtype)

    return weight * input


class FusedLayerNormAffineFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight, bias, normalized_shape, eps, memory_efficient=False):
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        bias_ = bias.contiguous()
        output, mean, invvar = fused_layer_norm_cuda.forward_affine(
            input_, ctx.normalized_shape, weight_, bias_, ctx.eps
        )
        if ctx.memory_efficient:
            ctx.save_for_backward(output, weight_, bias_, None, invvar)
        else:
            ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input_or_output, weight_, bias_, mean, invvar = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None
        grad_input, grad_weight, grad_bias = fused_layer_norm_cuda.backward_affine(
            grad_output.contiguous(),
            mean,
            invvar,
            input_or_output,
            ctx.normalized_shape,
            weight_,
            bias_,
            ctx.eps,
            ctx.memory_efficient,
        )
        return grad_input, grad_weight, grad_bias, None, None, None


if supports_custom_op():

    @torch.library.custom_op("apex::fused_layer_norm_affine_fwd", mutates_args=())
    def fused_layer_norm_affine_fwd(
        input: torch.Tensor,
        weight: torch.Tensor,
        bias: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")

        input_ = input.contiguous()
        weight_ = weight.contiguous()
        bias_ = bias.contiguous()
        output, mean, invvar = fused_layer_norm_cuda.forward_affine(
            input_, normalized_shape, weight_, bias_, eps
        )
        return output, mean, invvar

    @fused_layer_norm_affine_fwd.register_fake
    def fused_layer_norm_affine_fwd_fake(
        input: torch.Tensor,
        weight: torch.Tensor,
        bias: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        input = input.contiguous()
        weight = weight.contiguous()
        bias = bias.contiguous()
        idiff = input.ndim - len(normalized_shape)
        n = 1
        for i in range(idiff):
            n *= input.shape[i]
        if input.dtype in [torch.float16, torch.bfloat16]:
            dtype = torch.float32
        else:
            dtype = input.dtype
        mean = torch.empty([n], dtype=dtype, device=input.device)
        invvar = torch.empty_like(mean)
        return torch.empty_like(input), mean, invvar

    @torch.library.custom_op("apex::fused_layer_norm_affine_bwd", mutates_args=())
    def fused_layer_norm_affine_bwd(
        grad_output: torch.Tensor,
        mean: torch.Tensor,
        invvar: torch.Tensor,
        input_or_output: torch.Tensor,
        normalized_shape: List[int],
        weight: torch.Tensor,
        bias: torch.Tensor,
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        grad_input, grad_weight, grad_bias = fused_layer_norm_cuda.backward_affine(
            grad_output.contiguous(),
            mean,
            invvar,
            input_or_output,
            normalized_shape,
            weight,
            bias,
            eps,
            memory_efficient,
        )
        return grad_input, grad_weight, grad_bias

    @fused_layer_norm_affine_bwd.register_fake
    def fused_layer_norm_affine_bwd_fake(
        grad_output: torch.Tensor,
        mean: torch.Tensor,
        invvar: torch.Tensor,
        input_or_output: torch.Tensor,
        normalized_shape: List[int],
        weight: torch.Tensor,
        bias: torch.Tensor,
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        grad_input = torch.empty_like(input_or_output)
        grad_weight = torch.empty_like(weight)
        grad_bias = torch.empty_like(bias)
        return grad_input, grad_weight, grad_bias

    def _fused_layer_norm_affine_backward(ctx, grad_output, grad_mean, grad_invvar):
        input_or_output, weight_, bias_, mean, invvar = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None
        grad_input, grad_weight, grad_bias = fused_layer_norm_affine_bwd(
            grad_output,
            mean,
            invvar,
            input_or_output,
            ctx.normalized_shape,
            weight_,
            bias_,
            ctx.eps,
            ctx.memory_efficient,
        )
        return grad_input, grad_weight, grad_bias, None, None, None

    def _fused_layer_norm_affine_setup_context(ctx, inputs, output):
        input, weight, bias, normalized_shape, eps, memory_efficient = inputs
        output, mean, invvar = output
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        bias_ = bias.contiguous()
        if memory_efficient:
            ctx.save_for_backward(output, weight_, bias_, None, invvar)
        else:
            ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient

    fused_layer_norm_affine_fwd.register_autograd(
        _fused_layer_norm_affine_backward,
        setup_context=_fused_layer_norm_affine_setup_context,
    )


class FusedRMSNormAffineFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight, normalized_shape, eps, memory_efficient=False):
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        output, invvar = fused_layer_norm_cuda.rms_forward_affine(
            input_, ctx.normalized_shape, weight_, ctx.eps
        )
        if ctx.memory_efficient:
            ctx.save_for_backward(output, weight_, invvar)
        else:
            ctx.save_for_backward(input_, weight_, invvar)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input_or_output, weight_, invvar = ctx.saved_tensors
        grad_input = grad_weight = None
        grad_input, grad_weight = fused_layer_norm_cuda.rms_backward_affine(
            grad_output.contiguous(),
            invvar,
            input_or_output,
            ctx.normalized_shape,
            weight_,
            ctx.eps,
            ctx.memory_efficient,
        )
        return grad_input, grad_weight, None, None, None


if supports_custom_op():

    @torch.library.custom_op("apex::fused_rms_norm_affine_fwd", mutates_args=())
    def fused_rms_norm_affine_fwd(
        input: torch.Tensor,
        weight: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")

        input_ = input.contiguous()
        weight_ = weight.contiguous()
        output, invvar = fused_layer_norm_cuda.rms_forward_affine(
            input_, normalized_shape, weight_, eps
        )
        return output, invvar

    @fused_rms_norm_affine_fwd.register_fake
    def fused_rms_norm_affine_fwd_fake(
        input: torch.Tensor,
        weight: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        input = input.contiguous()
        weight = weight.contiguous()
        idiff = input.ndim - len(normalized_shape)
        n = 1
        for i in range(idiff):
            n *= input.shape[i]
        if input.dtype in [torch.float16, torch.bfloat16]:
            dtype = torch.float32
        else:
            dtype = input.dtype
        return (
            torch.empty_like(input),
            torch.empty(
                [n],
                dtype=dtype,
                device=input.device,
                requires_grad=input.requires_grad,
                memory_format=torch.contiguous_format,
            ),
        )

    @torch.library.custom_op("apex::fused_rms_norm_affine_bwd", mutates_args=())
    def fused_rms_norm_affine_bwd(
        grad_output: torch.Tensor,
        invvar: torch.Tensor,
        input_or_output: torch.Tensor,
        normalized_shape: List[int],
        weight: torch.Tensor,
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        grad_input, grad_weight = fused_layer_norm_cuda.rms_backward_affine(
            grad_output.contiguous(),
            invvar,
            input_or_output,
            normalized_shape,
            weight,
            eps,
            memory_efficient,
        )
        return grad_input, grad_weight

    @fused_rms_norm_affine_bwd.register_fake
    def fused_rms_norm_affine_bwd_fake(
        grad_output: torch.Tensor,
        invvar: torch.Tensor,
        input_or_output: torch.Tensor,
        normalized_shape: List[int],
        weight: torch.Tensor,
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        grad_input = torch.empty_like(input_or_output)
        grad_weight = torch.empty_like(weight)
        return grad_input, grad_weight

    def _fused_rms_norm_affine_backward(ctx, grad_output, grad_invvar):
        input_or_output, weight_, invvar = ctx.saved_tensors
        grad_input = grad_weight = None
        grad_input, grad_weight = fused_rms_norm_affine_bwd(
            grad_output,
            invvar,
            input_or_output,
            ctx.normalized_shape,
            weight_,
            ctx.eps,
            ctx.memory_efficient,
        )
        return grad_input, grad_weight, None, None, None

    def _fused_rms_norm_affine_setup_context(ctx, inputs, output):
        input_, weight_, normalized_shape, eps, memory_efficient = inputs
        output_, invvar = output
        input_ = input_.contiguous()
        weight_ = weight_.contiguous()
        if memory_efficient:
            ctx.save_for_backward(output_, weight_, invvar)
        else:
            ctx.save_for_backward(input_, weight_, invvar)
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient

    fused_rms_norm_affine_fwd.register_autograd(
        _fused_rms_norm_affine_backward,
        setup_context=_fused_rms_norm_affine_setup_context,
    )


class FusedLayerNormAffineMixedDtypesFunction(FusedLayerNormAffineFunction):
    @staticmethod
    def forward(ctx, input, weight, bias, normalized_shape, eps, memory_efficient=False):
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        bias_ = bias.contiguous()
        output, mean, invvar = fused_layer_norm_cuda.forward_affine_mixed_dtypes(
            input_, ctx.normalized_shape, weight_, bias_, ctx.eps
        )
        if ctx.memory_efficient:
            ctx.save_for_backward(output, weight_, bias_, None, invvar)
        else:
            ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
        return output


class FusedRMSNormAffineMixedDtypesFunction(FusedRMSNormAffineFunction):
    @staticmethod
    def forward(ctx, input, weight, normalized_shape, eps, memory_efficient=False):
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        output, invvar = fused_layer_norm_cuda.rms_forward_affine_mixed_dtypes(
            input_, ctx.normalized_shape, weight_, ctx.eps
        )
        if ctx.memory_efficient:
            ctx.save_for_backward(output, weight_, invvar)
        else:
            ctx.save_for_backward(input_, weight_, invvar)
        return output


class FusedLayerNormFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, normalized_shape, eps, memory_efficient=False):
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient
        input_ = input.contiguous()
        output, mean, invvar = fused_layer_norm_cuda.forward(input_, ctx.normalized_shape, ctx.eps)
        if ctx.memory_efficient:
            ctx.save_for_backward(output, None, invvar)
        else:
            ctx.save_for_backward(input_, mean, invvar)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input_or_output, mean, invvar = ctx.saved_tensors
        grad_input = fused_layer_norm_cuda.backward(
            grad_output.contiguous(),
            mean,
            invvar,
            input_or_output,
            ctx.normalized_shape,
            ctx.eps,
            ctx.memory_efficient,
        )
        return grad_input, None, None, None


if supports_custom_op():

    @torch.library.custom_op("apex::fused_layer_norm_fwd", mutates_args=())
    def fused_layer_norm_fwd(
        input: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")

        input_ = input.contiguous()
        output, mean, invvar = fused_layer_norm_cuda.forward(input_, normalized_shape, eps)
        return output, mean, invvar

    @fused_layer_norm_fwd.register_fake
    def fused_layer_norm_fwd_fake(
        input: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        input = input.contiguous()
        idiff = input.ndim - len(normalized_shape)
        n = 1
        for i in range(idiff):
            n *= input.shape[i]
        if input.dtype in [torch.float16, torch.bfloat16]:
            dtype = torch.float32
        else:
            dtype = input.dtype
        mean = torch.empty([n], dtype=dtype, device=input.device)
        invvar = torch.empty_like(mean)
        return torch.empty_like(input), mean, invvar

    @torch.library.custom_op("apex::fused_layer_norm_bwd", mutates_args=())
    def fused_layer_norm_bwd(
        grad_output: torch.Tensor,
        mean: torch.Tensor,
        invvar: torch.Tensor,
        input_or_output: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> torch.Tensor:
        grad_input = fused_layer_norm_cuda.backward(
            grad_output.contiguous(),
            mean,
            invvar,
            input_or_output,
            normalized_shape,
            eps,
            memory_efficient,
        )
        return grad_input

    @fused_layer_norm_bwd.register_fake
    def fused_layer_norm_bwd_fake(
        grad_output: torch.Tensor,
        mean: torch.Tensor,
        invvar: torch.Tensor,
        input_or_output: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> torch.Tensor:
        grad_input = torch.empty_like(input_or_output)
        return grad_input

    def _fused_layer_norm_backward(ctx, grad_output, grad_mean, grad_invvar):
        input_or_output, mean, invvar = ctx.saved_tensors
        grad_input = fused_layer_norm_bwd(
            grad_output,
            mean,
            invvar,
            input_or_output,
            ctx.normalized_shape,
            ctx.eps,
            ctx.memory_efficient,
        )
        return grad_input, None, None, None

    def _fused_layer_norm_setup_context(ctx, inputs, output):
        input, normalized_shape, eps, memory_efficient = inputs
        output, mean, invvar = output
        input_ = input.contiguous()
        if memory_efficient:
            ctx.save_for_backward(output, None, invvar)
        else:
            ctx.save_for_backward(input_, mean, invvar)
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient

    fused_layer_norm_fwd.register_autograd(
        _fused_layer_norm_backward,
        setup_context=_fused_layer_norm_setup_context,
    )


class FusedRMSNormFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, normalized_shape, eps, memory_efficient=False):
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient
        input_ = input.contiguous()
        output, invvar = fused_layer_norm_cuda.rms_forward(input_, ctx.normalized_shape, ctx.eps)
        if ctx.memory_efficient:
            ctx.save_for_backward(output, invvar)
        else:
            ctx.save_for_backward(input_, invvar)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input_or_output, invvar = ctx.saved_tensors
        grad_input = None
        grad_input = fused_layer_norm_cuda.rms_backward(
            grad_output.contiguous(),
            invvar,
            input_or_output,
            ctx.normalized_shape,
            ctx.eps,
            ctx.memory_efficient,
        )
        return grad_input, None, None, None


if supports_custom_op():

    @torch.library.custom_op("apex::fused_rms_norm_fwd", mutates_args=())
    def fused_rms_norm_fwd(
        input: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        global fused_layer_norm_cuda
        if fused_layer_norm_cuda is None:
            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")

        input_ = input.contiguous()
        output, invvar = fused_layer_norm_cuda.rms_forward(input_, normalized_shape, eps)
        return output, invvar

    @fused_rms_norm_fwd.register_fake
    def fused_rms_norm_fwd_fake(
        input: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        input = input.contiguous()
        idiff = input.ndim - len(normalized_shape)
        n = 1
        for i in range(idiff):
            n *= input.shape[i]
        if input.dtype in [torch.float16, torch.bfloat16]:
            dtype = torch.float32
        else:
            dtype = input.dtype
        return (
            torch.empty_like(input),
            torch.empty(
                [n],
                dtype=dtype,
                device=input.device,
                requires_grad=input.requires_grad,
                memory_format=torch.contiguous_format,
            ),
        )

    @torch.library.custom_op("apex::fused_rms_norm_bwd", mutates_args=())
    def fused_rms_norm_bwd(
        grad_output: torch.Tensor,
        invvar: torch.Tensor,
        input_or_output: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> torch.Tensor:
        grad_input = fused_layer_norm_cuda.rms_backward(
            grad_output.contiguous(),
            invvar,
            input_or_output,
            normalized_shape,
            eps,
            memory_efficient,
        )
        return grad_input

    @fused_rms_norm_bwd.register_fake
    def fused_rms_norm_bwd_fake(
        grad_output: torch.Tensor,
        invvar: torch.Tensor,
        input_or_output: torch.Tensor,
        normalized_shape: List[int],
        eps: float,
        memory_efficient: bool = False,
    ) -> torch.Tensor:
        grad_input = torch.empty_like(input_or_output)
        return grad_input

    def _fused_rms_norm_backward(ctx, grad_output, grad_invvar):
        input_or_output, invvar = ctx.saved_tensors
        grad_input = None
        grad_input = fused_rms_norm_bwd(
            grad_output,
            invvar,
            input_or_output,
            ctx.normalized_shape,
            ctx.eps,
            ctx.memory_efficient,
        )
        return grad_input, None, None, None

    def _fused_rms_norm_setup_context(ctx, inputs, output):
        input_, normalized_shape, eps, memory_efficient = inputs
        output_, invvar = output
        input_ = input_.contiguous()
        if memory_efficient:
            ctx.save_for_backward(output_, invvar)
        else:
            ctx.save_for_backward(input_, invvar)
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        ctx.memory_efficient = memory_efficient

    fused_rms_norm_fwd.register_autograd(
        _fused_rms_norm_backward, setup_context=_fused_rms_norm_setup_context
    )


def fused_layer_norm_affine(
    input, weight, bias, normalized_shape, eps=1e-6, memory_efficient=False
):
    args = _cast_if_autocast_enabled(input, weight, bias, normalized_shape, eps, memory_efficient)
    with torch.amp.autocast("cuda", enabled=False):
        if supports_custom_op():
            return fused_layer_norm_affine_fwd(*args)[0]
        else:
            return FusedLayerNormAffineFunction.apply(*args)


def fused_layer_norm(input, normalized_shape, eps=1e-6, memory_efficient=False):
    args = _cast_if_autocast_enabled(input, normalized_shape, eps, memory_efficient)
    with torch.amp.autocast("cuda", enabled=False):
        if supports_custom_op():
            return fused_layer_norm_fwd(*args)[0]
        else:
            return FusedLayerNormFunction.apply(*args)


def mixed_dtype_fused_layer_norm_affine(
    input, weight, bias, normalized_shape, eps=1e-6, memory_efficient=False
):
    args = _cast_if_autocast_enabled(input, weight, bias, normalized_shape, eps, memory_efficient)
    with torch.amp.autocast("cuda", enabled=False):
        return FusedLayerNormAffineMixedDtypesFunction.apply(*args)


def fused_rms_norm_affine(input, weight, normalized_shape, eps=1e-6, memory_efficient=False):
    args = _cast_if_autocast_enabled(input, weight, normalized_shape, eps, memory_efficient)
    with torch.amp.autocast("cuda", enabled=False):
        if supports_custom_op():
            return fused_rms_norm_affine_fwd(*args)[0]
        else:
            return FusedRMSNormAffineFunction.apply(*args)


def fused_rms_norm(input, normalized_shape, eps=1e-6, memory_efficient=False):
    args = _cast_if_autocast_enabled(input, normalized_shape, eps, memory_efficient)
    with torch.amp.autocast("cuda", enabled=False):
        if supports_custom_op():
            return fused_rms_norm_fwd(*args)[0]
        else:
            return FusedRMSNormFunction.apply(*args)


def mixed_dtype_fused_rms_norm_affine(
    input, weight, normalized_shape, eps=1e-6, memory_efficient=False
):
    args = _cast_if_autocast_enabled(input, weight, normalized_shape, eps, memory_efficient)
    with torch.amp.autocast("cuda", enabled=False):
        return FusedRMSNormAffineMixedDtypesFunction.apply(*args)


class FusedLayerNorm(torch.nn.Module):
    r"""Applies Layer Normalization over a mini-batch of inputs as described in
    the paper `Layer Normalization`_ .

    Currently only runs on cuda() tensors.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated separately over the last
    certain number dimensions which have to be of the shape specified by
    :attr:`normalized_shape`.
    :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of
    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.

    .. note::
        Unlike Batch Normalization and Instance Normalization, which applies
        scalar scale and bias for each entire channel/plane with the
        :attr:`affine` option, Layer Normalization applies per-element scale and
        bias with :attr:`elementwise_affine`.

    This layer uses statistics computed from input data in both training and
    evaluation modes.

    Args:
        normalized_shape (int or list or torch.Size): input shape from an expected input
            of size

            .. math::
                [* \times \text{normalized}\_\text{shape}[0] \times \text{normalized}\_\text{shape}[1]
                    \times \ldots \times \text{normalized}\_\text{shape}[-1]]

            If a single integer is used, it is treated as a singleton list, and this module will
            normalize over the last dimension which is expected to be of that specific size.
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        elementwise_affine: a boolean value that when set to ``True``, this module
            has learnable per-element affine parameters initialized to ones (for weights)
            and zeros (for biases). Default: ``True``.

    Shape:
        - Input: :math:`(N, *)`
        - Output: :math:`(N, *)` (same shape as input)

    Examples::

        >>> input = torch.randn(20, 5, 10, 10)
        >>> # With Learnable Parameters
        >>> m = apex.normalization.FusedLayerNorm(input.size()[1:])
        >>> # Without Learnable Parameters
        >>> m = apex.normalization.FusedLayerNorm(input.size()[1:], elementwise_affine=False)
        >>> # Normalize over last two dimensions
        >>> m = apex.normalization.FusedLayerNorm([10, 10])
        >>> # Normalize over last dimension of size 10
        >>> m = apex.normalization.FusedLayerNorm(10)
        >>> # Activating the module
        >>> output = m(input)

    .. _`Layer Normalization`: https://arxiv.org/abs/1607.06450
    """

    def __init__(
        self,
        normalized_shape,
        eps=1e-5,
        elementwise_affine=True,
        memory_efficient=False,
    ):
        super().__init__()

        global fused_layer_norm_cuda
        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")

        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = torch.Size(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        self.memory_efficient = memory_efficient
        if self.elementwise_affine:
            self.weight = Parameter(torch.empty(*normalized_shape))
            self.bias = Parameter(torch.empty(*normalized_shape))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
        if self.elementwise_affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def forward(self, input):
        if (
            torch.jit.is_tracing()
            or torch.jit.is_scripting()
            or torch.compiler.is_compiling()
            or not input.is_cuda
        ):
            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
        if self.elementwise_affine:
            return fused_layer_norm_affine(
                input,
                self.weight,
                self.bias,
                self.normalized_shape,
                self.eps,
                self.memory_efficient,
            )
        else:
            return fused_layer_norm(input, self.normalized_shape, self.eps, self.memory_efficient)

    def extra_repr(self):
        return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
            **self.__dict__
        )


class FusedRMSNorm(torch.nn.Module):
    r"""Applies RMS Normalization over a mini-batch of inputs

    Currently only runs on cuda() tensors.

    .. math::
        y = \frac{x}{\mathrm{RMS}[x]} * \gamma

    The root-mean-square is calculated separately over the last
    certain number dimensions which have to be of the shape specified by
    :attr:`normalized_shape`.
    :math:`\gamma` is a learnable affine transform parameter of
    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.
    `epsilon` is added to the mean-square, then the root of the sum is taken.

    .. note::
        Unlike Batch Normalization and Instance Normalization, which applies
        scalar scale and bias for each entire channel/plane with the
        :attr:`affine` option, RMS Normalization applies per-element scale
        with :attr:`elementwise_affine`.

    This layer uses statistics computed from input data in both training and
    evaluation modes.

    Args:
        normalized_shape (int or list or torch.Size): input shape from an expected input
            of size

            .. math::
                [* \times \text{normalized}\_\text{shape}[0] \times \text{normalized}\_\text{shape}[1]
                    \times \ldots \times \text{normalized}\_\text{shape}[-1]]

            If a single integer is used, it is treated as a singleton list, and this module will
            normalize over the last dimension which is expected to be of that specific size.
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        elementwise_affine: a boolean value that when set to ``True``, this module
            has learnable per-element affine parameters initialized to ones (for weights)
            and zeros (for biases). Default: ``True``.

    Shape:
        - Input: :math:`(N, *)`
        - Output: :math:`(N, *)` (same shape as input)

    Examples::

        >>> input = torch.randn(20, 5, 10, 10)
        >>> # With Learnable Parameters
        >>> m = apex.normalization.FusedRMSNorm(input.size()[1:])
        >>> # Without Learnable Parameters
        >>> m = apex.normalization.FusedRMSNorm(input.size()[1:], elementwise_affine=False)
        >>> # Normalize over last two dimensions
        >>> m = apex.normalization.FusedRMSNorm([10, 10])
        >>> # Normalize over last dimension of size 10
        >>> m = apex.normalization.FusedRMSNorm(10)
        >>> # Activating the module
        >>> output = m(input)

    .. _`Root Mean Square Layer Normalization`: https://arxiv.org/pdf/1910.07467.pdf
    """

    def __init__(
        self,
        normalized_shape,
        eps=1e-5,
        elementwise_affine=True,
        memory_efficient=False,
    ):
        super().__init__()

        global fused_layer_norm_cuda
        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")

        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = torch.Size(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        self.memory_efficient = memory_efficient
        if self.elementwise_affine:
            self.weight = Parameter(torch.empty(*normalized_shape))
        else:
            self.register_parameter("weight", None)
        self.reset_parameters()

    def reset_parameters(self):
        if self.elementwise_affine:
            init.ones_(self.weight)

    def forward(self, input):
        if (
            torch.jit.is_tracing()
            or torch.jit.is_scripting()
            or torch.compiler.is_compiling()
            or not input.is_cuda
        ):
            return manual_rms_norm(input, self.normalized_shape, self.weight, self.eps)

        if self.elementwise_affine:
            return fused_rms_norm_affine(
                input,
                self.weight,
                self.normalized_shape,
                self.eps,
                self.memory_efficient,
            )
        else:
            return fused_rms_norm(input, self.normalized_shape, self.eps, self.memory_efficient)

    def extra_repr(self):
        return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
            **self.__dict__
        )


# NOTE (mkozuki): Why "mixed"?
# MixedFusedLayerNorm differs from FusedLayerNorm in that this layer norm uses parameter's dtype
# as output tensor's dtype while FusedLayerNorm uses input tensor's dtype for output tensor's dtype.
# See: `layer_norm_affine` and `layer_norm_affine_mixed_dtypes` in "csrc/layer_norm_cuda.cpp"
class MixedFusedLayerNorm(FusedLayerNorm):
    def __init__(self, normalized_shape, eps=1e-5, *, memory_efficient=False, **kwargs):
        if "elementwise_affine" in kwargs:
            import warnings

            warnings.warn("MixedFusedLayerNorm does not support `elementwise_affine` argument")
            elementwise_affine = kwargs.pop("elementwise_affine")
            if not elementwise_affine:
                raise RuntimeError(
                    "MixedFusedLayerNorm does not support `elementwise_affine = False`"
                )

        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            elementwise_affine=True,
            memory_efficient=memory_efficient,
        )

    def forward(self, input: torch.Tensor):
        # NOTE (mkozuki): CPU path is here mainly for unittest sake.
        if (
            torch.jit.is_tracing()
            or torch.jit.is_scripting()
            or torch.compiler.is_compiling()
            or not input.is_cuda
        ):
            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
        return mixed_dtype_fused_layer_norm_affine(
            input,
            self.weight,
            self.bias,
            self.normalized_shape,
            self.eps,
            self.memory_efficient,
        )


# MixedFusedLayerNorm differs from FusedLayerNorm in that this layer norm uses parameter's dtype
# as output tensor's dtype while FusedLayerNorm uses input tensor's dtype for output tensor's dtype.
# See: `layer_norm_affine` and `layer_norm_affine_mixed_dtypes` in "csrc/layer_norm_cuda.cpp"
class MixedFusedRMSNorm(FusedRMSNorm):
    def __init__(self, normalized_shape, eps=1e-5, *, memory_efficient=False, **kwargs):
        if "elementwise_affine" in kwargs:
            import warnings

            warnings.warn("MixedFusedRMSNorm does not support `elementwise_affine` argument")
            elementwise_affine = kwargs.pop("elementwise_affine")
            if not elementwise_affine:
                raise RuntimeError(
                    "MixedFusedRMSNorm does not support `elementwise_affine = False`"
                )

        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            elementwise_affine=True,
            memory_efficient=memory_efficient,
        )

    def forward(self, input: torch.Tensor):
        # NOTE (mkozuki): CPU path is here mainly for unittest sake.
        # TODO Manual RMS Norm Implementation Here
        if (
            torch.jit.is_tracing()
            or torch.jit.is_scripting()
            or torch.compiler.is_compiling()
            or not input.is_cuda
        ):
            return manual_rms_norm(input, self.normalized_shape, self.weight, self.eps)
        return mixed_dtype_fused_rms_norm_affine(
            input, self.weight, self.normalized_shape, self.eps, self.memory_efficient
        )


================================================
FILE: apex/optimizers/__init__.py
================================================
from .fused_sgd import FusedSGD
from .fused_adam import FusedAdam
from .fused_novograd import FusedNovoGrad
from .fused_lamb import FusedLAMB
from .fused_adagrad import FusedAdagrad
from .fused_mixed_precision_lamb import FusedMixedPrecisionLamb


================================================
FILE: apex/optimizers/fused_adagrad.py
================================================
import torch
from apex.multi_tensor_apply import multi_tensor_applier


class FusedAdagrad(torch.optim.Optimizer):
    """Implements Adagrad algorithm.

    Currently GPU-only.  Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

    This version of fused Adagrad implements 2 fusions.
      * Fusion of the Adagrad update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.optimizers.FusedAdagrad`'s usage is identical to any ordinary Pytorch optimizer::
        opt = apex.optimizers.FusedAdagrad(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedAdagrad` may be used with or without Amp.  If you wish to use :class:`FusedAdagrad` with Amp,
    you may choose any ``opt_level``::
        opt = apex.optimizers.FusedAdagrad(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()
    In general, ``opt_level="O1"`` is recommended.

    It has been proposed in `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
        adagrad_w_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay (also known as AdamW) (default: False)

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
    """

    def __init__(
        self,
        params,
        lr=1e-2,
        eps=1e-10,
        weight_decay=0.0,
        set_grad_none=True,
        adagrad_w_mode=False,
    ):
        defaults = dict(lr=lr, eps=eps, weight_decay=weight_decay)
        super(FusedAdagrad, self).__init__(params, defaults)
        self.adagrad_w_mode = 1 if adagrad_w_mode else 0
        self.set_grad_none = set_grad_none

        if multi_tensor_applier.available:
            import amp_C

            # Skip buffer
            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
            self.multi_tensor_adagrad = amp_C.multi_tensor_adagrad
        else:
            raise RuntimeError("apex.optimizers.FusedAdagrad requires cuda extensions")

    def zero_grad(self):
        if self.set_grad_none:
            for group in self.param_groups:
                for p in group["params"]:
                    p.grad = None
        else:
            super(FusedAdagrad, self).zero_grad()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            # create lists for multi-tensor apply
            g_16, p_16, h_16 = [], [], []
            g_32, p_32, h_32 = [], [], []

            for p in group["params"]:
                if p.grad is None:
                    continue
                if p.grad.data.is_sparse:
                    raise RuntimeError("FusedAdagrad does not support sparse gradients")

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state["sum"] = torch.zeros_like(p.data)
                if p.dtype == torch.float16:
                    g_16.append(p.grad.data)
                    p_16.append(p.data)
                    h_16.append(state["sum"])
                elif p.dtype == torch.float32:
                    g_32.append(p.grad.data)
                    p_32.append(p.data)
                    h_32.append(state["sum"])
                else:
                    raise RuntimeError("FusedAdagrad only support fp16 and fp32.")

            if len(g_16) > 0:
                multi_tensor_applier(
                    self.multi_tensor_adagrad,
                    self._dummy_overflow_buf,
                    [g_16, p_16, h_16],
                    group["lr"],
                    group["eps"],
                    self.adagrad_w_mode,
                    group["weight_decay"],
                )
            if len(g_32) > 0:
                multi_tensor_applier(
                    self.multi_tensor_adagrad,
                    self._dummy_overflow_buf,
                    [g_32, p_32, h_32],
                    group["lr"],
                    group["eps"],
                    self.adagrad_w_mode,
                    group["weight_decay"],
                )

        return loss


================================================
FILE: apex/optimizers/fused_adam.py
================================================
import torch
from apex.multi_tensor_apply import multi_tensor_applier


class FusedAdam(torch.optim.Optimizer):
    """Implements Adam algorithm.

    Currently GPU-only.  Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

    This version of fused Adam implements 2 fusions.

      * Fusion of the Adam update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.optimizers.FusedAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
    or ``torch.optim.Adam`` with ``adam_w_mode=False``::

        opt = apex.optimizers.FusedAdam(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedAdam` may be used with or without Amp.  If you wish to use :class:`FusedAdam` with Amp,
    you may choose any ``opt_level``::

        opt = apex.optimizers.FusedAdam(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()

    In general, ``opt_level="O1"`` is recommended.


    .. warning::
        A previous version of :class:`FusedAdam` allowed a number of additional arguments to ``step``.  These additional arguments
        are now deprecated and unnecessary.

    Adam was been proposed in `Adam: A Method for Stochastic Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False) NOT SUPPORTED in FusedAdam!
        adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay(also known as AdamW) (default: True)
        set_grad_none (bool, optional): whether set grad to None when zero_grad()
            method is called. (default: True)
        capturable (bool, optional): whether to use the version of the optimizer
            that can be used with CUDA Graphs. (default: False)
        master_weights (bool, optional): whether to maintain FP32 master weights
           in the optimizer with FP16 mixed precision training, currently can
           only be used with capturable set to True. (default: False)

    .. _Adam - A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        adam_w_mode=True,
        weight_decay=0.0,
        amsgrad=False,
        set_grad_none=True,
        capturable=False,
        master_weights=False,
    ):
        if amsgrad:
            raise RuntimeError("FusedAdam does not support the AMSGrad variant.")
        if master_weights and not capturable:
            raise RuntimeError(
                "Master weights is currently only supported with the capturable version."
            )
        # If the optimizer is capturable then LR should be a tensor (on GPU)
        lr = torch.tensor(lr, dtype=torch.float32) if capturable else lr
        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
        )
        super(FusedAdam, self).__init__(params, defaults)
        self.adam_w_mode = 1 if adam_w_mode else 0
        self.set_grad_none = set_grad_none

        self.capturable = capturable
        self.master_weights = master_weights

        # Create full precision master weights
        self.param_groups_master = []
        for i, pg in enumerate(self.param_groups):
            param_list = pg["params"]
            self.param_groups_master.append(
                {
                    "params": [
                        p.clone().detach().float() if self.master_weights else None
                        for p in param_list
                    ],
                }
            )

        if capturable:
            for idx, group in enumerate(self.param_groups):
                if len(group["params"]) == 0:
                    continue
                device = group["params"][0].device
                for item in ["lr"]:
                    self.param_groups[idx][item] = group[item].to(device=device)

            self._step_supports_amp_scaling = True

        if multi_tensor_applier.available:
            import amp_C

            # Skip buffer
            self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda")
            self.multi_tensor_adam = amp_C.multi_tensor_adam
            self.multi_tensor_adam_capturable = amp_C.multi_tensor_adam_capturable
            self.multi_tensor_adam_capturable_master = amp_C.multi_tensor_adam_capturable_master
        else:
            raise RuntimeError("apex.optimizers.FusedAdam requires cuda extensions")

    def zero_grad(self):
        if self.set_grad_none:
            for group in self.param_groups:
                for p in group["params"]:
                    p.grad = None
        else:
            super(FusedAdam, self).zero_grad()

    def step(
        self,
        closure=None,
        grads=None,
        output_params=None,
        scale=None,
        grad_norms=None,
        grad_scaler=None,
    ):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        The remaining arguments are deprecated, and are only retained (for the moment) for error-checking purposes.
        """
        if any(p is not None for p in [grads, output_params, scale, grad_norms]):
            raise RuntimeError(
                "FusedAdam has been updated.  Simply initialize it identically to torch.optim.Adam, and call step() with no arguments."
            )
        loss = None
        if closure is not None:
            loss = closure()

        for group, group_master in zip(self.param_groups, self.param_groups_master):
            if len(group["params"]) == 0:
                continue
            device = group["params"][0].device
            bias_correction = 1 if group["bias_correction"] else 0
            beta1, beta2 = group["betas"]

            # assume same step across group now to simplify things
            # per parameter step can be easily support by making it tensor, or pass list into kernel
            if "step" in group:
                group["step"] += (
                    1 if not self.capturable else (self._dummy_overflow_buf != 1).to(torch.int)
                )
            else:
                group["step"] = (
                    1 if not self.capturable else torch.tensor([1], dtype=torch.int, device=device)
                )

            # create lists for multi-tensor apply
            g_16, p_16, m_16, v_16 = [], [], [], []
            g_bf, p_bf, m_bf, v_bf = [], [], [], []
            g_32, p_32, m_32, v_32 = [], [], [], []
            p_16_master = []
            p_32_master = []

            for p, p_master in zip(group["params"], group_master["params"]):
                if p.grad is None:
                    continue
                if p.grad.data.is_sparse:
                    raise RuntimeError(
                        "FusedAdam does not support sparse gradients, please consider SparseAdam instead"
                    )

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data).float()
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p.data).float()

                if p.dtype == torch.float16:
                    if self.master_weights:
                        p_16_master.append(p_master.data)
                    g_16.append(p.grad.data)
                    p_16.append(p.data)
                    m_16.append(state["exp_avg"])
                    v_16.append(state["exp_avg_sq"])
                elif p.dtype == torch.bfloat16:
                    g_bf.append(p.grad)
                    p_bf.append(p)
                    m_bf.append(state["exp_avg"])
                    v_bf.append(state["exp_avg_sq"])
                elif p.dtype == torch.float32:
                    if self.master_weights:
                        p_32_master.append(p_master.data)
                    g_32.append(p.grad.data)
                    p_32.append(p.data)
                    m_32.append(state["exp_avg"])
                    v_32.append(state["exp_avg_sq"])
                else:
                    raise RuntimeError("FusedAdam only support fp16 and fp32.")

            # If the optimizer is capturable, then if there's a grad scaler it works
            # on the GPU + a different multi_tensor_applier should be called
            if self.capturable:
                # overflow check of gradients
                found_inf = (
                    grad_scaler._check_inf_per_device(self)[device]
                    if grad_scaler is not None
                    else torch.zeros((1,), device=device)
                )
                self._dummy_overflow_buf.copy_(found_inf)

                # get unscale scale factor
                scale, inv_scale = None, None
                if grad_scaler:
                    scale = grad_scaler._get_scale_async()
                    inv_scale = scale.double().reciprocal().float()
                else:
                    scale = torch.ones((1,), device=device)
                    inv_scale = torch.ones((1,), device=device)

                if len(g_16) > 0:
                    multi_tensor_applier(
                        self.multi_tensor_adam_capturable_master
                        if self.master_weights
                        else self.multi_tensor_adam_capturable,
                        self._dummy_overflow_buf,
                        [g_16, p_16, m_16, v_16, p_16_master]
                        if self.master_weights
                        else [g_16, p_16, m_16, v_16],
                        group["lr"],
                        beta1,
                        beta2,
                        group["eps"],
                        group["step"],
                        self.adam_w_mode,
                        bias_correction,
                        group["weight_decay"],
                        inv_scale,
                    )

                if len(g_bf) > 0:
                    multi_tensor_applier(
                        self.multi_tensor_adam_capturable,
                        self._dummy_overflow_buf,
                        [g_bf, p_bf, m_bf, v_bf],
                        group["lr"],
                        beta1,
                        beta2,
                        group["eps"],
                        group["step"],
                        self.adam_w_mode,
                        bias_correction,
                        group["weight_decay"],
                        inv_scale,
                    )

                if len(g_32) > 0:
                    multi_tensor_applier(
                        self.multi_tensor_adam_capturable_master
                        if self.master_weights
                        else self.multi_tensor_adam_capturable,
                        self._dummy_overflow_buf,
                        [g_32, p_32, m_32, v_32, p_32_master]
                        if self.master_weights
                        else [g_32, p_32, m_32, v_32],
                        group["lr"],
                        beta1,
                        beta2,
                        group["eps"],
                        group["step"],
                        self.adam_w_mode,
                        bias_correction,
                        group["weight_decay"],
                        inv_scale,
                    )
            else:
                if len(g_16) > 0:
                    multi_tensor_applier(
                        self.multi_tensor_adam,
                        self._dummy_overflow_buf,
                        [g_16, p_16, m_16, v_16],
                        group["lr"],
                        beta1,
                        beta2,
                        group["eps"],
                        group["step"],
                        self.adam_w_mode,
                        bias_correction,
                        group["weight_decay"],
                    )

                if len(g_bf) > 0:
                    multi_tensor_applier(
                        self.multi_tensor_adam,
                        self._dummy_overflow_buf,
                        [g_bf, p_bf, m_bf, v_bf],
                        group["lr"],
                        beta1,
                        beta2,
                        group["eps"],
                        group["step"],
                        self.adam_w_mode,
                        bias_correction,
                        group["weight_decay"],
                    )

                if len(g_32) > 0:
                    multi_tensor_applier(
                        self.multi_tensor_adam,
                        self._dummy_overflow_buf,
                        [g_32, p_32, m_32, v_32],
                        group["lr"],
                        beta1,
                        beta2,
                        group["eps"],
                        group["step"],
                        self.adam_w_mode,
                        bias_correction,
                        group["weight_decay"],
                    )

        return loss


================================================
FILE: apex/optimizers/fused_lamb.py
================================================
import torch
from apex.multi_tensor_apply import multi_tensor_applier


class FusedLAMB(torch.optim.Optimizer):
    """Implements LAMB algorithm.

    Currently GPU-only.  Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

    This version of fused LAMB implements 2 fusions.

      * Fusion of the LAMB update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.optimizers.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer::

        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedLAMB` may be used with or without Amp.  If you wish to use :class:`FusedLAMB` with Amp,
    you may choose any ``opt_level``::

        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()

    In general, ``opt_level="O1"`` is recommended.

    LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            NOT SUPPORTED now! (default: False)
        adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay(also known as AdamW) (default: True)
        grad_averaging (bool, optional): whether apply (1-beta2) to grad when
            calculating running averages of gradient. (default: True)
        set_grad_none (bool, optional): whether set grad to None when zero_grad()
            method is called. (default: True)
        max_grad_norm (float, optional): value used to clip global grad norm
            (default: 1.0)
        use_nvlamb (boolean, optional): Apply adaptive learning rate to 0.0
            weight decay parameter (default: False)

    .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-6,
        weight_decay=0.01,
        amsgrad=False,
        adam_w_mode=True,
        grad_averaging=True,
        set_grad_none=True,
        max_grad_norm=1.0,
        use_nvlamb=False,
    ):
        if amsgrad:
            raise RuntimeError("FusedLAMB does not support the AMSGrad variant.")
        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            grad_averaging=grad_averaging,
            max_grad_norm=max_grad_norm,
        )
        super(FusedLAMB, self).__init__(params, defaults)
        if multi_tensor_applier.available:
            import amp_C

            self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
            # Skip buffer
            self._dummy_overflow_buf = torch.tensor(
                [0], dtype=torch.int, device=self.param_groups[0]["params"][0].device
            )
            self.multi_tensor_lamb = amp_C.multi_tensor_lamb
        else:
            raise RuntimeError("apex.optimizers.FusedLAMB requires cuda extensions")

        self.adam_w_mode = 1 if adam_w_mode else 0
        self.set_grad_none = set_grad_none
        self.use_nvlamb = use_nvlamb

    def zero_grad(self):
        if self.set_grad_none:
            for group in self.param_groups:
                for p in group["params"]:
                    p.grad = None
        else:
            super(FusedLAMB, self).zero_grad()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # create separate grad lists for fp32 and fp16 params
        g_all_32, g_all_16 = [], []
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                if p.dtype == torch.float32:
                    g_all_32.append(p.grad.data)
                elif p.dtype == torch.float16:
                    g_all_16.append(p.grad.data)
                else:
                    raise RuntimeError("FusedLAMB only support fp16 and fp32.")

        device = self.param_groups[0]["params"][0].device
        g_norm_32, g_norm_16 = (
            torch.zeros(1, device=device),
            torch.zeros(1, device=device),
        )
        # compute grad norm for two lists
        if len(g_all_32) > 0:
            g_norm_32 = multi_tensor_applier(
                self.multi_tensor_l2norm, self._dummy_overflow_buf, [g_all_32], False
            )[0]
        if len(g_all_16) > 0:
            g_norm_16 = multi_tensor_applier(
                self.multi_tensor_l2norm, self._dummy_overflow_buf, [g_all_16], False
            )[0]

        # blend two grad norms to get global grad norm
        global_grad_norm = multi_tensor_applier(
            self.multi_tensor_l2norm,
            self._dummy_overflow_buf,
            [[g_norm_32, g_norm_16]],
            False,
        )[0]
        max_grad_norm = self.defaults["max_grad_norm"]

        for group in self.param_groups:
            bias_correction = 1 if group["bias_correction"] else 0
            beta1, beta2 = group["betas"]
            grad_averaging = 1 if group["grad_averaging"] else 0

            # assume same step across group now to simplify things
            # per parameter step can be easily support by making it tensor, or pass list into kernel
            if "step" in group:
                group["step"] += 1
            else:
                group["step"] = 1

            # create lists for multi-tensor apply
            g_16, p_16, m_16, v_16 = [], [], [], []
            g_32, p_32, m_32, v_32 = [], [], [], []

            for p in group["params"]:
                if p.grad is None:
                    continue
                if p.grad.data.is_sparse:
                    raise RuntimeError(
                        "FusedLAMB does not support sparse gradients, please consider SparseAdam instead"
                    )

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data)
                    # Exponential moving average of gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p.data)

                if p.dtype == torch.float16:
                    g_16.append(p.grad.data)
                    p_16.append(p.data)
                    m_16.append(state["exp_avg"])
                    v_16.append(state["exp_avg_sq"])
                elif p.dtype == torch.float32:
                    g_32.append(p.grad.data)
                    p_32.append(p.data)
                    m_32.append(state["exp_avg"])
                    v_32.append(state["exp_avg_sq"])
                else:
                    raise RuntimeError("FusedLAMB only support fp16 and fp32.")

            if len(g_16) > 0:
                multi_tensor_applier(
                    self.multi_tensor_lamb,
                    self._dummy_overflow_buf,
                    [g_16, p_16, m_16, v_16],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["step"],
                    bias_correction,
                    group["weight_decay"],
                    grad_averaging,
                    self.adam_w_mode,
                    global_grad_norm,
                    max_grad_norm,
                    self.use_nvlamb,
                )
            if len(g_32) > 0:
                multi_tensor_applier(
                    self.multi_tensor_lamb,
                    self._dummy_overflow_buf,
                    [g_32, p_32, m_32, v_32],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["step"],
                    bias_correction,
                    group["weight_decay"],
                    grad_averaging,
                    self.adam_w_mode,
                    global_grad_norm,
                    max_grad_norm,
                    self.use_nvlamb,
                )

        return loss


================================================
FILE: apex/optimizers/fused_mixed_precision_lamb.py
================================================
import torch
from copy import deepcopy
from itertools import chain
from collections import defaultdict, abc as container_abcs

from apex.multi_tensor_apply import multi_tensor_applier


class FusedMixedPrecisionLamb(torch.optim.Optimizer):
    def __init__(
        self,
        params,
        lr=1e-3,
        step=0,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-6,
        weight_decay=0.01,
        amsgrad=False,
        adam_w_mode=True,
        grad_averaging=True,
        max_grad_norm=1.0,
        use_nvlamb=False,
        reduced_precision_dtype=None,
    ):
        if amsgrad:
            raise RuntimeError("FusedLAMB does not support the AMSGrad variant.")

        # init defaults
        defaults = dict(
            lr=torch.tensor(lr, dtype=torch.float32),
            step=torch.tensor([step], dtype=torch.int),
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            grad_averaging=grad_averaging,
            max_grad_norm=max_grad_norm,
        )

        # init base module
        super(FusedMixedPrecisionLamb, self).__init__(params, defaults)

        # The learning rate (lr) and optimizer step (step) should be located on device
        # in order to faciliated device sync free execution
        device = self.param_groups[0]["params"][0].device
        tensor_state = ["lr", "step"]
        for idx, group in enumerate(self.param_groups):
            for item in tensor_state:
                self.param_groups[idx][item] = group[item].to(device=device)

        if multi_tensor_applier.available:
            import amp_C

            self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm_mp
            # Skip buffer
            self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=device)
            self.multi_tensor_lamb = amp_C.multi_tensor_lamb_mp
        else:
            raise RuntimeError("apex.optimizers.FusedLAMB requires cuda extensions")

        # Mixed Precision support
        self.reduced_precision_dtype = reduced_precision_dtype
        self.param_groups_full_precision = []

        self._step_supports_amp_scaling = True
        self.adam_w_mode = 1 if adam_w_mode else 0
        self.use_nvlamb = use_nvlamb

    # This method is overridden from the parent class because there is not a way to override
    # the nested function cast() that copies a saved piece of state to the device without
    # redundantly doing the copy.
    def load_state_dict(self, state_dict):
        r"""Loads the optimizer state.

        Args:
            state_dict (dict): optimizer state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        # deepcopy, to be consistent with module API
        state_dict = deepcopy(state_dict)
        # Validate the state_dict
        groups = self.param_groups
        saved_groups = state_dict["param_groups"]

        if len(groups) != len(saved_groups):
            raise ValueError("loaded state dict has a different number of parameter groups")
        param_lens = (len(g["params"]) for g in groups)
        saved_lens = (len(g["params"]) for g in saved_groups)
        if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)):
            raise ValueError(
                "loaded state dict contains a parameter group "
                "that doesn't match the size of optimizer's group"
            )

        # Update the state
        id_map = {
            old_id: p
            for old_id, p in zip(
                chain.from_iterable((g["params"] for g in saved_groups)),
                chain.from_iterable((g["params"] for g in groups)),
            )
        }

        def cast(param, value):
            r"""Make a deep copy of value, casting all tensors to device of param."""
            if isinstance(value, torch.Tensor):
                # The original version casted the saved value to the params dtype
                # This doesn't work for mixed precision Lamb where the momentum and
                # velocity are expected to be in full precision while the params are
                # in reduced precision
                value = value.to(value.device)
                return value
            elif isinstance(value, dict):
                return {k: cast(param, v) for k, v in value.items()}
            elif isinstance(value, container_abcs.Iterable):
                return type(value)(cast(param, v) for v in value)
            else:
                return value

        # Copy state assigned to params (and cast tensors to appropriate types).
        # State that is not assigned to params is copied as is (needed for
        # backward compatibility).
        state = defaultdict(dict)
        for k, v in state_dict["state"].items():
            if k in id_map:
                param = id_map[k]
                state[param] = cast(param, v)
            else:
                state[k] = v

        # Update parameter groups, setting their 'params' value
        def update_group(group, new_group):
            new_group["params"] = group["params"]
            return new_group

        param_groups = [update_group(g, ng) for g, ng in zip(groups, saved_groups)]
        self.__setstate__({"state": state, "param_groups": param_groups})

    def _setup_full_precision_params(self):
        for i, pg in enumerate(self.param_groups):
            param_list = pg["params"]
            self.param_groups_full_precision.append(
                {
                    "params": [
                        p.clone().detach().to(dtype=torch.float32)
                        if (self.reduced_precision_dtype is not None)
                        and (p.dtype == self.reduced_precision_dtype)
                        else None
                        for p in param_list
                    ],
                }
            )

    # add_param_groups() is overridden because default items can be tensors. The
    # parent version does not clone the default item, so two param groups can
    # accidentally point to the same default item value where they can differ
    # given they are in separate groups.
    def add_param_group(self, param_group):
        super().add_param_group(param_group)
        for name, default in self.defaults.items():
            if isinstance(default, torch.Tensor):
                self.param_groups[len(self.param_groups) - 1][name] = default.clone()

    @torch.no_grad()
    def step(self, closure=None, grad_scaler=None):
        loss = None
        if closure is not None:
            loss = closure()

        # The full precision params are set up in the first step of the optimizer
        # instead of in the constructor because the full precision params will get out
        # out of sync with the model params if DDP syncs the model params across devices
        # after the optimizer is constructed.
        if len(self.param_groups_full_precision) == 0:
            self._setup_full_precision_params()

        # create separate grad lists for params
        grad_list = []
        for gid, group in enumerate(self.param_groups):
            for pid, p in enumerate(group["params"]):
                assert group["params"][0].dtype == p.dtype, (
                    "Error: Parameters are not of the identical type: {} != {}".format(
                        group["params"][0].dtype, p.dtype
                    )
                )
                if p.grad is None:
                    continue
                grad_list.append(p.grad)

        # Overflow check of gradients
        device = self.param_groups[0]["params"][0].device
        found_inf = (
            grad_scaler._check_inf_per_device(self)[device]
            if grad_scaler is not None
            else torch.zeros((1,), device=device)
        )
        self._dummy_overflow_buf.copy_(found_inf)

        # Get unscale scale factor
        scale, inv_scale = None, None
        if grad_scaler:
            scale = grad_scaler._get_scale_async()
            inv_scale = scale.double().reciprocal().float()
        else:
            scale = torch.ones((1,), device=device)
            inv_scale = torch.ones((1,), device=device)

        # grad_norm is of scaled gradients.
        # So, multiply `max_grad_norm` by scale.
        max_grad_norm = self.defaults["max_grad_norm"] * scale
        grad_norm = multi_tensor_applier(
            self.multi_tensor_l2norm,
            self._dummy_overflow_buf,
            [grad_list],
            False,
        )[0]

        # Run LAMB optimization math
        for gid, (group, group_full) in enumerate(
            zip(self.param_groups, self.param_groups_full_precision)
        ):
            bias_correction = 1 if group["bias_correction"] else 0
            beta1, beta2 = group["betas"]
            grad_averaging = 1 if group["grad_averaging"] else 0

            # assume same step across group now to simplify things
            # per parameter step can be easily support by making it tensor, or pass list into kernel
            group["step"] += (self._dummy_overflow_buf != 1).to(torch.int)

            state_lists = [
                [],  # (0) grads
                [],  # (1) params
                [],  # (2) momentum state
                [],  # (3) velocity state
            ]
            if self.reduced_precision_dtype is not None:
                state_lists.append([])  # (4) params reduced_dtype

            for p, p_full in zip(group["params"], group_full["params"]):
                if p.grad is None:
                    continue
                assert not p.grad.is_sparse

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    dtype = p.dtype
                    if (
                        self.reduced_precision_dtype is not None
                        and p.dtype == self.reduced_precision_dtype
                    ):
                        dtype = torch.float32
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data, dtype=dtype)
                    # Exponential moving average of gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p.data, dtype=dtype)

                if self.reduced_precision_dtype is not None:
                    state_lists[0].append(p.grad.data)
                    state_lists[1].append(p_full.data)
                    state_lists[2].append(state["exp_avg"])
                    state_lists[3].append(state["exp_avg_sq"])
                    state_lists[4].append(p.data)
                else:
                    state_lists[0].append(p.grad.data)
                    state_lists[1].append(p.data)
                    state_lists[2].append(state["exp_avg"])
                    state_lists[3].append(state["exp_avg_sq"])

            multi_tensor_applier(
                self.multi_tensor_lamb,
                self._dummy_overflow_buf,
                state_lists,
                group["lr"],
                beta1,
                beta2,
                group["eps"],
                group["step"],
                bias_correction,
                group["weight_decay"],
                grad_averaging,
                self.adam_w_mode,
                grad_norm,
                max_grad_norm,
                self.use_nvlamb,
                found_inf,
                inv_scale,
            )

        return loss


================================================
FILE: apex/optimizers/fused_novograd.py
================================================
import torch
from apex.multi_tensor_apply import multi_tensor_applier


class FusedNovoGrad(torch.optim.Optimizer):
    """Implements NovoGrad algorithm.

    Currently GPU-only.  Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

    This version of fused NovoGrad implements 2 fusions.

      * Fusion of the NovoGrad update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.optimizers.FusedNovoGrad`'s usage is identical to any Pytorch optimizer::

        opt = apex.optimizers.FusedNovoGrad(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedNovoGrad` may be used with or without Amp.  If you wish to use :class:`FusedNovoGrad` with Amp,
    you may choose any ``opt_level``::

        opt = apex.optimizers.FusedNovoGrad(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()

    In general, ``opt_level="O1"`` is recommended.

    It has been proposed in `Jasper: An End-to-End Convolutional Neural Acoustic Model`_.
    More info: https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html#novograd

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            NOT SUPPORTED now! (default: False)
        reg_inside_moment (bool, optional): whether do regularization (norm and L2)
            in momentum calculation. True for include, False for not include and
            only do it on update term. (default: False)
        grad_averaging (bool, optional): whether apply (1-beta1) to grad when
            calculating running averages of gradient. (default: True)
        norm_type (int, optional): which norm to calculate for each layer.
            2 for L2 norm, and 0 for infinite norm. These 2 are only supported
            type now. (default: 2)
        init_zero (bool, optional): whether init norm with 0 (start averaging on
            1st step) or first step norm (start averaging on 2nd step). True for
            init with 0. (default: False)
        set_grad_none (bool, optional): whether set grad to None when zero_grad()
            method is called. (default: True)

    .. _Jasper - An End-to-End Convolutional Neural Acoustic Model:
        https://arxiv.org/abs/1904.03288
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0.0,
        amsgrad=False,
        reg_inside_moment=False,
        grad_averaging=True,
        norm_type=2,
        init_zero=False,
        set_grad_none=True,
    ):
        if amsgrad:
            raise RuntimeError("FusedNovoGrad does not support the AMSGrad variant.")
        defaults = dict(
            lr=lr,
            bias_correction=bias_correction,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            grad_averaging=grad_averaging,
            norm_type=norm_type,
            init_zero=init_zero,
        )
        super(FusedNovoGrad, self).__init__(params, defaults)
        if multi_tensor_applier.available:
            import amp_C
            # Skip buffer

            # Creating the overflow buffer on the same device as the params tensors.
            self._dummy_overflow_buf = torch.tensor(
                [0], dtype=torch.int, device=self.param_groups[0]["params"][0].device
            )
            self.multi_tensor_novograd = amp_C.multi_tensor_novograd
        else:
            raise RuntimeError("apex.optimizers.FusedNovoGrad requires cuda extensions")

        self.moment_mode = 0 if reg_inside_moment else 1
        self.set_grad_none = set_grad_none

    def zero_grad(self):
        if self.set_grad_none:
            for group in self.param_groups:
                for p in group["params"]:
                    p.grad = None
        else:
            super(FusedNovoGrad, self).zero_grad()

    def load_state_dict(self, state_dict):
        super(FusedNovoGrad, self).load_state_dict(state_dict)
        # in case exp_avg_sq is not on the same device as params, move it there
        for group in self.param_groups:
            if len(group["params"]) > 0:
                group["exp_avg_sq"][0] = group["exp_avg_sq"][0].to(group["params"][0].device)
                group["exp_avg_sq"][1] = group["exp_avg_sq"][1].to(group["params"][0].device)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            bias_correction = 1 if group["bias_correction"] else 0
            beta1, beta2 = group["betas"]
            grad_averaging = 1 if group["grad_averaging"] else 0

            # assume same step across group now to simplify things
            # per parameter step can be easily support by making it tensor, or pass list into kernel
            if "step" in group:
                group["step"] += 1
            else:
                group["step"] = 1

            # create lists for multi-tensor apply
            g_16, p_16, m_16 = [], [], []
            g_32, p_32, m_32 = [], [], []

            for p in group["params"]:
                if p.grad is None:
                    continue
                if p.grad.data.is_sparse:
                    raise RuntimeError(
                        "FusedNovoGrad does not support sparse gradients, please consider SparseAdam instead"
                    )

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data)

                if p.dtype == torch.float16:
                    g_16.append(p.grad.data)
                    p_16.append(p.data)
                    m_16.append(state["exp_avg"])
                elif p.dtype == torch.float32:
                    g_32.append(p.grad.data)
                    p_32.append(p.data)
                    m_32.append(state["exp_avg"])
                else:
                    raise RuntimeError("FusedNovoGrad only support fp16 and fp32.")

            # we store per weight norm as one tensor for one group/precision combination
            # different from optim.Adam, we store norm here(not ^2) so we can unify calculation for norm types
            if "exp_avg_sq" not in group:
                group["exp_avg_sq"] = [None, None]
                if group["init_zero"]:
                    # Creating the following parameters on the same device as the params tensors.
                    group["exp_avg_sq"][0] = (
                        torch.cuda.FloatTensor(
                            len(g_16), device=self.param_groups[0]["params"][0].device
                        )
                        .contiguous()
                        .fill_(0)
                    )
                    group["exp_avg_sq"][1] = (
                        torch.cuda.FloatTensor(
                            len(g_32), device=self.param_groups[0]["params"][0].device
                        )
                        .contiguous()
                        .fill_(0)
                    )
                else:  # init with first step norm, so first blend have no effect
                    if group["norm_type"] == 0:
                        v_16 = [torch.max(torch.abs(g.to(torch.float32))).item() for g in g_16]
                        v_32 = [torch.max(torch.abs(g)).item() for g in g_32]
                    elif group["norm_type"] == 2:
                        v_16 = [
                            torch.sum(torch.pow(g.to(torch.float32), 2)).sqrt().item() for g in g_16
                        ]
                        v_32 = [torch.sum(torch.pow(g, 2)).sqrt().item() for g in g_32]
                    else:
                        raise RuntimeError("FusedNovoGrad only support l2/inf norm now.")
                    # Creating the following parameters on the same device as the params tensors.
                    group["exp_avg_sq"][0] = torch.cuda.FloatTensor(
                        v_16, device=self.param_groups[0]["params"][0].device
                    )
                    group["exp_avg_sq"][1] = torch.cuda.FloatTensor(
                        v_32, device=self.param_groups[0]["params"][0].device
                    )
            else:
                assert len(g_16) == group["exp_avg_sq"][0].numel()
                assert len(g_32) == group["exp_avg_sq"][1].numel()

            if len(g_16) > 0:
                multi_tensor_applier(
                    self.multi_tensor_novograd,
                    self._dummy_overflow_buf,
                    [g_16, p_16, m_16],
                    group["exp_avg_sq"][0],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["step"],
                    bias_correction,
                    group["weight_decay"],
                    grad_averaging,
                    self.moment_mode,
                    group["norm_type"],
                )
            if len(g_32) > 0:
                multi_tensor_applier(
                    self.multi_tensor_novograd,
                    self._dummy_overflow_buf,
                    [g_32, p_32, m_32],
                    group["exp_avg_sq"][1],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["step"],
                    bias_correction,
                    group["weight_decay"],
                    grad_averaging,
                    self.moment_mode,
                    group["norm_type"],
                )

        return loss


================================================
FILE: apex/optimizers/fused_sgd.py
================================================
import torch
from torch.optim.optimizer import Optimizer, required

from apex.multi_tensor_apply import multi_tensor_applier


class FusedSGD(Optimizer):
    r"""Implements stochastic gradient descent (optionally with momentum).

    Currently GPU-only.  Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

    This version of fused SGD implements 2 fusions.

      * Fusion of the SGD update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.optimizers.FusedSGD` may be used as a drop-in replacement for ``torch.optim.SGD``::

        opt = apex.optimizers.FusedSGD(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedSGD` may be used with or without Amp.  If you wish to use :class:`FusedSGD` with Amp,
    you may choose any ``opt_level``::

        opt = apex.optimizers.FusedSGD(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()

    In general, ``opt_level="O1"`` is recommended.

    Nesterov momentum is based on the formula from
    `On the importance of initialization and momentum in deep learning`__.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf

    .. note::
        The implementation of SGD with Momentum/Nesterov subtly differs from
        Sutskever et. al. and implementations in some other frameworks.

        Considering the specific case of Momentum, the update can be written as

        .. math::
                  v = \rho * v + g \\
                  p = p - lr * v

        where p, g, v and :math:`\rho` denote the parameters, gradient,
        velocity, and momentum respectively.

        This is in contrast to Sutskever et. al. and
        other frameworks which employ an update of the form

        .. math::
             v = \rho * v + lr * g \\
             p = p - v

        The Nesterov version is analogously modified.
    """

    def __init__(
        self,
        params,
        lr=required,
        momentum=0,
        dampening=0,
        weight_decay=0,
        nesterov=False,
        wd_after_momentum=False,
        materialize_master_grads=True,
        set_grad_none=False,
    ):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(
            lr=lr,
            momentum=momentum,
            dampening=dampening,
            weight_decay=weight_decay,
            nesterov=nesterov,
        )
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(FusedSGD, self).__init__(params, defaults)

        self.wd_after_momentum = wd_after_momentum
        self.materialize_master_grads = materialize_master_grads
        self.most_recent_scale = 1.0
        self.scale_set_by_backward = False
        self.set_grad_none = set_grad_none

        if multi_tensor_applier.available:
            import amp_C

            # Skip buffer
            self._dummy_overflow_buf = torch.tensor(
                [0], dtype=torch.int, device=self.param_groups[0]["params"][0].device
            )
            self.multi_tensor_sgd = amp_C.multi_tensor_sgd
        else:
            raise RuntimeError("apex.optimizers.FusedSGD requires cuda extensions")

    def __setstate__(self, state):
        super(FusedSGD, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault("nesterov", False)

    def zero_grad(self):
        if self.set_grad_none:
            for group in self.param_groups:
                for p in group["params"]:
                    p.grad = None
        else:
            super(FusedSGD, self).zero_grad()

    def get_momentums(self, params):
        momentums = []
        first_run = True
        for p in params:
            param_state = self.state[p]
            # torch.optim.SGD initializes momentum in the main loop, we have
            # to do it here, and track whether or not we've done so, so that
            # momentum application can be skipped in the main kernel.
            if "momentum_buffer" not in param_state:
                first_run = True
                buf = param_state["momentum_buffer"] = torch.zeros_like(p.data)
                momentums.append(buf)
            else:
                first_run = False
                momentums.append(param_state["momentum_buffer"])
        return momentums, first_run

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        explicit_master_params = hasattr(self, "_amp_stash") and hasattr(
            self._amp_stash, "fp32_from_fp16_groups"
        )

        for gid, group in enumerate(self.param_groups):
            weight_decay = group["weight_decay"]
            momentum = group["momentum"]
            dampening = group["dampening"]
            nesterov = group["nesterov"]

            # For each group, there are 3 possible combinations we need to consider:
            # grad_type, param_to_update_type, momentum_type, requires_fp16_model_copy
            # 1. fp16, fp16, fp16, No
            # 2. fp32, fp32, fp32, No
            # 3. fp16, fp32, fp32, Yes

            first_runs = [True, True]

            # I think a bit of code divergence in exchange for naming clarity is worthwhile
            if explicit_master_params:
                stash = self._amp_stash

                fp32_params = [p for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
                fp32_grads = [
                    p.grad for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None
                ]
                fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)

                if self.materialize_master_grads:
                    fp16_model_params = [
                        p
                        for i, p in enumerate(stash.fp16_groups[gid])
                        if stash.fp32_from_fp16_groups[gid][i].grad is not None
                    ]
                    fp32_from_fp16_grads = [
                        p.grad for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None
                    ]
                    fp32_from_fp16_params = [
                        p for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None
                    ]
                    fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(
                        fp32_from_fp16_params
                    )

                    fp16_set = [
                        fp32_from_fp16_grads,
                        fp32_from_fp16_params,
                        fp32_from_fp16_momentums,
                        fp16_model_params,
                    ]
                else:
                    fp16_model_params = [p for p in stash.fp16_groups[gid] if p.grad is not None]
                    fp16_model_grads = [
                        p.grad for p in stash.fp16_groups[gid] if p.grad is not None
                    ]
                    fp32_from_fp16_params = [
                        p
                        for i, p in enumerate(stash.fp32_from_fp16_groups[gid])
                        if stash.fp16_groups[gid][i].grad is not None
                    ]
                    fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(
                        fp32_from_fp16_params
                    )

                    fp16_set = [
                        fp16_model_grads,
                        fp32_from_fp16_params,
                        fp32_from_fp16_momentums,
                        fp16_model_params,
                    ]

                launch_sets = [fp16_set, [fp32_grads, fp32_params, fp32_momentums]]
            else:
                fp16_params = [
                    p for p in group["params"] if (p.dtype == torch.float16 and p.grad is not None)
                ]
                fp16_grads = [
                    p.grad
                    for p in group["params"]
                    if (p.dtype == torch.float16 and p.grad is not None)
                ]
                fp16_momentums, first_runs[0] = self.get_momentums(fp16_params)

                fp32_params = [
                    p for p in group["params"] if (p.dtype == torch.float32 and p.grad is not None)
                ]
                fp32_grads = [
                    p.grad
                    for p in group["params"]
                    if (p.dtype == torch.float32 and p.grad is not None)
                ]
                fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)

                launch_sets = [
                    [fp16_grads, fp16_params, fp16_momentums],
                    [fp32_grads, fp32_params, fp32_momentums],
                ]

            for s, (launch_set, first_run) in enumerate(zip(launch_sets, first_runs)):
                assert len(launch_set[0]) == len(launch_set[1])
                assert len(launch_set[0]) == len(launch_set[2])
                if len(launch_set[0]) > 0:
                    multi_tensor_applier(
                        self.multi_tensor_sgd,
                        self._dummy_overflow_buf,
                        launch_set,
                        weight_decay,
                        momentum,
                        dampening,
                        group["lr"],
                        nesterov,
                        first_run,
                        self.wd_after_momentum,
                        1.0 / self.most_recent_scale,
                    )

        self.most_recent_scale = 1.0
        self.scale_set_by_backward = False

        return loss


================================================
FILE: csrc/amp_C_frontend.cpp
================================================
#include <torch/extension.h>

void multi_tensor_scale_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                             float scale);

void multi_tensor_sgd_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                           float wd, float momentum, float dampening, float lr, bool nesterov, bool first_run,
                           bool wd_after_momentum, float scale);

void multi_tensor_axpby_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                             float a, float b, int arg_to_check);

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(int chunk_size, at::Tensor noop_flag,
                                                            std::vector<std::vector<at::Tensor>> tensor_lists,
                                                            at::optional<bool> per_tensor_python);

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_mp_cuda(int chunk_size, at::Tensor noop_flag,
                                                               std::vector<std::vector<at::Tensor>> tensor_lists,
                                                               at::optional<bool> per_tensor_python);

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_scale_cuda(int chunk_size, at::Tensor noop_flag,
                                                                  std::vector<std::vector<at::Tensor>> tensor_lists,
                                                                  float scale, at::optional<bool> per_tensor_python);

std::tuple<at::Tensor, at::Tensor> multi_tensor_unscale_l2norm_cuda(int chunk_size, at::Tensor noop_flag,
                                                                    std::vector<std::vector<at::Tensor>> tensor_lists,
                                                                    at::Tensor inv_scale,
                                                                    at::optional<bool> per_tensor_python);

void multi_tensor_lamb_stage1_cuda(int chunk_size, at::Tensor noop_flag,
                                   std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor per_tensor_decay,
                                   const int step, const float beta1, const float beta2, const float epsilon,
                                   at::Tensor global_grad_norm, const float max_global_grad_norm);

void multi_tensor_lamb_stage2_cuda(int chunk_size, at::Tensor noop_flag,
                                   std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor per_tensor_param_norm,
                                   at::Tensor per_tensor_update_norm, const float lr, const float weight_decay,
                                   at::optional<bool> use_nvlamb_python);

void multi_tensor_adam_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                            const float lr, const float beta1, const float beta2, const float epsilon, const int step,
                            const int mode, const int bias_correction, const float weight_decay);

void multi_tensor_adam_capturable_cuda(int chunk_size, at::Tensor noop_flag,
                                       std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor lr,
                                       const float beta1, const float beta2, const float epsilon, at::Tensor step,
                                       const int mode, const int bias_correction, const float weight_decay,
                                       at::Tensor inv_scale);

void multi_tensor_adam_capturable_master_cuda(int chunk_size, at::Tensor noop_flag,
                                              std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor lr,
                                              const float beta1, const float beta2, const float epsilon,
                                              at::Tensor step, const int mode, const int bias_correction,
                                              const float weight_decay, at::Tensor inv_scale);

void multi_tensor_adagrad_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                               const float lr, const float epsilon, const int mode, const float weight_decay);

void multi_tensor_novograd_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                                at::Tensor grad_norms, const float lr, const float beta1, const float beta2,
                                const float epsilon, const int step, const int bias_correction,
                                const float weight_decay, const int grad_averaging, const int mode,
                                const int norm_type);

void multi_tensor_lamb_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                            const float lr, const float beta1, const float beta2, const float epsilon, const int step,
                            const int bias_correction, const float weight_decay, const int grad_averaging,
                            const int mode, at::Tensor global_grad_norm, const float max_grad_norm,
                            at::optional<bool> use_nvlamb_python);

void multi_tensor_lamb_mp_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                               at::Tensor lr, const float beta1, const float beta2, const float epsilon,
                               at::Tensor step, const int bias_correction, const float weight_decay,
                               const int grad_averaging, const int mode, at::Tensor global_grad_norm,
                               at::Tensor max_grad_norm, at::optional<bool> use_nvlamb_python, at::Tensor found_inf,
                               at::Tensor inv_scale);

at::Tensor update_scale_hysteresis_cuda(at::Tensor current_scale, at::Tensor growth_tracker,
                                        at::Tensor hysteresis_tracker, at::Tensor found_inf, const double growth_factor,
                                        const double backoff_factor, const int64_t growth_interval,
                                        const int hysteresis);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("multi_tensor_scale", &multi_tensor_scale_cuda, "Fused overflow check + scale for a list of contiguous tensors",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_sgd", &multi_tensor_sgd_cuda, "Fused SGD optimizer for list of contiguous tensors",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_axpby", &multi_tensor_axpby_cuda, "out = a*x + b*y for a list of contiguous tensors",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_l2norm", &multi_tensor_l2norm_cuda, "Computes L2 norm for a list of contiguous tensors",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_l2norm_mp", &multi_tensor_l2norm_mp_cuda, "Computes L2 norm for a list of contiguous tensors",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_l2norm_scale", &multi_tensor_l2norm_scale_cuda,
        "Computes L2 norm for a list of contiguous tensors and does scaling", py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_unscale_l2norm", &multi_tensor_unscale_l2norm_cuda,
        "Computes L2 norm for a list of contiguous tensors after unscaling (unscaling is only performed for L2 norm "
        "computation, and tensors are not updated)",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_lamb_stage1_cuda", &multi_tensor_lamb_stage1_cuda, "Computes update part of LAMB optimizer",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_lamb_stage2_cuda", &multi_tensor_lamb_stage2_cuda,
        "Completes application of gradient to parameters for LAMB optimizer", py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_adam", &multi_tensor_adam_cuda,
        "Compute and apply gradient update to parameters for Adam optimizer", py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_adam_capturable", &multi_tensor_adam_capturable_cuda,
        "Compute and apply gradient update to parameters for Adam optimizer with CUDA graph support and LR scheduling",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_adam_capturable_master", &multi_tensor_adam_capturable_master_cuda,
        "Compute and apply gradient update to parameters for Adam optimizer with CUDA graph support, LR scheduling and "
        "FP32 master weights",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_adagrad", &multi_tensor_adagrad_cuda,
        "Compute and apply gradient update to parameters for Adam optimizer", py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_novograd", &multi_tensor_novograd_cuda,
        "Compute and apply gradient update to parameters for Adam optimizer", py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer",
        py::call_guard<py::gil_scoped_release>());
  m.def("multi_tensor_lamb_mp", &multi_tensor_lamb_mp_cuda, "Computes and apply update for LAMB optimizer",
        py::call_guard<py::gil_scoped_release>());
  m.def("update_scale_hysteresis", &update_scale_hysteresis_cuda, "Updates scale while accounting for hysteresis",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/flatten_unflatten.cpp
================================================
#include <torch/csrc/utils/tensor_flatten.h>
#include <torch/extension.h>
// https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h

at::Tensor flatten(std::vector<at::Tensor> tensors) { return torch::utils::flatten_dense_tensors(tensors); }

std::vector<at::Tensor> unflatten(at::Tensor flat, std::vector<at::Tensor> tensors) {
  return torch::utils::unflatten_dense_tensors(flat, tensors);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("flatten", &flatten, "Flatten dense tensors", py::call_guard<py::gil_scoped_release>());
  m.def("unflatten", &unflatten, "Unflatten dense tensors", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/fused_dense.cpp
================================================
#include <stdio.h>
#include <torch/extension.h>
#include <torch/torch.h>

#include <vector>

template <typename T>
int linear_bias_forward_cuda(at::Tensor input, T* weight, at::Tensor bias, int in_features, int batch_size,
                             int out_features, at::Tensor output, void* lt_workspace);

template <typename T>
int linear_bias_backward_cuda(T* input, T* weight, T* d_output, int in_features, int batch_size, int out_features,
                              T* d_weight, T* d_bias, T* d_input, void* lt_workspace);

template <typename T>
int linear_gelu_linear_forward_cuda(T* input, T* weight1, T* bias1, T* weight2, T* bias2, int in_features,
                                    int hidden_features, int batch_size, int out_features, T* output1, T* output2,
                                    T* gelu_in, void* lt_workspace);

template <typename T>
int linear_gelu_linear_backward_cuda(T* input, T* gelu_in, T* output1, T* weight1, T* weight2, T* d_output1,
                                     T* d_output2, int in_features, int batch_size, int hidden_features,
                                     int out_features, T* d_weight1, T* d_weight2, T* d_bias1, T* d_bias2, T* d_input,
                                     void* lt_workspace);

at::Tensor linear_bias_forward(at::Tensor input, at::Tensor weight, at::Tensor bias) {
  auto batch_size = input.size(0);
  auto in_features = input.size(1);

  int out_features = weight.size(0);

  // auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());

  // create output/workspace tensor
  auto out = at::empty({batch_size, out_features}, input.options());
  // auto reserved_space = at::empty({reserved_size}, inputs[0].options());
  //  allocate fixed 4MB workspace for cublaslt for now, and this gets at least 4 MB
  auto lt_workspace = at::empty({1 << 22}, input.options());

  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "linear_bias_forward", [&] {
        scalar_t* w_ptr = weight.data_ptr<scalar_t>();
        scalar_t* b_ptr = bias.data_ptr<scalar_t>();
        [[maybe_unused]] auto result =
            linear_bias_forward_cuda<scalar_t>(input, w_ptr, bias, in_features, batch_size, out_features, out,
                                               // out.data_ptr<scalar_t>(),
                                               // reserved_space.data_ptr<scalar_t>(),
                                               (void*)(lt_workspace.data_ptr<scalar_t>()));
      });

  return {out};
}

std::vector<at::Tensor> linear_bias_backward(at::Tensor input, at::Tensor weight, at::Tensor d_output) {
  auto batch_size = input.size(0);
  auto in_features = input.size(1);

  int out_features = weight.size(0);

  // auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());

  // create output/workspace tensor
  auto d_weight = at::empty({out_features, in_features}, input.options());
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION < 11600
  auto d_bias = d_output.view({-1, out_features}).sum(0, false);
#else
  auto d_bias = at::empty({out_features}, input.options());
#endif
  auto d_input = at::empty({batch_size, in_features}, input.options());
  // auto reserved_space = at::empty({reserved_size}, inputs[0].options());
  //  allocate fixed 4MB workspace for cublaslt for now, and this gets at least 4 MB
  auto lt_workspace = at::empty({1 << 22}, input.options());

  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "linear_bias_backward", [&] {
        scalar_t* w_ptr = weight.data_ptr<scalar_t>();
        scalar_t* d_b_ptr = d_bias.data_ptr<scalar_t>();
        [[maybe_unused]] auto result = linear_bias_backward_cuda<scalar_t>(
            input.data_ptr<scalar_t>(), w_ptr, d_output.data_ptr<scalar_t>(), in_features, batch_size, out_features,
            d_weight.data_ptr<scalar_t>(), d_bias.data_ptr<scalar_t>(), d_input.data_ptr<scalar_t>(),
            // reserved_space.data_ptr<scalar_t>(),
            (void*)(lt_workspace.data_ptr<scalar_t>()));
      });

  return {d_input, d_weight, d_bias};
}

std::vector<at::Tensor> linear_gelu_linear_forward(at::Tensor input, at::Tensor weight1, at::Tensor bias1,
                                                   at::Tensor weight2, at::Tensor bias2) {
  auto batch_size = input.size(0);
  auto in_features = input.size(1);

  int hidden_features = weight1.size(0);
  int out_features = weight2.size(0);

  // auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());

  // create output/workspace tensor
  auto output1 = at::empty({batch_size, hidden_features}, input.options());
  auto gelu_in = at::empty({batch_size, hidden_features}, input.options());
  auto output2 = at::empty({batch_size, out_features}, input.options());
  // auto reserved_space = at::empty({reserved_size}, inputs[0].options());
  //  allocate fixed 4MB workspace for cublaslt for now, and this gets at least 4 MB
  auto lt_workspace = at::empty({1 << 22}, input.options());

  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "linear_gelu_linear_forward", [&] {
        scalar_t* w1_ptr = weight1.data_ptr<scalar_t>();
        scalar_t* b1_ptr = bias1.data_ptr<scalar_t>();
        scalar_t* w2_ptr = weight2.data_ptr<scalar_t>();
        scalar_t* b2_ptr = bias2.data_ptr<scalar_t>();
        [[maybe_unused]] auto result = linear_gelu_linear_forward_cuda<scalar_t>(
            input.data_ptr<scalar_t>(), w1_ptr, b1_ptr, w2_ptr, b2_ptr, in_features, hidden_features, batch_size,
            out_features, output1.data_ptr<scalar_t>(), output2.data_ptr<scalar_t>(), gelu_in.data_ptr<scalar_t>(),
            // reserved_space.data_ptr<scalar_t>(),
            (void*)(lt_workspace.data_ptr<scalar_t>()));
      });

  return {output1, output2, gelu_in};
}

std::vector<at::Tensor> linear_gelu_linear_backward(at::Tensor input, at::Tensor gelu_in, at::Tensor output1,
                                                    at::Tensor weight1, at::Tensor weight2, at::Tensor d_output2) {
  auto batch_size = input.size(0);
  auto in_features = input.size(1);

  int hidden_features = weight1.size(0);
  int out_features = weight2.size(0);

  // auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());

  // create output/workspace tensor
  auto d_weight1 = at::empty({hidden_features, in_features}, input.options());
  auto d_weight2 = at::empty({out_features, hidden_features}, input.options());
  auto d_bias1 = at::empty({hidden_features}, input.options());
  auto d_bias2 = at::empty({out_features}, input.options());
  auto d_input = at::empty({batch_size, in_features}, input.options());
  auto d_output1 = at::empty({batch_size, hidden_features}, input.options());
  // auto reserved_space = at::empty({reserved_size}, inputs[0].options());
  //  allocate fixed 4MB workspace for cublaslt for now, and this gets at least 4 MB
  auto lt_workspace = at::empty({1 << 22}, input.options());

  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "linear_bias_backward", [&] {
        // scalar_t* w_ptr = weight.data_ptr<scalar_t>();
        // scalar_t* d_b_ptr = d_bias.data_ptr<scalar_t>();
        [[maybe_unused]] auto result = linear_gelu_linear_backward_cuda<scalar_t>(
            input.data_ptr<scalar_t>(), gelu_in.data_ptr<scalar_t>(), output1.data_ptr<scalar_t>(),
            weight1.data_ptr<scalar_t>(), weight2.data_ptr<scalar_t>(), d_output1.data_ptr<scalar_t>(),
            d_output2.data_ptr<scalar_t>(), in_features, batch_size, hidden_features, out_features,
            d_weight1.data_ptr<scalar_t>(), d_weight2.data_ptr<scalar_t>(), d_bias1.data_ptr<scalar_t>(),
            d_bias2.data_ptr<scalar_t>(), d_input.data_ptr<scalar_t>(),
            // reserved_space.data_ptr<scalar_t>(),
            (void*)(lt_workspace.data_ptr<scalar_t>()));
      });

  return {d_input, d_weight1, d_bias1, d_weight2, d_bias2};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("linear_bias_forward", &linear_bias_forward, "linear bias forward", py::call_guard<py::gil_scoped_release>());
  m.def("linear_bias_backward", &linear_bias_backward, "linear bias backward",
        py::call_guard<py::gil_scoped_release>());
  m.def("linear_gelu_linear_forward", &linear_gelu_linear_forward, "linear gelu linear forward",
        py::call_guard<py::gil_scoped_release>());
  m.def("linear_gelu_linear_backward", &linear_gelu_linear_backward, "linear gelu linear backward",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/fused_dense_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <torch/torch.h>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
// includes cublaslt
#include <cublasLt.h>
#endif
// FP64 Wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, double* A, int lda, double* B, int ldb, const float* beta, double* C,
                         int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_64F, lda, B, CUDA_R_64F, ldb, beta, C,
                      CUDA_R_64F, ldc, CUDA_R_64F, CUBLAS_GEMM_DEFAULT);
}

// FP32 Wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, float* A, int lda, float* B, int ldb, const float* beta, float* C,
                         int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_32F, lda, B, CUDA_R_32F, ldb, beta, C,
                      CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}

// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, at::Half* A, int lda, at::Half* B, int ldb, const float* beta, at::Half* C,
                         int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_16F, lda, B, CUDA_R_16F, ldb, beta, C,
                      CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}

// BF16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, at::BFloat16* A, int lda, at::BFloat16* B, int ldb, const float* beta,
                         at::BFloat16* C, int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_16BF, lda, B, CUDA_R_16BF, ldb, beta, C,
                      CUDA_R_16BF, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600

int gemm_bias_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                 const float* alpha,                                            /* host pointer */
                 at::Half* A, int lda, at::Half* B, int ldb, const float* beta, /* host pointer */
                 at::Half* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                 const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bias_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                 const float* alpha,                                                    /* host pointer */
                 at::BFloat16* A, int lda, at::BFloat16* B, int ldb, const float* beta, /* host pointer */
                 at::BFloat16* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                 const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bias_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                 const float* alpha,                                        /* host pointer */
                 double* A, int lda, double* B, int ldb, const float* beta, /* host pointer */
                 double* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                 const void* bias) {
  return 1;
}

int gemm_bias_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                 const float* alpha,                                      /* host pointer */
                 float* A, int lda, float* B, int ldb, const float* beta, /* host pointer */
                 float* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                 const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          &heuristicResult.algo, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bias_gelu_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                      int k, const float* alpha,                                     /* host pointer */
                      at::Half* A, int lda, at::Half* B, int ldb, const float* beta, /* host pointer */
                      at::Half* C, int64_t ldc, void* workspace, size_t workspaceSize, cudaStream_t stream,
                      bool use_bias, const void* gelu_in, const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_GELU_AUX;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in,
                                          sizeof(gelu_in));
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_GELU_AUX_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bias_gelu_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                      int k, const float* alpha,                                             /* host pointer */
                      at::BFloat16* A, int lda, at::BFloat16* B, int ldb, const float* beta, /* host pointer */
                      at::BFloat16* C, int64_t ldc, void* workspace, size_t workspaceSize, cudaStream_t stream,
                      bool use_bias, const void* gelu_in, const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_GELU_AUX;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in,
                                          sizeof(gelu_in));
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_GELU_AUX_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bias_gelu_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                      int k, const float* alpha,                                 /* host pointer */
                      double* A, int lda, double* B, int ldb, const float* beta, /* host pointer */
                      double* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                      const void* gelu_in, const void* bias) {
  return 1;
}

int gemm_bias_gelu_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                      int k, const float* alpha,                               /* host pointer */
                      float* A, int lda, float* B, int ldb, const float* beta, /* host pointer */
                      float* C, int64_t ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                      const void* gelu_in, const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_GELU_AUX;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in,
                                          sizeof(gelu_in));
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_GELU_AUX_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                   const float* alpha,                                            /* host pointer */
                   at::Half* A, int lda, at::Half* B, int ldb, const float* beta, /* host pointer */
                   at::Half* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                   const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                   const float* alpha,                                                    /* host pointer */
                   at::BFloat16* A, int lda, at::BFloat16* B, int ldb, const float* beta, /* host pointer */
                   at::BFloat16* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                   const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                   const float* alpha,                                        /* host pointer */
                   double* A, int lda, double* B, int ldb, const float* beta, /* host pointer */
                   double* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                   const void* bgrad) {
  return 1;
}

int gemm_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                   const float* alpha,                                      /* host pointer */
                   float* A, int lda, float* B, int ldb, const float* beta, /* host pointer */
                   float* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                   const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          &heuristicResult.algo, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_dgelu_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                         int k, const float* alpha,                                     /* host pointer */
                         at::Half* A, int lda, at::Half* B, int ldb, const float* beta, /* host pointer */
                         at::Half* C, int64_t ldc, void* workspace, size_t workspaceSize, cudaStream_t stream,
                         const void* gelu_in, const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DGELU_BGRAD;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in,
                                          sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_dgelu_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                         int k, const float* alpha,                                             /* host pointer */
                         at::BFloat16* A, int lda, at::BFloat16* B, int ldb, const float* beta, /* host pointer */
                         at::BFloat16* C, int64_t ldc, void* workspace, size_t workspaceSize, cudaStream_t stream,
                         const void* gelu_in, const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DGELU_BGRAD;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in,
                                          sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_dgelu_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                         int k, const float* alpha,                                 /* host pointer */
                         double* A, int lda, double* B, int ldb, const float* beta, /* host pointer */
                         double* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream,
                         const void* gelu_in, const void* bgrad) {
  return 1;
}

int gemm_dgelu_bgradb_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
                         int k, const float* alpha,                               /* host pointer */
                         float* A, int lda, float* B, int ldb, const float* beta, /* host pointer */
                         float* C, int64_t ldc, void* workspace, size_t workspaceSize, cudaStream_t stream,
                         const void* gelu_in, const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DGELU_BGRAD;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in,
                                          sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

#endif

template <typename T>
int linear_bias_forward_cuda(at::Tensor input, T* weight, at::Tensor bias, int in_features, int batch_size,
                             int out_features, at::Tensor output, void* lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  const float beta_one = 1.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  status = gemm_bias_lt((cublasLtHandle_t)handle, CUBLAS_OP_T, CUBLAS_OP_N, out_features, batch_size, in_features,
                        &alpha,                                                            /* host pointer */
                        weight, in_features, input.data_ptr<T>(), in_features, &beta_zero, /* host pointer */
                        output.data_ptr<T>(), out_features, lt_workspace, 1 << 22, stream, true,
                        static_cast<const void*>(bias.data_ptr<T>()));
#endif
  if (status != 0) {
    output.copy_(bias);
    status = gemm_bias(handle, CUBLAS_OP_T, CUBLAS_OP_N, out_features, batch_size, in_features, &alpha, weight,
                       in_features, input.data_ptr<T>(), in_features, &beta_one, output.data_ptr<T>(), out_features);
  }
  return status;
}

template <typename T>
int linear_bias_backward_cuda(T* input, T* weight, T* d_output, int in_features, int batch_size, int out_features,
                              T* d_weight, T* d_bias, T* d_input, void* lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  status = gemm_bgradb_lt((cublasLtHandle_t)handle, CUBLAS_OP_N, CUBLAS_OP_T, in_features, out_features, batch_size,
                          &alpha,                                                 /* host pointer */
                          input, in_features, d_output, out_features, &beta_zero, /* host pointer */
                          d_weight, in_features, lt_workspace, 1 << 22, stream, true, static_cast<const void*>(d_bias));
#endif

  if (status != 0) {
    status = gemm_bias(handle, CUBLAS_OP_N, CUBLAS_OP_T, in_features, out_features, batch_size, &alpha, input,
                       in_features, d_output, out_features, &beta_zero, d_weight, in_features);
  }

  status = gemm_bias(handle, CUBLAS_OP_N, CUBLAS_OP_N, in_features, batch_size, out_features, &alpha, weight,
                     in_features, d_output, out_features, &beta_zero, d_input, in_features);
  return status;
}

template <typename T>
int linear_gelu_linear_forward_cuda(T* input, T* weight1, T* bias1, T* weight2, T* bias2, int in_features,
                                    int hidden_features, int batch_size, int out_features, T* output1, T* output2,
                                    T* gelu_in, void* lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  status = gemm_bias_gelu_lt((cublasLtHandle_t)handle, CUBLAS_OP_T, CUBLAS_OP_N, hidden_features, batch_size,
                             in_features, &alpha,                                  /* host pointer */
                             weight1, in_features, input, in_features, &beta_zero, /* host pointer */
                             output1, hidden_features, lt_workspace, 1 << 22, stream, true,
                             static_cast<const void*>(gelu_in), static_cast<const void*>(bias1));
  status = gemm_bias_lt((cublasLtHandle_t)handle, CUBLAS_OP_T, CUBLAS_OP_N, out_features, batch_size, hidden_features,
                        &alpha,                                                         /* host pointer */
                        weight2, hidden_features, output1, hidden_features, &beta_zero, /* host pointer */
                        output2, out_features, lt_workspace, 1 << 22, stream, true, static_cast<const void*>(bias2));
  return status;
#else
  return 1;
#endif
}

template <typename T>
int linear_gelu_linear_backward_cuda(T* input, T* gelu_in, T* output1, T* weight1, T* weight2, T* d_output1,
                                     T* d_output2, int in_features, int batch_size, int hidden_features,
                                     int out_features, T* d_weight1, T* d_weight2, T* d_bias1, T* d_bias2, T* d_input,
                                     void* lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  // wgrad for first gemm
  status = gemm_bgradb_lt((cublasLtHandle_t)handle, CUBLAS_OP_N, CUBLAS_OP_T, hidden_features, out_features, batch_size,
                          &alpha,                                                        /* host pointer */
                          output1, hidden_features, d_output2, out_features, &beta_zero, /* host pointer */
                          d_weight2, hidden_features, lt_workspace, 1 << 22, stream, true,
                          static_cast<const void*>(d_bias2));
  // dgrad for second GEMM
  status = gemm_dgelu_bgradb_lt((cublasLtHandle_t)handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_features, batch_size,
                                out_features, &alpha,                                          /* host pointer */
                                weight2, hidden_features, d_output2, out_features, &beta_zero, /* host pointer */
                                d_output1, hidden_features, lt_workspace, 1 << 22, stream,
                                static_cast<const void*>(gelu_in), static_cast<const void*>(d_bias1));
  // wgrad for the first GEMM
  status = gemm_bias(handle, CUBLAS_OP_N, CUBLAS_OP_T, in_features, hidden_features, batch_size, &alpha, input,
                     in_features, d_output1, hidden_features, &beta_zero, d_weight1, in_features);

  // dgrad for the first GEMM
  status = gemm_bias(handle, CUBLAS_OP_N, CUBLAS_OP_N, in_features, batch_size, hidden_features, &alpha, weight1,
                     in_features, d_output1, hidden_features, &beta_zero, d_input, in_features);
#endif
  return status;
}

template int linear_bias_forward_cuda<at::Half>(at::Tensor input, at::Half* weight, at::Tensor bias, int in_features,
                                                int batch_size, int out_features, at::Tensor output,
                                                void* lt_workspace);

template int linear_bias_forward_cuda<float>(at::Tensor input, float* weight, at::Tensor bias, int in_features,
                                             int batch_size, int out_features, at::Tensor output, void* lt_workspace);

template int linear_bias_forward_cuda<double>(at::Tensor input, double* weight, at::Tensor bias, int in_features,
                                              int batch_size, int out_features, at::Tensor output, void* lt_workspace);

template int linear_bias_backward_cuda<at::Half>(at::Half* input, at::Half* weight, at::Half* d_output, int in_features,
                                                 int batch_size, int out_features, at::Half* d_weight, at::Half* d_bias,
                                                 at::Half* d_input, void* lt_workspace);

template int linear_bias_backward_cuda<float>(float* input, float* weight, float* d_output, int in_features,
                                              int batch_size, int out_features, float* d_weight, float* d_bias,
                                              float* d_input, void* lt_workspace);

template int linear_bias_backward_cuda<double>(double* input, double* weight, double* d_output, int in_features,
                                               int batch_size, int out_features, double* d_weight, double* d_bias,
                                               double* d_input, void* lt_workspace);

template int linear_gelu_linear_forward_cuda<at::Half>(at::Half* input, at::Half* weight1, at::Half* bias1,
                                                       at::Half* weight2, at::Half* bias2, int in_features,
                                                       int hidden_features, int batch_size, int out_features,
                                                       at::Half* output1, at::Half* output2, at::Half* gelu_in,
                                                       void* lt_workspace);

template int linear_gelu_linear_forward_cuda<float>(float* input, float* weight1, float* bias1, float* weight2,
                                                    float* bias2, int in_features, int hidden_features, int batch_size,
                                                    int out_features, float* output1, float* output2, float* gelu_in,
                                                    void* lt_workspace);

template int linear_gelu_linear_forward_cuda<double>(double* input, double* weight1, double* bias1, double* weight2,
                                                     double* bias2, int in_features, int hidden_features,
                                                     int batch_size, int out_features, double* output1, double* output2,
                                                     double* gelu_in, void* lt_workspace);

template int linear_gelu_linear_backward_cuda<at::Half>(at::Half* input, at::Half* gelu_in, at::Half* output1,
                                                        at::Half* weight1, at::Half* weight2, at::Half* d_output1,
                                                        at::Half* d_output2, int in_features, int batch_size,
                                                        int hidden_features, int out_features, at::Half* d_weight1,
                                                        at::Half* d_weight2, at::Half* d_bias1, at::Half* d_bias2,
                                                        at::Half* d_input, void* lt_workspace);

template int linear_gelu_linear_backward_cuda<float>(float* input, float* gelu_in, float* output1, float* weight1,
                                                     float* weight2, float* d_output1, float* d_output2,
                                                     int in_features, int batch_size, int hidden_features,
                                                     int out_features, float* d_weight1, float* d_weight2,
                                                     float* d_bias1, float* d_bias2, float* d_input,
                                                     void* lt_workspace);

template int linear_gelu_linear_backward_cuda<double>(double* input, double* gelu_in, double* output1, double* weight1,
                                                      double* weight2, double* d_output1, double* d_output2,
                                                      int in_features, int batch_size, int hidden_features,
                                                      int out_features, double* d_weight1, double* d_weight2,
                                                      double* d_bias1, double* d_bias2, double* d_input,
                                                      void* lt_workspace);

template int linear_bias_forward_cuda<at::BFloat16>(at::Tensor input, at::BFloat16* weight, at::Tensor bias,
                                                    int in_features, int batch_size, int out_features,
                                                    at::Tensor output, void* lt_workspace);

template int linear_bias_backward_cuda<at::BFloat16>(at::BFloat16* input, at::BFloat16* weight, at::BFloat16* d_output,
                                                     int in_features, int batch_size, int out_features,
                                                     at::BFloat16* d_weight, at::BFloat16* d_bias,
                                                     at::BFloat16* d_input, void* lt_workspace);

template int linear_gelu_linear_forward_cuda<at::BFloat16>(at::BFloat16* input, at::BFloat16* weight1,
                                                           at::BFloat16* bias1, at::BFloat16* weight2,
                                                           at::BFloat16* bias2, int in_features, int hidden_features,
                                                           int batch_size, int out_features, at::BFloat16* output1,
                                                           at::BFloat16* output2, at::BFloat16* gelu_in,
                                                           void* lt_workspace);

template int linear_gelu_linear_backward_cuda<at::BFloat16>(
    at::BFloat16* input, at::BFloat16* gelu_in, at::BFloat16* output1, at::BFloat16* weight1, at::BFloat16* weight2,
    at::BFloat16* d_output1, at::BFloat16* d_output2, int in_features, int batch_size, int hidden_features,
    int out_features, at::BFloat16* d_weight1, at::BFloat16* d_weight2, at::BFloat16* d_bias1, at::BFloat16* d_bias2,
    at::BFloat16* d_input, void* lt_workspace);


================================================
FILE: csrc/layer_norm_cuda.cpp
================================================
#include <torch/extension.h>

#include <cassert>
#include <optional>
#include <vector>

namespace {
void compute_n1_n2(at::Tensor input, at::IntArrayRef normalized_shape, int& n1, int& n2) {
  int idiff = input.ndimension() - normalized_shape.size();
  n2 = 1;
  for (int i = 0; i < (int)normalized_shape.size(); ++i) {
    assert(input.sizes()[i + idiff] == normalized_shape[i]);
    n2 *= normalized_shape[i];
  }
  n1 = 1;
  for (int i = 0; i < idiff; ++i) {
    n1 *= input.sizes()[i];
  }
}

void check_args(at::IntArrayRef normalized_shape, at::Tensor gamma, at::Tensor beta) {
  TORCH_CHECK(!gamma.defined() || gamma.sizes().equals(normalized_shape));
  TORCH_CHECK(!beta.defined() || beta.sizes().equals(normalized_shape));
}

void check_args(at::IntArrayRef normalized_shape, at::Tensor gamma) {
  TORCH_CHECK(!gamma.defined() || gamma.sizes().equals(normalized_shape));
}

void check_args(at::Tensor input, at::IntArrayRef normalized_shape, int& n1, int& n2) {
  int64_t normalized_ndim = normalized_shape.size();

  if (normalized_ndim < 1) {
    std::stringstream ss;
    ss << "Expected normalized_shape to be at least 1-dimensional, i.e., "
       << "containing at least one element, but got normalized_shape=" << normalized_shape;
    throw std::runtime_error(ss.str());
  }

  auto input_shape = input.sizes();
  auto input_ndim = input.dim();

  if (input_ndim < normalized_ndim || !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) {
    std::stringstream ss;
    ss << "Given normalized_shape=" << normalized_shape << ", expected input with shape [*";
    for (auto size : normalized_shape) {
      ss << ", " << size;
    }
    ss << "], but got input of size" << input_shape;
    throw std::runtime_error(ss.str());
  }

  compute_n1_n2(input, normalized_shape, n1, n2);
}

void check_args(at::Tensor input, at::IntArrayRef normalized_shape, at::Tensor gamma, at::Tensor beta, int& n1,
                int& n2) {
  check_args(input, normalized_shape, n1, n2);
  check_args(normalized_shape, gamma, beta);
}

void check_args(at::Tensor input, at::IntArrayRef normalized_shape, at::Tensor gamma, int& n1, int& n2) {
  check_args(input, normalized_shape, n1, n2);
  check_args(normalized_shape, gamma);
}
}  // namespace

void cuda_layer_norm(at::Tensor& output, at::Tensor& mean, at::Tensor& invvar, const at::Tensor& input, int n1, int n2,
                     at::IntArrayRef normalized_shape, const std::optional<at::Tensor>& gamma,
                     const std::optional<at::Tensor>& beta, double epsilon);

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<at::Tensor> layer_norm(const at::Tensor& input, at::IntArrayRef normalized_shape, double epsilon) {
  CHECK_INPUT(input);
  int n1, n2;
  check_args(input, normalized_shape, n1, n2);
  at::Tensor output = at::empty_like(input);
  at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type() == at::ScalarType::Half ||
                                                                  input.scalar_type() == at::ScalarType::BFloat16
                                                              ? at::ScalarType::Float
                                                              : input.scalar_type()));
  at::Tensor invvar = at::empty_like(mean);
  cuda_layer_norm(output, mean, invvar, input, n1, n2, normalized_shape, std::nullopt, std::nullopt, epsilon);
  return {output, mean, invvar};
}

std::vector<at::Tensor> layer_norm_affine(const at::Tensor& input, at::IntArrayRef normalized_shape,
                                          const at::Tensor& gamma, const at::Tensor& beta, double epsilon) {
  CHECK_INPUT(input);
  CHECK_INPUT(gamma);
  CHECK_INPUT(beta);
  int n1, n2;
  check_args(input, normalized_shape, gamma, beta, n1, n2);
  at::Tensor output = at::empty_like(input);
  const auto stats_dtype =
      (input.scalar_type() == at::ScalarType::Half || input.scalar_type() == at::ScalarType::BFloat16)
          ? at::ScalarType::Float
          : input.scalar_type();
  at::Tensor mean = at::empty({n1}, input.options().dtype(stats_dtype));
  at::Tensor invvar = at::empty_like(mean);
  cuda_layer_norm(output, mean, invvar, input, n1, n2, normalized_shape, gamma, beta, epsilon);
  return {output, mean, invvar};
}

std::vector<at::Tensor> layer_norm_affine_mixed_dtypes(const at::Tensor& input, at::IntArrayRef normalized_shape,
                                                       const at::Tensor& gamma, const at::Tensor& beta,
                                                       double epsilon) {
  CHECK_INPUT(input);
  int n1, n2;
  check_args(input, normalized_shape, n1, n2);
  at::Tensor output = at::empty_like(input, gamma.options().dtype(gamma.scalar_type()));
  at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type() == at::ScalarType::Half ||
                                                                  input.scalar_type() == at::ScalarType::BFloat16
                                                              ? at::ScalarType::Float
                                                              : input.scalar_type()));
  at::Tensor invvar = at::empty_like(mean);
  cuda_layer_norm(output, mean, invvar, input, n1, n2, normalized_shape, gamma, beta, epsilon);
  return {output, mean, invvar};
}

void cuda_layer_norm_gradient(at::Tensor& dout, const std::optional<at::Tensor>& mean, at::Tensor& invvar,
                              at::Tensor& input_or_output, int n1, int n2, at::IntArrayRef normalized_shape,
                              const std::optional<at::Tensor>& gamma, const std::optional<at::Tensor>& beta,
                              double epsilon, at::Tensor& grad_input, const std::optional<at::Tensor>& grad_gamma,
                              const std::optional<at::Tensor>& grad_beta, bool memory_efficient);

at::Tensor layer_norm_gradient(at::Tensor& dout, const std::optional<at::Tensor>& mean_, at::Tensor& invvar,
                               at::Tensor& input_or_output, at::IntArrayRef normalized_shape, double epsilon,
                               bool memory_efficient) {
  CHECK_INPUT(dout);
  CHECK_INPUT(invvar);
  CHECK_INPUT(input_or_output);
  int n1, n2;
  check_args(input_or_output, normalized_shape, n1, n2);
  at::Tensor grad_input = at::empty_like(input_or_output);

  cuda_layer_norm_gradient(dout, mean_, invvar, input_or_output, n1, n2, normalized_shape, std::nullopt, std::nullopt,
                           epsilon, grad_input, std::nullopt, std::nullopt, memory_efficient);
  return grad_input;
}

std::vector<at::Tensor> layer_norm_gradient_affine(at::Tensor& dout, const std::optional<at::Tensor>& mean_,
                                                   at::Tensor& invvar, at::Tensor& input_or_output,
                                                   at::IntArrayRef normalized_shape, at::Tensor& gamma,
                                                   at::Tensor& beta, double epsilon, bool memory_efficient) {
  CHECK_INPUT(dout);
  CHECK_INPUT(invvar);
  CHECK_INPUT(input_or_output);
  CHECK_INPUT(gamma);
  CHECK_INPUT(beta);
  int n1, n2;
  check_args(input_or_output, normalized_shape, gamma, beta, n1, n2);
  at::Tensor grad_input = at::empty_like(input_or_output);
  at::Tensor grad_gamma = at::empty_like(gamma);
  at::Tensor grad_beta = at::empty_like(beta);
  cuda_layer_norm_gradient(dout, mean_, invvar, input_or_output, n1, n2, normalized_shape, gamma, beta, epsilon,
                           grad_input, grad_gamma, grad_beta, memory_efficient);
  return {grad_input, grad_gamma, grad_beta};
}

void cuda_rms_norm(at::Tensor& output, at::Tensor& invvar, const at::Tensor& input, int n1, int n2,
                   at::IntArrayRef normalized_shape, const std::optional<at::Tensor>& gamma, double epsilon);

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

std::vector<at::Tensor> rms_norm(const at::Tensor& input, at::IntArrayRef normalized_shape, double epsilon) {
  CHECK_INPUT(input);
  int n1, n2;
  check_args(input, normalized_shape, n1, n2);
  at::Tensor output = at::empty_like(input);
  at::Tensor invvar = at::empty({n1}, input.options().dtype(input.scalar_type() == at::ScalarType::Half ||
                                                                    input.scalar_type() == at::ScalarType::BFloat16
                                                                ? at::ScalarType::Float
                                                                : input.scalar_type()));
  cuda_rms_norm(output, invvar, input, n1, n2, normalized_shape, std::nullopt, epsilon);
  return {output, invvar};
}

std::vector<at::Tensor> rms_norm_affine(const at::Tensor& input, at::IntArrayRef normalized_shape,
                                        const at::Tensor& gamma, double epsilon) {
  CHECK_INPUT(input);
  CHECK_INPUT(gamma);
  int n1, n2;
  check_args(input, normalized_shape, gamma, n1, n2);
  at::Tensor output = at::empty_like(input);
  const auto stats_dtype =
      (input.scalar_type() == at::ScalarType::Half || input.scalar_type() == at::ScalarType::BFloat16)
          ? at::ScalarType::Float
          : input.scalar_type();
  at::Tensor invvar = at::empty({n1}, input.options().dtype(stats_dtype));
  cuda_rms_norm(output, invvar, input, n1, n2, normalized_shape, gamma, epsilon);
  return {output, invvar};
}

std::vector<at::Tensor> rms_norm_affine_mixed_dtypes(const at::Tensor& input, at::IntArrayRef normalized_shape,
                                                     const at::Tensor& gamma, double epsilon) {
  CHECK_INPUT(input);
  int n1, n2;
  check_args(input, normalized_shape, n1, n2);
  at::Tensor output = at::empty_like(input, gamma.options().dtype(gamma.scalar_type()));
  at::Tensor invvar = at::empty({n1}, input.options().dtype(input.scalar_type() == at::ScalarType::Half ||
                                                                    input.scalar_type() == at::ScalarType::BFloat16
                                                                ? at::ScalarType::Float
                                                                : input.scalar_type()));

  cuda_rms_norm(output, invvar, input, n1, n2, normalized_shape, gamma, epsilon);
  return {output, invvar};
}

void cuda_rms_norm_gradient(at::Tensor& dout, at::Tensor& invvar, at::Tensor& input_or_output, int n1, int n2,
                            at::IntArrayRef normalized_shape, const std::optional<at::Tensor>& gamma, double epsilon,
                            at::Tensor& grad_input, const std::optional<at::Tensor>& grad_gamma, bool memory_efficient);

at::Tensor rms_norm_gradient(at::Tensor& dout, at::Tensor& invvar, at::Tensor& input_or_output,
                             at::IntArrayRef normalized_shape, double epsilon, bool memory_efficient) {
  CHECK_INPUT(dout);
  CHECK_INPUT(invvar);
  CHECK_INPUT(input_or_output);
  int n1, n2;
  check_args(input_or_output, normalized_shape, n1, n2);
  at::Tensor grad_input = at::empty_like(input_or_output);
  cuda_rms_norm_gradient(dout, invvar, input_or_output, n1, n2, normalized_shape, std::nullopt, epsilon, grad_input,
                         std::nullopt, memory_efficient);
  return grad_input;
}

std::vector<at::Tensor> rms_norm_gradient_affine(at::Tensor& dout, at::Tensor& invvar, at::Tensor& input_or_output,
                                                 at::IntArrayRef normalized_shape, at::Tensor& gamma, double epsilon,
                                                 bool memory_efficient) {
  CHECK_INPUT(dout);
  CHECK_INPUT(invvar);
  CHECK_INPUT(input_or_output);
  CHECK_INPUT(gamma);
  int n1, n2;
  check_args(input_or_output, normalized_shape, gamma, n1, n2);
  at::Tensor grad_input = at::empty_like(input_or_output);
  at::Tensor grad_gamma = at::empty_like(gamma);
  cuda_rms_norm_gradient(dout, invvar, input_or_output, n1, n2, normalized_shape, gamma, epsilon, grad_input,
                         grad_gamma, memory_efficient);
  return {grad_input, grad_gamma};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward_affine", &layer_norm_affine, "LayerNorm forward (CUDA)", py::call_guard<py::gil_scoped_release>());
  m.def("forward", &layer_norm, "LayerNorm forward (CUDA)", py::call_guard<py::gil_scoped_release>());
  m.def("backward_affine", &layer_norm_gradient_affine, "LayerNorm backward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward", &layer_norm_gradient, "LayerNorm backward (CUDA)", py::call_guard<py::gil_scoped_release>());

  m.def("forward_affine_mixed_dtypes", &layer_norm_affine_mixed_dtypes,
        "LayerNorm forward with mixed dtypes (CUDA) compatible with Megatron's implementation",
        py::call_guard<py::gil_scoped_release>());

  m.def("rms_forward_affine", &rms_norm_affine, "RMSNorm forward (CUDA)", py::call_guard<py::gil_scoped_release>());
  m.def("rms_forward", &rms_norm, "RMSNorm forward (CUDA)", py::call_guard<py::gil_scoped_release>());
  m.def("rms_backward_affine", &rms_norm_gradient_affine, "RMSNorm backward (CUDA)",
        py::call_guard<py::gil_scoped_release>());
  m.def("rms_backward", &rms_norm_gradient, "RMSNorm backward (CUDA)", py::call_guard<py::gil_scoped_release>());

  m.def("rms_forward_affine_mixed_dtypes", &rms_norm_affine_mixed_dtypes,
        "RMSNorm forward with mixed dtypes (CUDA) compatible with Megatron's implementation",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/layer_norm_cuda_kernel.cu
================================================
#include <cuda.h>
#include <cuda_runtime.h>

#include <optional>

#include "ATen/ATen.h"
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/DeviceUtils.cuh"
#include "static_switch.h"
#include "type_shim.h"

template <typename U>
__device__ void cuWelfordOnlineSum(const U curr, U& mu, U& sigma2, U& count) {
  count = count + U(1);
  U delta = curr - mu;
  U lmean = mu + delta / count;
  mu = lmean;
  U delta2 = curr - lmean;
  sigma2 = sigma2 + delta * delta2;
}

template <typename U>
__device__ void cuChanOnlineSum(const U muB, const U sigma2B, const U countB, U& mu, U& sigma2, U& count) {
  U delta = muB - mu;
  U nA = count;
  U nB = countB;
  count = count + countB;
  U nX = count;
  if (nX > U(0)) {
    nA = nA / nX;
    nB = nB / nX;
    mu = nA * mu + nB * muB;
    sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX;
  } else {
    mu = U(0);
    sigma2 = U(0);
  }
}

template <typename U>
__device__ void cuRMSOnlineSum(const U curr, U& sigma2) {
  sigma2 = sigma2 + curr * curr;
}

template <typename U>
__device__ void cuChanRMSOnlineSum(const U sigma2B, U& sigma2) {
  sigma2 = sigma2 + sigma2B;
}

template <typename T, typename U>
__device__ void cuWelfordMuSigma2(const T* __restrict__ vals, const int n1, const int n2, const int i1, U& mu,
                                  U& sigma2, U* buf, bool rms_only) {
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  U count = U(0);
  mu = U(0);
  sigma2 = U(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const T* lvals = vals + i1 * n2;
    int l = 4 * thrx;
    for (; l + 3 < n2; l += 4 * numx) {
      for (int k = 0; k < 4; ++k) {
        U curr = static_cast<U>(lvals[l + k]);
        if (!rms_only) {
          cuWelfordOnlineSum<U>(curr, mu, sigma2, count);
        } else {
          cuRMSOnlineSum<U>(curr, sigma2);
        }
      }
    }
    for (; l < n2; ++l) {
      U curr = static_cast<U>(lvals[l]);
      if (!rms_only) {
        cuWelfordOnlineSum<U>(curr, mu, sigma2, count);
      } else {
        cuRMSOnlineSum<U>(curr, sigma2);
      }
    }
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x + (1 << l)) & 31;
      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
      if (!rms_only) {
        U muB = WARP_SHFL(mu, srcLaneB);
        U countB = WARP_SHFL(count, srcLaneB);
        cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
      } else {
        cuChanRMSOnlineSum<U>(sigma2B, sigma2);
      }
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      U* ubuf = (U*)buf;
      U* ibuf = (U*)(ubuf + blockDim.y);
      for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
          const int wrt_y = threadIdx.y - offset;
          if (!rms_only) {
            ubuf[2 * wrt_y] = mu;
            ibuf[wrt_y] = count;
          }
          ubuf[2 * wrt_y + 1] = sigma2;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          U sigma2B = ubuf[2 * threadIdx.y + 1];
          if (!rms_only) {
            U muB = ubuf[2 * threadIdx.y];
            U countB = ibuf[threadIdx.y];
            cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
          } else {
            cuChanRMSOnlineSum<U>(sigma2B, sigma2);
          }
        }
        __syncthreads();
      }
      // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        if (!rms_only) {
          ubuf[0] = mu;
        }
        ubuf[1] = sigma2;
      }
      __syncthreads();
      if (!rms_only) {
        mu = ubuf[0];
      }
      sigma2 = ubuf[1] / U(n2);
      // don't care about final value of count, we know count == n2
    } else {
      if (!rms_only) {
        mu = WARP_SHFL(mu, 0);
      }
      sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
    }
  }
}

template <>
__device__ void cuWelfordMuSigma2(const at::Half* __restrict__ vals, const int n1, const int n2, const int i1,
                                  float& mu, float& sigma2, float* buf, bool rms_only) {
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  float count = 0.0f;
  mu = float(0);
  sigma2 = float(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const at::Half* lvals = vals + i1 * n2;
    int l = 8 * thrx;
    if ((((size_t)lvals) & 3) != 0) {
      // 16 bit alignment
      // first thread consumes first point
      if (thrx == 0) {
        float curr = static_cast<float>(lvals[0]);
        if (!rms_only) {
          cuWelfordOnlineSum(curr, mu, sigma2, count);
        } else {
          cuRMSOnlineSum(curr, sigma2);
        }
      }
      ++l;
    }
    // at this point, lvals[l] are 32 bit aligned for all threads.
    for (; l + 7 < n2; l += 8 * numx) {
      for (int k = 0; k < 8; k += 2) {
        float2 curr = __half22float2(*((__half2*)(lvals + l + k)));
        if (!rms_only) {
          cuWelfordOnlineSum(curr.x, mu, sigma2, count);
          cuWelfordOnlineSum(curr.y, mu, sigma2, count);
        } else {
          cuRMSOnlineSum(curr.x, sigma2);
          cuRMSOnlineSum(curr.y, sigma2);
        }
      }
    }
    for (; l < n2; ++l) {
      float curr = static_cast<float>(lvals[l]);
      if (!rms_only) {
        cuWelfordOnlineSum(curr, mu, sigma2, count);
      } else {
        cuRMSOnlineSum(curr, sigma2);
      }
    }
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x + (1 << l)) & 31;
      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
      if (!rms_only) {
        float muB = WARP_SHFL(mu, srcLaneB);
        float countB = WARP_SHFL(count, srcLaneB);
        cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
      } else {
        cuChanRMSOnlineSum(sigma2B, sigma2);
      }
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      float* ubuf = (float*)buf;
      float* ibuf = (float*)(ubuf + blockDim.y);
      for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2 * wrt_y + 1] = sigma2;
          if (!rms_only) {
            ubuf[2 * wrt_y] = mu;
            ibuf[wrt_y] = count;
          }
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          float sigma2B = ubuf[2 * threadIdx.y + 1];
          if (!rms_only) {
            float muB = ubuf[2 * threadIdx.y];
            float countB = ibuf[threadIdx.y];
            cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
          } else {
            cuChanRMSOnlineSum(sigma2B, sigma2);
          }
        }
        __syncthreads();
      }
      // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        if (!rms_only) {
          ubuf[0] = mu;
        }
        ubuf[1] = sigma2;
      }
      __syncthreads();
      if (!rms_only) {
        mu = ubuf[0];
      }
      sigma2 = ubuf[1] / float(n2);
      // don't care about final value of count, we know count == n2
    } else {
      if (!rms_only) {
        mu = WARP_SHFL(mu, 0);
      }
      sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
    }
  }
}

template <typename U>
U rsqrt(U v) {
  return U(1) / sqrt(v);
}
template <>
float rsqrt(float v) {
  return rsqrtf(v);
}
template <>
double rsqrt(double v) {
  return rsqrt(v);
}

namespace {
// This is the un-specialized struct.  Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
//  template <typename T>
//  struct SharedMemory
//  {
//      // Ensure that we won't compile any un-specialized types
//      __device__ T *getPointer()
//      {
//          extern __device__ void error(void);
//          error();
//          return nullptr;
//      }
//  };
// https://github.com/NVIDIA/apex/issues/246
template <typename T>
struct SharedMemory;

template <>
struct SharedMemory<float> {
  __device__ float* getPointer() {
    extern __shared__ float s_float[];
    return s_float;
  }
};

template <>
struct SharedMemory<double> {
  __device__ double* getPointer() {
    extern __shared__ double s_double[];
    return s_double;
  }
};
}  // namespace

template <typename T, typename U, typename V>
__device__ void cuApplyLayerNorm_(V* __restrict__ output_vals, U* __restrict__ mean, U* __restrict__ invvar,
                                  const T* __restrict__ vals, const int n1, const int n2, const U epsilon,
                                  const V* __restrict__ gamma, const V* __restrict__ beta, bool rms_only) {
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensors are contiguous
  //
  for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu, sigma2;
    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, rms_only);

    const T* lvals = vals + i1 * n2;
    V* ovals = output_vals + i1 * n2;
    U c_invvar = rsqrt(sigma2 + epsilon);
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != nullptr && (beta != nullptr || rms_only)) {
      for (int i = thrx; i < n2; i += numx) {
        U curr = static_cast<U>(lvals[i]);
        if (!rms_only) {
          ovals[i] = gamma[i] * static_cast<V>(c_invvar * (curr - mu)) + beta[i];
        } else {
          ovals[i] = gamma[i] * static_cast<V>(c_invvar * curr);
        }
      }
    } else {
      for (int i = thrx; i < n2; i += numx) {
        U curr = static_cast<U>(lvals[i]);
        if (!rms_only) {
          ovals[i] = static_cast<V>(c_invvar * (curr - mu));
        } else {
          ovals[i] = static_cast<V>(c_invvar * curr);
        }
      }
    }
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      if (!rms_only) {
        mean[i1] = mu;
      }
      invvar[i1] = c_invvar;
    }
    __syncthreads();
  }
}

template <typename T, typename U, typename V = T>
__global__ void cuApplyLayerNorm(V* __restrict__ output_vals, U* __restrict__ mean, U* __restrict__ invvar,
                                 const T* __restrict__ vals, const int n1, const int n2, const U epsilon,
                                 const V* __restrict__ gamma, const V* __restrict__ beta) {
  cuApplyLayerNorm_<T, U, V>(output_vals, mean, invvar, vals, n1, n2, epsilon, gamma, beta, false);
}

template <typename T, typename U, typename V = T>
__global__ void cuApplyRMSNorm(V* __restrict__ output_vals, U* __restrict__ invvar, const T* __restrict__ vals,
                               const int n1, const int n2, const U epsilon, const V* __restrict__ gamma) {
  cuApplyLayerNorm_<T, U, V>(output_vals, nullptr, invvar, vals, n1, n2, epsilon, gamma, nullptr, true);
}

template <typename V>
__device__ V clamp_by_magnitude(V curr_gamma, double eps) {
  const V kMinGamma = V(eps);
  if (curr_gamma >= 0) {
    if (curr_gamma < kMinGamma) {
      return kMinGamma;
    } else {
      return curr_gamma;
    }
  } else {
    if (curr_gamma > -kMinGamma) {
      return -kMinGamma;
    } else {
      return curr_gamma;
    }
  }
}

template <typename T, typename U, typename V, bool MemoryEfficient>
__device__ void cuLoadWriteStridedInputs(const int i1_block, const int thr_load_row_off, const int thr_load_col_off,
                                         const int i2_off, const int row_stride, U* warp_buf1, U* warp_buf2,
                                         const T* input_or_output, const V* dout, const int i1_end, const int n2,
                                         const U* __restrict__ mean, const U* __restrict__ invvar,
                                         const V* __restrict__ gamma, const V* __restrict__ beta, const double eps,
                                         bool rms_only) {
  int i1 = i1_block + thr_load_row_off;
  if (i1 < i1_end) {
    for (int k = 0; k < blockDim.y; ++k) {
      int i2 = i2_off + k;
      int load_idx = i1 * n2 + i2;
      int write_idx = thr_load_row_off * row_stride + thr_load_col_off + k;
      if (i2 < n2) {
        U c_h = static_cast<U>(input_or_output[load_idx]);
        U curr_dout = static_cast<U>(dout[load_idx]);
        if (!rms_only) {
          warp_buf1[write_idx] = curr_dout;
          if (MemoryEfficient) {
            U curr_beta = static_cast<U>(beta[i2]);
            warp_buf2[write_idx] = curr_dout * (c_h - curr_beta) / static_cast<U>(clamp_by_magnitude(gamma[i2], eps));
          } else {
            warp_buf2[write_idx] = curr_dout * (c_h - mean[i1]) * invvar[i1];
          }
        } else {
          if (MemoryEfficient) {
            warp_buf2[write_idx] = curr_dout * (c_h) / static_cast<U>(clamp_by_magnitude(gamma[i2], eps));
          } else {
            warp_buf2[write_idx] = curr_dout * (c_h)*invvar[i1];
          }
        }
      } else {
        if (!rms_only) {
          warp_buf1[write_idx] = U(0);
        }
        warp_buf2[write_idx] = U(0);
      }
    }
  } else {
    for (int k = 0; k < blockDim.y; ++k) {
      int write_idx = thr_load_row_off * row_stride + thr_load_col_off + k;
      if (!rms_only) {
        warp_buf1[write_idx] = U(0);
      }
      warp_buf2[write_idx] = U(0);
    }
  }
}

template <typename T, typename U, typename V, bool MemoryEfficient>
__device__ void cuLoadAddStridedInputs(const int i1_block, const int thr_load_row_off, const int thr_load_col_off,
                                       const int i2_off, const int row_stride, U* warp_buf1, U* warp_buf2,
                                       const T* input_or_output, const V* dout, const int i1_end, const int n2,
                                       const U* __restrict__ mean, const U* __restrict__ invvar,
                                       const V* __restrict__ gamma, const V* __restrict__ beta, const double eps,
                                       bool rms_only) {
  int i1 = i1_block + thr_load_row_off;
  if (i1 < i1_end) {
    for (int k = 0; k < blockDim.y; ++k) {
      int i2 = i2_off + k;
      int load_idx = i1 * n2 + i2;
      int write_idx = thr_load_row_off * row_stride + thr_load_col_off + k;
      if (i2 < n2) {
        U c_h = static_cast<U>(input_or_output[load_idx]);
        U curr_dout = static_cast<U>(dout[load_idx]);
        if (!rms_only) {
          U curr_beta = static_cast<U>(beta[i2]);
          warp_buf1[write_idx] += curr_dout;
          if (MemoryEfficient) {
            warp_buf2[write_idx] += curr_dout * (c_h - curr_beta) / static_cast<U>(clamp_by_magnitude(gamma[i2], eps));
          } else {
            warp_buf2[write_idx] += curr_dout * (c_h - mean[i1]) * invvar[i1];
          }
        } else {
          if (MemoryEfficient) {
            warp_buf2[write_idx] += curr_dout * (c_h) / static_cast<U>(clamp_by_magnitude(gamma[i2], eps));
          } else {
            warp_buf2[write_idx] += curr_dout * (c_h)*invvar[i1];
          }
        }
      }
    }
  }
}

template <typename T, typename U, typename V, bool MemoryEfficient>
__global__ void cuComputePartGradGammaBeta(const V* __restrict__ dout, const T* __restrict__ input_or_output,
                                           const int n1, const int n2, const U* __restrict__ mean,
                                           const U* __restrict__ invvar, U epsilon, const V* __restrict__ gamma,
                                           const V* __restrict__ beta, U* part_grad_gamma, U* part_grad_beta,
                                           const double eps, bool rms_only) {
  const int numsegs_n1 = (n1 + blockDim.y * blockDim.y - 1) / (blockDim.y * blockDim.y);
  const int segs_per_block = (numsegs_n1 + gridDim.y - 1) / gridDim.y;
  const int i1_beg = blockIdx.y * segs_per_block * blockDim.y * blockDim.y;
  const int i1_beg_plus_one = (blockIdx.y + 1) * segs_per_block * blockDim.y * blockDim.y;
  const int i1_end = i1_beg_plus_one < n1 ? i1_beg_plus_one : n1;
  const int row_stride = blockDim.x + 1;
  const int thr_load_col_off = (threadIdx.x * blockDim.y) & (blockDim.x - 1);
  const int thr_load_row_off = (threadIdx.x * blockDim.y) / blockDim.x + threadIdx.y * blockDim.y;
  const int i2_off = blockIdx.x * blockDim.x + thr_load_col_off;
  SharedMemory<U> shared;
  U* buf = shared.getPointer();  // buf has at least blockDim.x * blockDim.y * blockDim.y + (blockDim.y -
                                 // 1)*(blockDim.x/blockDim.y) elements
  U* warp_buf1 = (U*)buf;
  U* warp_buf2 = warp_buf1 + blockDim.y * blockDim.y * row_stride;
  // compute partial sums from strided inputs
  // do this to increase number of loads in flight
  cuLoadWriteStridedInputs<T, U, V, MemoryEfficient>(i1_beg, thr_load_row_off, thr_load_col_off, i2_off, row_stride,
                                                     warp_buf1, warp_buf2, input_or_output, dout, i1_end, n2, mean,
                                                     invvar, gamma, beta, eps, rms_only);
  for (int i1_block = i1_beg + blockDim.y * blockDim.y; i1_block < i1_end; i1_block += blockDim.y * blockDim.y) {
    cuLoadAddStridedInputs<T, U, V, MemoryEfficient>(i1_block, thr_load_row_off, thr_load_col_off, i2_off, row_stride,
                                                     warp_buf1, warp_buf2, input_or_output, dout, i1_end, n2, mean,
                                                     invvar, gamma, beta, eps, rms_only);
  }
  __syncthreads();
  // inter-warp reductions
  // sum within each warp
  U acc1 = U(0);
  U acc2 = U(0);
  for (int k = 0; k < blockDim.y; ++k) {
    int row1 = threadIdx.y + k * blockDim.y;
    int idx1 = row1 * row_stride + threadIdx.x;
    if (!rms_only) {
      acc1 += warp_buf1[idx1];
    }
    acc2 += warp_buf2[idx1];
  }
  if (!rms_only) {
    warp_buf1[threadIdx.y * row_stride + threadIdx.x] = acc1;
  }
  warp_buf2[threadIdx.y * row_stride + threadIdx.x] = acc2;
  __syncthreads();
  // sum all warps
  for (int offset = blockDim.y / 2; offset > 1; offset /= 2) {
    if (threadIdx.y < offset) {
      int row1 = threadIdx.y;
      int row2 = threadIdx.y + offset;
      int idx1 = row1 * row_stride + threadIdx.x;
      int idx2 = row2 * row_stride + threadIdx.x;
      if (!rms_only) {
        warp_buf1[idx1] += warp_buf1[idx2];
      }
      warp_buf2[idx1] += warp_buf2[idx2];
    }
    __syncthreads();
  }
  int i2 = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadIdx.y == 0 && i2 < n2) {
    int row1 = threadIdx.y;
    int row2 = threadIdx.y + 1;
    int idx1 = row1 * row_stride + threadIdx.x;
    int idx2 = row2 * row_stride + threadIdx.x;
    if (!rms_only) {
      part_grad_beta[blockIdx.y * n2 + i2] = warp_buf1[idx1] + warp_buf1[idx2];
    }
    part_grad_gamma[blockIdx.y * n2 + i2] = warp_buf2[idx1] + warp_buf2[idx2];
  }
}

template <typename U, typename V>
__global__ void cuComputeGradGammaBeta(const U* part_grad_gamma, const U* part_grad_beta, const int part_size,
                                       const int n1, const int n2, V* grad_gamma, V* grad_beta, bool rms_only) {
  // sum partial gradients for gamma and beta
  SharedMemory<U> shared;
  U* buf = shared.getPointer();
  int i2 = blockIdx.x * blockDim.x + threadIdx.x;
  if (i2 < n2) {
    // each warp does sequential reductions until reduced part_size is num_warps
    int num_warp_reductions = part_size / blockDim.y;
    U sum_gamma = U(0);
    U sum_beta = U(0);
    const U* part_grad_gamma_ptr = part_grad_gamma + threadIdx.y * num_warp_reductions * n2 + i2;
    const U* part_grad_beta_ptr = part_grad_beta + threadIdx.y * num_warp_reductions * n2 + i2;
    for (int warp_offset = 0; warp_offset < num_warp_reductions; ++warp_offset) {
      sum_gamma += part_grad_gamma_ptr[warp_offset * n2];
      if (!rms_only) {
        sum_beta += part_grad_beta_ptr[warp_offset * n2];
      }
    }
    // inter-warp reductions
    const int nbsize3 = blockDim.x * blockDim.y / 2;
    for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) {
      // top half write to shared memory
      if (threadIdx.y >= offset && threadIdx.y < 2 * offset) {
        const int write_idx = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
        buf[write_idx] = sum_gamma;
        if (!rms_only) {
          buf[write_idx + nbsize3] = sum_beta;
        }
      }
      __syncthreads();
      // bottom half sums
      if (threadIdx.y < offset) {
        const int read_idx = threadIdx.y * blockDim.x + threadIdx.x;
        sum_gamma += buf[read_idx];
        if (!rms_only) {
          sum_beta += buf[read_idx + nbsize3];
        }
      }
      __syncthreads();
    }
    // write out fully summed gradients
    if (threadIdx.y == 0) {
      grad_gamma[i2] = sum_gamma;
      if (!rms_only) {
        grad_beta[i2] = sum_beta;
      }
    }
  }
}

template <typename T, typename U, typename V, bool MemoryEfficient>
__global__ void cuComputeGradInput(const V* __restrict__ dout, const T* __restrict__ input_or_output, const int n1,
                                   const int n2, const U* __restrict__ mean, const U* __restrict__ invvar, U epsilon,
                                   const V* gamma, const V* beta, T* grad_input, const double eps, bool rms_only) {
  for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
    U sum_loss1 = U(0);
    U sum_loss2 = U(0);
    const T* k_h = input_or_output + i1 * n2;
    const V* k_dout = dout + i1 * n2;
    const U c_invvar = invvar[i1];
    const U c_mean = !MemoryEfficient ? mean[i1] : 0.;
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != nullptr) {
      int l = 4 * thrx;
      for (; l + 3 < n2; l += 4 * numx) {
        for (int k = 0; k < 4; ++k) {
          const U c_h = static_cast<U>(k_h[l + k]);
          const U c_loss = static_cast<U>(k_dout[l + k]);
          if (!rms_only) {
            sum_loss1 += c_loss * gamma[l + k];
            if (MemoryEfficient) {
              sum_loss2 += c_loss * (c_h - beta[l + k]);
            } else {
              sum_loss2 += c_loss * gamma[l + k] * (c_h - c_mean) * c_invvar;
            }
          } else {
            if (MemoryEfficient) {
              sum_loss2 += c_loss * c_h;
            } else {
              sum_loss2 += c_loss * gamma[l + k] * (c_h)*c_invvar;
            }
          }
        }
      }
      for (; l < n2; ++l) {
        const U c_h = static_cast<U>(k_h[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        if (!rms_only) {
          sum_loss1 += c_loss * gamma[l];
          if (MemoryEfficient) {
            sum_loss2 += c_loss * (c_h - beta[l]);
          } else {
            sum_loss2 += c_loss * gamma[l] * (c_h - c_mean) * c_invvar;
          }
        } else {
          if (MemoryEfficient) {
            sum_loss2 += c_loss * c_h;
          } else {
            sum_loss2 += c_loss * gamma[l] * (c_h)*c_invvar;
          }
        }
      }
    } else {
      int l = 4 * thrx;
      for (; l + 3 < n2; l += 4 * numx) {
        for (int k = 0; k < 4; ++k) {
          const U c_h = static_cast<U>(k_h[l + k]);
          const U c_loss = static_cast<U>(k_dout[l + k]);
          if (!rms_only) {
            sum_loss1 += c_loss;
            if (MemoryEfficient) {
              sum_loss2 += c_loss * c_h;
            } else {
              sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
            }
          } else {
            if (MemoryEfficient) {
              sum_loss2 += c_loss * c_h;
            } else {
              sum_loss2 += c_loss * (c_h)*c_invvar;
            }
          }
        }
      }
      for (; l < n2; ++l) {
        const U c_h = static_cast<U>(k_h[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        if (!rms_only) {
          sum_loss1 += c_loss;
          if (MemoryEfficient) {
            sum_loss2 += c_loss * c_h;
          } else {
            sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
          }
        } else {
          if (MemoryEfficient) {
            sum_loss2 += c_loss * c_h;
          } else {
            sum_loss2 += c_loss * (c_h)*c_invvar;
          }
        }
      }
    }
    // intra-warp reductions
    for (int mask = blockDim.x / 2; mask > 0; mask /= 2) {
      if (!rms_only) {
        sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask);
      }
      sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask);
    }
    // inter-warp reductions
    if (blockDim.y > 1) {
      SharedMemory<U> shared;
      U* buf = shared.getPointer();
      for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.y >= offset && threadIdx.y < 2 * offset) {
          const int wrt_i = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
          if (!rms_only) {
            buf[2 * wrt_i] = sum_loss1;
          }
          buf[2 * wrt_i + 1] = sum_loss2;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.y < offset) {
          const int read_i = threadIdx.y * blockDim.x + threadIdx.x;
          if (!rms_only) {
            sum_loss1 += buf[2 * read_i];
          }
          sum_loss2 += buf[2 * read_i + 1];
        }
        __syncthreads();
      }
      if (threadIdx.y == 0) {
        if (!rms_only) {
          buf[2 * threadIdx.x] = sum_loss1;
        }
        buf[2 * threadIdx.x + 1] = sum_loss2;
      }
      __syncthreads();
      if (threadIdx.y != 0) {
        if (!rms_only) {
          sum_loss1 = buf[2 * threadIdx.x];
        }
        sum_loss2 = buf[2 * threadIdx.x + 1];
      }
    }
    // all threads now have the two sums over l
    U fH = (U)n2;
    U term1 = (U(1) / fH) * c_invvar;
    T* k_grad_input = grad_input + i1 * n2;
    if (gamma != nullptr) {
      for (int l = thrx; l < n2; l += numx) {
        const U c_h = static_cast<U>(k_h[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        const U k_gamma = static_cast<U>(clamp_by_magnitude(gamma[l], eps));
        U f_grad_input = fH * c_loss * k_gamma;
        if (!rms_only) {
          const U k_beta = beta[l];
          f_grad_input -= sum_loss1;
          if (MemoryEfficient) {
            f_grad_input -= (c_h - k_beta) / k_gamma * sum_loss2;
          } else {
            f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
          }
        } else {
          if (MemoryEfficient) {
            f_grad_input -= c_h / k_gamma * sum_loss2;
          } else {
            f_grad_input -= c_h * c_invvar * sum_loss2;
          }
        }
        f_grad_input *= term1;
        k_grad_input[l] = static_cast<T>(f_grad_input);
      }
    } else {
      for (int l = thrx; l < n2; l += numx) {
        const U c_h = static_cast<U>(k_h[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        U f_grad_input = fH * c_loss;
        if (!rms_only) {
          f_grad_input -= sum_loss1;
          if (MemoryEfficient) {
            f_grad_input -= c_h * sum_loss2;
          } else {
            f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
          }
        } else {
          if (MemoryEfficient) {
            f_grad_input -= c_h * sum_loss2;
          } else {
            f_grad_input -= c_h * c_invvar * sum_loss2;
          }
        }
        f_grad_input *= term1;
        k_grad_input[l] = static_cast<T>(f_grad_input);
      }
    }
    // prevent race where buf is written again before reads are done
    __syncthreads();
  }
}

template <typename T, typename U, typename V = T>
void HostApplyLayerNorm(V* output, U* mean, U* invvar, const T* input, int n1, int n2, double epsilon, const V* gamma,
                        const V* beta) {
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const dim3 threads(32, 4, 1);
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
  int nshared = threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0;
  cuApplyLayerNorm<<<blocks, threads, nshared, stream>>>(output, mean, invvar, input, n1, n2, U(epsilon), gamma, beta);
}

template <typename T, typename U, typename V = T>
void HostApplyRMSNorm(V* output, U* invvar, const T* input, int n1, int n2, double epsilon, const V* gamma) {
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const dim3 threads(32, 4, 1);
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
  int nshared = threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0;
  cuApplyRMSNorm<<<blocks, threads, nshared, stream>>>(output, invvar, input, n1, n2, U(epsilon), gamma);
}

void cuda_layer_norm(at::Tensor& output, at::Tensor& mean, at::Tensor& invvar, const at::Tensor& input, int n1, int n2,
                     at::IntArrayRef normalized_shape, const std::optional<at::Tensor>& gamma,
                     const std::optional<at::Tensor>& beta, double epsilon) {
  using namespace at;
  DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(
      input.scalar_type(), output.scalar_type(), "layer_norm_cuda_kernel",
      using accscalar_t = at::acc_type<scalar_t_in, true>;
      HostApplyLayerNorm<scalar_t_in, accscalar_t, scalar_t_out>(
          output.data_ptr<scalar_t_out>(), mean.data_ptr<accscalar_t>(), invvar.data_ptr<accscalar_t>(),
          input.data_ptr<scalar_t_in>(), n1, n2, epsilon, gamma.has_value() ? gamma->data_ptr<scalar_t_out>() : nullptr,
          beta.has_value() ? beta->data_ptr<scalar_t_out>() : nullptr);)
}

void cuda_rms_norm(at::Tensor& output, at::Tensor& invvar, const at::Tensor& input, int n1, int n2,
                   at::IntArrayRef normalized_shape, const std::optional<at::Tensor>& gamma, double epsilon) {
  using namespace at;
  DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(
      input.scalar_type(), output.scalar_type(), "rms_norm_cuda_kernel",
      using accscalar_t = at::acc_type<scalar_t_in, true>;
      HostApplyRMSNorm<scalar_t_in, accscalar_t, scalar_t_out>(
          output.data_ptr<scalar_t_out>(), invvar.data_ptr<accscalar_t>(), input.data_ptr<scalar_t_in>(), n1, n2,
          epsilon, gamma.has_value() ? gamma->data_ptr<scalar_t_out>() : nullptr);)
}

template <typename T, typename U = float, typename V = T>
void HostLayerNormGradient(const V* dout, const U* mean, const U* invvar, at::Tensor& input_or_output, int n1, int n2,
                           const V* gamma, const V* beta, double epsilon, T* grad_input, V* grad_gamma, V* grad_beta,
                           bool memory_efficient) {
  auto stream = at::cuda::getCurrentCUDAStream().stream();

  if (gamma != nullptr && beta != nullptr) {
    // compute grad_gamma(j) and grad_beta(j)
    const int part_size = 16;
    const dim3 threads2(32, 4, 1);
    const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1);
    const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
    const int nshared2_b = threads2.x * threads2.y * sizeof(U);
    const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b;
    // note (mkozuki): I can hard code part_grad_gamma's dtype as float given that
    // the `cuda_layer_norm_gradient` doesn't support double.
    const auto part_grad_dtype = (input_or_output.scalar_type() == at::ScalarType::Half ||
                                  input_or_output.scalar_type() == at::ScalarType::BFloat16)
                                     ? at::ScalarType::Float
                                     : input_or_output.scalar_type();
    at::Tensor part_grad_gamma = at::empty({part_size, n2}, input_or_output.options().dtype(part_grad_dtype));
    at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
    BOOL_SWITCH(memory_efficient, MemoryEfficient, [&] {
      auto kernel = &cuComputePartGradGammaBeta<T, U, V, MemoryEfficient>;
      kernel<<<blocks2, threads2, nshared2, stream>>>(dout, input_or_output.data_ptr<T>(), n1, n2, mean, invvar,
                                                      U(epsilon), gamma, beta, part_grad_gamma.data_ptr<U>(),
                                                      part_grad_beta.data_ptr<U>(), epsilon, false);
    });

    const dim3 threads3(32, 8, 1);
    const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1);
    const int nshared3 = threads3.x * threads3.y * sizeof(U);
    cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
        part_grad_gamma.data_ptr<U>(), part_grad_beta.data_ptr<U>(), part_size, n1, n2, grad_gamma, grad_beta, false);
  }

  // compute grad_input
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
  const dim3 threads1(32, 4, 1);
  int nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0;
  BOOL_SWITCH(memory_efficient, MemoryEfficient, [&] {
    auto kernel = cuComputeGradInput<T, U, V, MemoryEfficient>;
    kernel<<<blocks1, threads1, nshared, stream>>>(dout, input_or_output.data_ptr<T>(), n1, n2, mean, invvar,
                                                   U(epsilon), gamma, beta, grad_input, epsilon, false);
  });
}

template <typename T, typename U = float, typename V = T>
void HostRMSNormGradient(const V* dout, const U* invvar, at::Tensor& input_or_output, int n1, int n2, const V* gamma,
                         double epsilon, T* grad_input, V* grad_gamma, bool memory_efficient) {
  auto stream = at::cuda::getCurrentCUDAStream().stream();

  if (gamma != nullptr) {
    const int part_size = 16;
    const dim3 threads2(32, 4, 1);
    const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1);
    const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
    const int nshared2_b = threads2.x * threads2.y * sizeof(U);
    const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b;
    // note (mkozuki): I can hard code part_grad_gamma's dtype as float given that
    // the `cuda_layer_norm_gradient` doesn't support double.
    const auto part_grad_dtype = (input_or_output.scalar_type() == at::ScalarType::Half ||
                                  input_or_output.scalar_type() == at::ScalarType::BFloat16)
                                     ? at::ScalarType::Float
                                     : input_or_output.scalar_type();
    at::Tensor part_grad_gamma = at::empty({part_size, n2}, input_or_output.options().dtype(part_grad_dtype));
    BOOL_SWITCH(memory_efficient, MemoryEfficient, [&] {
      auto kernel = &cuComputePartGradGammaBeta<T, U, V, MemoryEfficient>;
      kernel<<<blocks2, threads2, nshared2, stream>>>(dout, input_or_output.data_ptr<T>(), n1, n2, invvar, /* unused */
                                                      invvar, U(epsilon), gamma, gamma,                    /* unused */
                                                      part_grad_gamma.data_ptr<U>(),
                                                      part_grad_gamma.data_ptr<U>(), /* unused */
                                                      epsilon, true);
    });

    const dim3 threads3(32, 8, 1);
    const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1);
    const int nshared3 = threads3.x * threads3.y * sizeof(U);
    cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
        part_grad_gamma.data_ptr<U>(), part_grad_gamma.data_ptr<U>(), /* unused */
        part_size, n1, n2, grad_gamma, grad_gamma,                    /* unused */
        true);
  }

  // compute grad_input
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
  const dim3 threads1(32, 4, 1);
  int nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0;
  BOOL_SWITCH(memory_efficient, MemoryEfficient, [&] {
    auto kernel = cuComputeGradInput<T, U, V, MemoryEfficient>;
    kernel<<<blocks1, threads1, nshared, stream>>>(dout, input_or_output.data_ptr<T>(), n1, n2, invvar, /* unused */
                                                   invvar, U(epsilon), gamma, gamma,                    /* unused */
                                                   grad_input, epsilon, true);
  });
}

void cuda_layer_norm_gradient(at::Tensor& dout, const std::optional<at::Tensor>& mean, at::Tensor& invvar,
                              at::Tensor& input_or_output, int n1, int n2, at::IntArrayRef normalized_shape,
                              const std::optional<at::Tensor>& gamma, const std::optional<at::Tensor>& beta,
                              double epsilon, at::Tensor& grad_input, const std::optional<at::Tensor>& grad_gamma,
                              const std::optional<at::Tensor>& grad_beta, bool memory_efficient) {
  using namespace at;
  // we can do away with `accscalar_t` as there're only three dtypes: fp32, fp16, bf16
  DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(
      input_or_output.scalar_type(), gamma.has_value() ? gamma->scalar_type() : input_or_output.scalar_type(),
      "cuComputeGradInput", using accscalar_t = at::acc_type<scalar_t_in, true>;
      HostLayerNormGradient(dout.data_ptr<scalar_t_out>(), mean.has_value() ? mean->data_ptr<accscalar_t>() : nullptr,
                            invvar.data_ptr<accscalar_t>(), input_or_output, n1, n2,
                            // TMJ pass nullptr argument for gamma, beta, grad_gamma and grad_beta
                            // if gamma Tensor is nullptr on input.
                            gamma.has_value() ? gamma->data_ptr<scalar_t_out>() : nullptr,
                            gamma.has_value() ? beta->data_ptr<scalar_t_out>() : nullptr, epsilon,
                            grad_input.data_ptr<scalar_t_in>(),
                            gamma.has_value() ? grad_gamma->data_ptr<scalar_t_out>() : nullptr,
                            gamma.has_value() ? grad_beta->data_ptr<scalar_t_out>() : nullptr, memory_efficient);)
}

void cuda_rms_norm_gradient(at::Tensor& dout, at::Tensor& invvar, at::Tensor& input_or_output, int n1, int n2,
                            at::IntArrayRef normalized_shape, const std::optional<at::Tensor>& gamma, double epsilon,
                            at::Tensor& grad_input, const std::optional<at::Tensor>& grad_gamma,
                            bool memory_efficient) {
  using namespace at;
  // we can do away with `accscalar_t` as there're only three dtypes: fp32, fp16, bf16
  // DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(
  DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(
      input_or_output.scalar_type(), gamma.has_value() ? gamma->scalar_type() : input_or_output.scalar_type(),
      "cuComputeGradInputRMS", using accscalar_t = at::acc_type<scalar_t_in, true>;
      HostRMSNormGradient(dout.data_ptr<scalar_t_out>(), invvar.data_ptr<accscalar_t>(), input_or_output, n1, n2,
                          // TMJ pass nullptr argument for gamma, beta, grad_gamma and grad_beta
                          // if gamma Tensor is nullptr on input.
                          gamma.has_value() ? gamma->data_ptr<scalar_t_out>() : nullptr, epsilon,
                          grad_input.data_ptr<scalar_t_in>(),
                          gamma.has_value() ? grad_gamma->data_ptr<scalar_t_out>() : nullptr, memory_efficient);)
}


================================================
FILE: csrc/megatron/fused_rotary_positional_embedding.cpp
================================================
/* coding=utf-8
 * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <torch/extension.h>

namespace fused_rope {

torch::Tensor fwd_cuda(const torch::Tensor& input, const torch::Tensor& freqs, const bool transpose_output);

torch::Tensor bwd_cuda(const torch::Tensor& output_grads, const torch::Tensor& freqs, const bool transpose_output);

torch::Tensor fwd_cached_cuda(const torch::Tensor& input, const torch::Tensor& cos, const torch::Tensor& sin,
                              const bool transpose_output);

torch::Tensor bwd_cached_cuda(const torch::Tensor& output_grads, const torch::Tensor& cos, const torch::Tensor& sin,
                              const bool transpose_output);

torch::Tensor fwd_thd_cuda(const torch::Tensor& input, const torch::Tensor& cu_seqlens, const torch::Tensor& freqs);

torch::Tensor bwd_thd_cuda(const torch::Tensor& output_grads, const torch::Tensor& cu_seqlens,
                           const torch::Tensor& freqs);

torch::Tensor fwd_2d_cuda(const torch::Tensor& input, const torch::Tensor& cos_h, const torch::Tensor& sin_h,
                          const torch::Tensor& cos_w, const torch::Tensor& sin_w);

torch::Tensor bwd_2d_cuda(const torch::Tensor& output_grads, const torch::Tensor& cos_h, const torch::Tensor& sin_h,
                          const torch::Tensor& cos_w, const torch::Tensor& sin_w);

torch::Tensor fwd(const at::Tensor& input, const at::Tensor& freqs, const bool transpose_output) {
  TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(freqs.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(input.size(0) == freqs.size(0), "expected input and freqs tensor have the same sequence length");
  TORCH_CHECK(freqs.size(1) == 1 && freqs.size(2) == 1,
              "expected the second and third dims of the freqs tensor equal 1");
  TORCH_CHECK(input.size(3) >= freqs.size(3),
              "expected the last dim of the input tensor equals or is "
              "greater than the freqs tensor");
  TORCH_CHECK(freqs.scalar_type() == at::ScalarType::Float, "Dtype of the freqs tensor must be float");

  return fwd_cuda(input, freqs, transpose_output);
}

torch::Tensor bwd(const torch::Tensor& output_grads, const at::Tensor& freqs, const bool transpose_output) {
  TORCH_CHECK(output_grads.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(freqs.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(output_grads.size(0) == freqs.size(0),
              "expected output_grads and freqs tensor have the same sequence length");
  TORCH_CHECK(freqs.size(1) == 1 && freqs.size(2) == 1,
              "expected the second and third dims of the freqs tensor equal 1");
  TORCH_CHECK(output_grads.size(3) >= freqs.size(3),
              "expected the last dim of the output_grads tensor equals or is "
              "greater than the freqs tensor");
  TORCH_CHECK(freqs.scalar_type() == at::ScalarType::Float, "Dtype of the freqs tensor must be float");

  return bwd_cuda(output_grads, freqs, transpose_output);
}

torch::Tensor fwd_cached(const at::Tensor& input, const at::Tensor& cos, const at::Tensor& sin,
                         const bool transpose_output) {
  TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(cos.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(sin.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(input.size(0) == cos.size(0), "expected input and cos tensor have the same sequence length");
  TORCH_CHECK(input.size(0) == sin.size(0), "expected input and sin tensor have the same sequence length");
  TORCH_CHECK(cos.size(1) == 1 && cos.size(2) == 1, "expected the second and third dims of the cos tensor equal 1");
  TORCH_CHECK(sin.size(1) == 1 && sin.size(2) == 1, "expected the second and third dims of the sin tensor equal 1");
  TORCH_CHECK(cos.size(3) == sin.size(3), "expected cos and sin tensor have the same last dim");
  TORCH_CHECK(input.size(3) >= cos.size(3),
              "expected the last dim of the input tensor equals or is "
              "greater than the cos tensor");
  TORCH_CHECK(cos.scalar_type() == sin.scalar_type(), "expected cos and sin tensor have the same dtype");

  return fwd_cached_cuda(input, cos, sin, transpose_output);
}

torch::Tensor bwd_cached(const torch::Tensor& output_grads, const at::Tensor& cos, const at::Tensor& sin,
                         const bool transpose_output) {
  TORCH_CHECK(output_grads.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(cos.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(sin.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(output_grads.size(0) == cos.size(0),
              "expected output_grads and cos tensor have the same sequence length");
  TORCH_CHECK(output_grads.size(0) == sin.size(0),
              "expected output_grads and sin tensor have the same sequence length");
  TORCH_CHECK(cos.size(1) == 1 && cos.size(2) == 1, "expected the second and third dims of the cos tensor equal 1");
  TORCH_CHECK(sin.size(1) == 1 && sin.size(2) == 1, "expected the second and third dims of the sin tensor equal 1");
  TORCH_CHECK(cos.size(3) == sin.size(3), "expected cos and sin tensor have the same last dim");
  TORCH_CHECK(output_grads.size(3) >= cos.size(3),
              "expected the last dim of the output_grads tensor equals or is "
              "greater than the cos tensor");
  TORCH_CHECK(cos.scalar_type() == sin.scalar_type(), "expected cos and sin tensor have the same dtype");

  return bwd_cached_cuda(output_grads, cos, sin, transpose_output);
}

torch::Tensor fwd_thd(const torch::Tensor& input, const torch::Tensor& cu_seqlens, const torch::Tensor& freqs) {
  TORCH_CHECK(input.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(cu_seqlens.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(freqs.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(freqs.size(1) == 1 && freqs.size(2) == 1,
              "expected the second and third dims of the freqs tensor equal 1");
  TORCH_CHECK(input.size(2) >= freqs.size(3),
              "expected the last dim of the input tensor equals or is "
              "greater than the freqs tensor");
  TORCH_CHECK(freqs.scalar_type() == at::ScalarType::Float, "Dtype of the freqs tensor must be float");

  return fwd_thd_cuda(input, cu_seqlens, freqs);
}

torch::Tensor bwd_thd(const torch::Tensor& output_grads, const torch::Tensor& cu_seqlens, const torch::Tensor& freqs) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(cu_seqlens.dim() == 1, "expected 1D tensor");
  TORCH_CHECK(freqs.dim() == 4, "expected 4D tensor");
  TORCH_CHECK(freqs.size(1) == 1 && freqs.size(2) == 1,
              "expected the second and third dims of the freqs tensor equal 1");
  TORCH_CHECK(output_grads.size(2) >= freqs.size(3),
              "expected the last dim of the output_grads tensor equals or is "
              "greater than the freqs tensor");
  TORCH_CHECK(freqs.scalar_type() == at::ScalarType::Float, "Dtype of the freqs tensor must be float");

  return bwd_thd_cuda(output_grads, cu_seqlens, freqs);
}

torch::Tensor fwd_2d(const torch::Tensor& input, const torch::Tensor& cos_h, const torch::Tensor& sin_h,
                     const torch::Tensor& cos_w, const torch::Tensor& sin_w) {
  TORCH_CHECK(input.dim() == 5, "expected input to be 5D tensor");
  TORCH_CHECK(cos_h.dim() == 4, "expected cos_h to be 4D tensor");
  TORCH_CHECK(sin_h.dim() == 4, "expected sin_h to be 4D tensor");
  TORCH_CHECK(cos_w.dim() == 4, "expected cos_w to be 4D tensor");
  TORCH_CHECK(sin_w.dim() == 4, "expected sin_w to be 4D tensor");
  TORCH_CHECK(cos_h.size(2) == 1, "expected third dim of cos_h/sin_h equals 1");
  TORCH_CHECK(input.size(1) <= cos_h.size(1), "expected input's height <= cos_h/sin_h's");
  TORCH_CHECK(input.size(4) / 2 == cos_h.size(3), "expected cos_h/sin_h's head dim equals input's head dim / 2");
  TORCH_CHECK(cos_w.size(2) == 1, "expected third dim of cos_w/sin_w equals 1");
  TORCH_CHECK(input.size(2) <= cos_w.size(1), "expected input's width <= cos_w/sin_w's");
  TORCH_CHECK(input.size(4) / 2 == cos_w.size(3), "expected cos_w/sin_w's head dim equals input's head dim / 2");

  return fwd_2d_cuda(input, cos_h, sin_h, cos_w, sin_w);
}

torch::Tensor bwd_2d(const torch::Tensor& output_grads, const torch::Tensor& cos_h, const torch::Tensor& sin_h,
                     const torch::Tensor& cos_w, const torch::Tensor& sin_w) {
  TORCH_CHECK(output_grads.dim() == 5, "expected output_grads to be 5D tensor");
  TORCH_CHECK(cos_h.dim() == 4, "expected cos_h to be 4D tensor");
  TORCH_CHECK(sin_h.dim() == 4, "expected sin_h to be 4D tensor");
  TORCH_CHECK(cos_w.dim() == 4, "expected cos_w to be 4D tensor");
  TORCH_CHECK(sin_w.dim() == 4, "expected sin_w to be 4D tensor");
  TORCH_CHECK(cos_h.size(2) == 1, "expected third dim of cos_h/sin_h equals 1");
  TORCH_CHECK(output_grads.size(1) <= cos_h.size(1), "expected output_grads' height <= cos_h/sin_h's");
  TORCH_CHECK(output_grads.size(4) / 2 == cos_h.size(3),
              "expected cos_h/sin_h's head dim equals output_grads' head dim / 2");
  TORCH_CHECK(cos_w.size(2) == 1, "expected third dim of cos_w/sin_w equals 1");
  TORCH_CHECK(output_grads.size(2) <= cos_w.size(1), "expected output_grads' width <= cos_w/sin_w's");
  TORCH_CHECK(output_grads.size(4) / 2 == cos_w.size(3),
              "expected cos_w/sin_w's head dim equals output_grads' head dim / 2");

  return bwd_2d_cuda(output_grads, cos_h, sin_h, cos_w, sin_w);
}

}  // end namespace fused_rope

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &fused_rope::fwd, "Fused Rotary Positional Embedding -- Forward.",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward", &fused_rope::bwd, "Fused Rotary Positional Embedding -- Backward.",
        py::call_guard<py::gil_scoped_release>());
  // cache sin/cos
  m.def("forward_cached", &fused_rope::fwd_cached, "Fused Rotary Positional Embedding Cached -- Forward.",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_cached", &fused_rope::bwd_cached, "Fused Rotary Positional Embedding Cached -- Backward.",
        py::call_guard<py::gil_scoped_release>());
  // thd
  m.def("forward_thd", &fused_rope::fwd_thd, "Fused Rotary Positional Embedding for thd layout -- Forward.",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_thd", &fused_rope::bwd_thd, "Fused Rotary Positional Embedding for thd layout -- Backward.",
        py::call_guard<py::gil_scoped_release>());
  // 2d
  m.def("forward_2d", &fused_rope::fwd_2d, "2D Fused Rotary Positional Embedding -- Forward.",
        py::call_guard<py::gil_scoped_release>());
  m.def("backward_2d", &fused_rope::bwd_2d, "2D Fused Rotary Positional Embedding -- Backward.",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/megatron/fused_rotary_positional_embedding.h
================================================
/* coding=utf-8
 * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/macros/Macros.h>
#include <cuda_runtime.h>
#include <torch/extension.h>

namespace {

template <typename scalar_t>
__device__ void fused_rope_block_forward(const scalar_t* src, const float* freqs, scalar_t* dst, const int offset_block,
                                         const int offset_block_dst, const int h, const int d, const int d2,
                                         const int stride_h, const int stride_d, const int o_stride_h,
                                         const int o_stride_d) {
  int s_id = blockIdx.x;
#pragma unroll
  for (int d_id = threadIdx.x; d_id < d2; d_id += blockDim.x) {
    float v_cos, v_sin;
    sincosf(freqs[s_id * d2 + d_id], &v_sin, &v_cos);
#pragma unroll
    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
      int offset_src = offset_block + h_id * stride_h + d_id * stride_d;
      int offset_dst = offset_block_dst + h_id * o_stride_h + d_id * o_stride_d;
      scalar_t v_src = src[offset_src];
      scalar_t v_src_rotate =
          (d_id + d2 / 2 < d2) ? -src[offset_src + (d2 / 2) * stride_d] : src[offset_src + (d2 / 2 - d2) * stride_d];
      dst[offset_dst] = v_src * (scalar_t)v_cos + v_src_rotate * (scalar_t)v_sin;
    }
  }

  // copy the rest
  if (d > d2) {
#pragma unroll
    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
      int offset_head = offset_block + h_id * stride_h;
      int offset_head_dst = offset_block_dst + h_id * o_stride_h;
#pragma unroll
      for (int d_id = d2 + threadIdx.x; d_id < d; d_id += blockDim.x) {
        dst[offset_head_dst + d_id * o_stride_d] = src[offset_head + d_id * stride_d];
      }
    }
  }
}

template <typename scalar_t>
__device__ void fused_rope_block_backward(const scalar_t* src, const float* freqs, scalar_t* dst,
                                          const int offset_block, const int offset_block_dst, const int h, const int d,
                                          const int d2, const int stride_h, const int stride_d, const int o_stride_h,
                                          const int o_stride_d) {
  int s_id = blockIdx.x;
#pragma unroll
  for (int d_id = threadIdx.x; d_id < d2; d_id += blockDim.x) {
    scalar_t v_cos = cosf(freqs[s_id * d2 + d_id]);
    scalar_t v_sin =
        (d_id + d2 / 2 < d2) ? sinf(freqs[s_id * d2 + d_id + d2 / 2]) : -sinf(freqs[s_id * d2 + d_id + d2 / 2 - d2]);
#pragma unroll
    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
      int offset_src = offset_block + h_id * stride_h + d_id * stride_d;
      int offset_dst = offset_block_dst + h_id * o_stride_h + d_id * o_stride_d;
      scalar_t v_src = src[offset_src];
      scalar_t v_src_rotate =
          (d_id + d2 / 2 < d2) ? src[offset_src + (d2 / 2) * stride_d] : src[offset_src + (d2 / 2 - d2) * stride_d];
      dst[offset_dst] = v_src * v_cos + v_src_rotate * v_sin;
    }
  }

  // handle the tail
  if (d > d2) {
#pragma unroll
    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
      int offset_head = offset_block + h_id * stride_h;
      int offset_head_dst = offset_block_dst + h_id * o_stride_h;
#pragma unroll
      for (int d_id = d2 + threadIdx.x; d_id < d; d_id += blockDim.x) {
        dst[offset_head_dst + d_id * o_stride_d] = src[offset_head + d_id * stride_d];
      }
    }
  }
}

template <typename scalar_t>
__global__ void fused_rope_forward(const int h, const int d, const int d2, const int stride_s, const int stride_b,
                                   const int stride_h, const int stride_d, const int o_stride_s, const int o_stride_b,
                                   const int o_stride_h, const int o_stride_d, const scalar_t* src, const float* freqs,
                                   scalar_t* dst) {
  int s_id = blockIdx.x, b_id = blockIdx.y;
  int offset_block = s_id * stride_s + b_id * stride_b;
  int offset_block_dst = s_id * o_stride_s + b_id * o_stride_b;
  fused_rope_block_forward(src, freqs, dst, offset_block, offset_block_dst, h, d, d2, stride_h, stride_d, o_stride_h,
                           o_stride_d);
}

template <typename scalar_t>
__global__ void fused_rope_backward(const int h, const int d, const int d2, const int stride_s, const int stride_b,
                                    const int stride_h, const int stride_d, const int o_stride_s, const int o_stride_b,
                                    const int o_stride_h, const int o_stride_d, const scalar_t* src, const float* freqs,
                                    scalar_t* dst) {
  int s_id = blockIdx.x, b_id = blockIdx.y;
  int offset_block = s_id * stride_s + b_id * stride_b;
  int offset_block_dst = s_id * o_stride_s + b_id * o_stride_b;
  fused_rope_block_backward(src, freqs, dst, offset_block, offset_block_dst, h, d, d2, stride_h, stride_d, o_stride_h,
                            o_stride_d);
}

template <typename scalar_t_0, typename scalar_t_1>
__device__ void fused_rope_cached_block_forward(const scalar_t_0* src, const scalar_t_1* cos, const scalar_t_1* sin,
                                                scalar_t_0* dst, const int s_id, const int offset_block,
                                                const int offset_block_dst, const int h, const int d, const int d2,
                                                const int stride_h, const int stride_d, const int o_stride_h,
                                                const int o_stride_d) {
#pragma unroll
  for (int d_id = threadIdx.x; d_id < d2; d_id += blockDim.x) {
    scalar_t_0 v_cos = cos[s_id * d2 + d_id];
    scalar_t_0 v_sin = sin[s_id * d2 + d_id];
#pragma unroll
    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
      int offset_src = offset_block + h_id * stride_h + d_id * stride_d;
      int offset_dst = offset_block_dst + h_id * o_stride_h + d_id * o_stride_d;
      scalar_t_0 v_src = src[offset_src];
      scalar_t_0 v_src_rotate =
          (d_id + d2 / 2 < d2) ? -src[offset_src + (d2 / 2) * stride_d] : src[offset_src + (d2 / 2 - d2) * stride_d];
      dst[offset_dst] = v_src * v_cos + v_src_rotate * v_sin;
    }
  }

  // copy the rest
  if (d > d2) {
#pragma unroll
    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
      int offset_head = offset_block + h_id * stride_h;
      int offset_head_dst = offset_block_dst + h_id * o_stride_h;
#pragma unroll
      for (int d_id = d2 + threadIdx.x; d_id < d; d_id += blockDim.x) {
        dst[offset_head_dst + d_id * o_stride_d] = src[offset_head + d_id * stride_d];
      }
    }
  }
}

template <typename scalar_t_0, typename scalar_t_1>
__device__ void fused_rope_cached_block_backward(const scalar_t_0* src, const scalar_t_1* cos, const scalar_t_1* sin,
                                                 scalar_t_0* dst, const int s_id, const int offset_block,
                                                 const int offset_block_dst, const int h, const int d, const int d2,
                                                 const int stride_h, const int stride_d, const int o_stride_h,
                                                 const int o_stride_d) {
#pragma unroll
  for (int d_id = threadIdx.x; d_id < d2; d_id += blockDim.x) {
    scalar_t_0 v_cos = cos[s_id * d2 + d_id];
    scalar_t_0 v_sin = (d_id + d2 / 2 < d2) ? sin[s_id * d2 + d_id + d2 / 2] : -sin[s_id * d2 + d_id + d2 / 2 - d2];
#pragma unroll
    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
      int offset_src = offset_block + h_id * stride_h + d_id * stride_d;
      int offset_dst = offset_block_dst + h_id * o_stride_h + d_id * o_stride_d;
      scalar_t_0 v_src = src[offset_src];
      scalar_t_0 v_src_rotate =
          (d_id + d2 / 2 < d2) ? src[offset_src + (d2 / 2) * stride_d] : src[offset_src + (d2 / 2 - d2) * stride_d];
      dst[offset_dst] = v_src * v_cos + v_src_rotate * v_sin;
    }
  }

  // handle the tail
  if (d > d2) {
#pragma unroll
    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
      int offset_head = offset_block + h_id * stride_h;
      int offset_head_dst = offset_block_dst + h_id * o_stride_h;
#pragma unroll
      for (int d_id = d2 + threadIdx.x; d_id < d; d_id += blockDim.x) {
        dst[offset_head_dst + d_id * o_stride_d] = src[offset_head + d_id * stride_d];
      }
    }
  }
}

template <typename scalar_t_0, typename scalar_t_1>
__global__ void fused_rope_cached_forward(const int h, const int d, const int d2, const int stride_s,
                                          const int stride_b, const int stride_h, const int stride_d,
                                          const int o_stride_s, const int o_stride_b, const int o_stride_h,
                                          const int o_stride_d, const scalar_t_0* src, const scalar_t_1* cos,
                                          const scalar_t_1* sin, scalar_t_0* dst) {
  int s_id = blockIdx.x, b_id = blockIdx.y;
  int offset_block = s_id * stride_s + b_id * stride_b;
  int offset_block_dst = s_id * o_stride_s + b_id * o_stride_b;
  fused_rope_cached_block_forward(src, cos, sin, dst, s_id, offset_block, offset_block_dst, h, d, d2, stride_h,
                                  stride_d, o_stride_h, o_stride_d);
}

template <typename scalar_t_0, typename scalar_t_1>
__global__ void fused_rope_cached_backward(const int h, const int d, const int d2, const int stride_s,
                                           const int stride_b, const int stride_h, const int stride_d,
                                           const int o_stride_s, const int o_stride_b, const int o_stride_h,
                                           const int o_stride_d, const scalar_t_0* src, const scalar_t_1* cos,
                                           const scalar_t_1* sin, scalar_t_0* dst) {
  int s_id = blockIdx.x, b_id = blockIdx.y;
  int offset_block = s_id * stride_s + b_id * stride_b;
  int offset_block_dst = s_id * o_stride_s + b_id * o_stride_b;
  fused_rope_cached_block_backward(src, cos, sin, dst, s_id, offset_block, offset_block_dst, h, d, d2, stride_h,
                                   stride_d, o_stride_h, o_stride_d);
}

template <typename scalar_t>
__global__ void fused_rope_thd_forward(const int h, const int d, const int d2, const int stride_t, const int stride_h,
                                       const int stride_d, const int o_stride_t, const int o_stride_h,
                                       const int o_stride_d, const scalar_t* src, const int* cu_seqlens,
                                       const float* freqs, scalar_t* dst) {
  int s_id = blockIdx.x, b_id = blockIdx.y;
  int t_id = s_id + cu_seqlens[b_id];
  if (t_id >= cu_seqlens[b_id + 1]) return;
  int offset_block = t_id * stride_t;
  int offset_block_dst = t_id * o_stride_t;
  fused_rope_block_forward(src, freqs, dst, offset_block, offset_block_dst, h, d, d2, stride_h, stride_d, o_stride_h,
                           o_stride_d);
}

template <typename scalar_t>
__global__ void fused_rope_thd_backward(const int h, const int d, const int d2, const int stride_t, const int stride_h,
                                        const int stride_d, const int o_stride_t, const int o_stride_h,
                                        const int o_stride_d, const scalar_t* src, const int* cu_seqlens,
                                        const float* freqs, scalar_t* dst) {
  int s_id = blockIdx.x, b_id = blockIdx.y;
  int t_id = s_id + cu_seqlens[b_id];
  if (t_id >= cu_seqlens[b_id + 1]) return;
  int offset_block = t_id * stride_t;
  int offset_block_dst = t_id * o_stride_t;
  fused_rope_block_backward(src, freqs, dst, offset_block, offset_block_dst, h, d, d2, stride_h, stride_d, o_stride_h,
                            o_stride_d);
}

template <typename scalar_t_0, typename scalar_t_1>
__global__ void fused_rope_2d_forward(const int ih, const int iw, const int h, const int d, const int stride_b,
                                      const int stride_ih, const int stride_iw, const int stride_h, const int stride_d,
                                      const int o_stride_b, const int o_stride_s, const int o_stride_h,
                                      const int o_stride_d, const scalar_t_0* src, const scalar_t_1* cos_h,
                                      const scalar_t_1* sin_h, const scalar_t_1* cos_w, const scalar_t_1* sin_w,
                                      scalar_t_0* dst) {
  int ih_id = blockIdx.x, iw_id = blockIdx.y, b_id = blockIdx.z;
  // apply to height
  int offset_block = b_id * stride_b + ih_id * stride_ih + iw_id * stride_iw;
  int offset_block_dst = b_id * o_stride_b + (ih_id * iw + iw_id) * o_stride_s;
  int s_id = ih_id;  // for cos_h and sin_h
  fused_rope_cached_block_forward(src, cos_h, sin_h, dst, s_id, offset_block, offset_block_dst, h, d / 2, d / 2,
                                  stride_h, stride_d, o_stride_h, o_stride_d);
  // apply to width
  offset_block += d / 2 * stride_d;
  offset_block_dst += d / 2 * o_stride_d;
  s_id = iw_id;  // for cos_w and sin_w
  fused_rope_cached_block_forward(src, cos_w, sin_w, dst, s_id, offset_block, offset_block_dst, h, d / 2, d / 2,
                                  stride_h, stride_d, o_stride_h, o_stride_d);
}

template <typename scalar_t_0, typename scalar_t_1>
__global__ void fused_rope_2d_backward(const int ih, const int iw, const int h, const int d, const int stride_b,
                                       const int stride_ih, const int stride_iw, const int stride_h, const int stride_d,
                                       const int o_stride_b, const int o_stride_s, const int o_stride_h,
                                       const int o_stride_d, const scalar_t_0* src, const scalar_t_1* cos_h,
                                       const scalar_t_1* sin_h, const scalar_t_1* cos_w, const scalar_t_1* sin_w,
                                       scalar_t_0* dst) {
  int ih_id = blockIdx.x, iw_id = blockIdx.y, b_id = blockIdx.z;
  // apply to height
  int offset_block = b_id * stride_b + ih_id * stride_ih + iw_id * stride_iw;
  int offset_block_dst = b_id * o_stride_b + (ih_id * iw + iw_id) * o_stride_s;
  int s_id = ih_id;  // for cos_h and sin_h
  fused_rope_cached_block_backward(src, cos_h, sin_h, dst, s_id, offset_block, offset_block_dst, h, d / 2, d / 2,
                                   stride_h, stride_d, o_stride_h, o_stride_d);
  // apply to width
  offset_block += d / 2 * stride_d;
  offset_block_dst += d / 2 * o_stride_d;
  s_id = iw_id;  // for cos_w and sin_w
  fused_rope_cached_block_backward(src, cos_w, sin_w, dst, s_id, offset_block, offset_block_dst, h, d / 2, d / 2,
                                   stride_h, stride_d, o_stride_h, o_stride_d);
}

}  // end of anonymous namespace

template <typename scalar_t>
void dispatch_fused_rope_forward(const int s, const int b, const int h, const int d, const int d2, const int stride_s,
                                 const int stride_b, const int stride_h, const int stride_d, const int o_stride_s,
                                 const int o_stride_b, const int o_stride_h, const int o_stride_d,
                                 const scalar_t* input, const float* freqs, scalar_t* output) {
  auto stream = at::cuda::getCurrentCUDAStream();

  int warps_per_block = h < 16 ? 4 : 8;
  dim3 blocks(s, b);
  dim3 threads(C10_WARP_SIZE, warps_per_block);

  fused_rope_forward<<<blocks, threads, 0, stream>>>(h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s,
                                                     o_stride_b, o_stride_h, o_stride_d, input, freqs, output);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

template <typename scalar_t>
void dispatch_fused_rope_backward(const int s, const int b, const int h, const int d, const int d2, const int stride_s,
                                  const int stride_b, const int stride_h, const int stride_d, const int o_stride_s,
                                  const int o_stride_b, const int o_stride_h, const int o_stride_d,
                                  const scalar_t* output_grads, const float* freqs, scalar_t* input_grads) {
  auto stream = at::cuda::getCurrentCUDAStream();

  int warps_per_block = h < 16 ? 4 : 8;
  dim3 blocks(s, b);
  dim3 threads(C10_WARP_SIZE, warps_per_block);

  fused_rope_backward<<<blocks, threads, 0, stream>>>(h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s,
                                                      o_stride_b, o_stride_h, o_stride_d, output_grads, freqs,
                                                      input_grads);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

template <typename scalar_t_0, typename scalar_t_1>
void dispatch_fused_rope_cached_forward(const int s, const int b, const int h, const int d, const int d2,
                                        const int stride_s, const int stride_b, const int stride_h, const int stride_d,
                                        const int o_stride_s, const int o_stride_b, const int o_stride_h,
                                        const int o_stride_d, const scalar_t_0* input, const scalar_t_1* cos,
                                        const scalar_t_1* sin, scalar_t_0* output) {
  auto stream = at::cuda::getCurrentCUDAStream();

  int warps_per_block = h < 16 ? 4 : 8;
  dim3 blocks(s, b);
  dim3 threads(C10_WARP_SIZE, warps_per_block);

  fused_rope_cached_forward<<<blocks, threads, 0, stream>>>(h, d, d2, stride_s, stride_b, stride_h, stride_d,
                                                            o_stride_s, o_stride_b, o_stride_h, o_stride_d, input, cos,
                                                            sin, output);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

template <typename scalar_t_0, typename scalar_t_1>
void dispatch_fused_rope_cached_backward(const int s, const int b, const int h, const int d, const int d2,
                                         const int stride_s, const int stride_b, const int stride_h, const int stride_d,
                                         const int o_stride_s, const int o_stride_b, const int o_stride_h,
                                         const int o_stride_d, const scalar_t_0* output_grads, const scalar_t_1* cos,
                                         const scalar_t_1* sin, scalar_t_0* input_grads) {
  auto stream = at::cuda::getCurrentCUDAStream();

  int warps_per_block = h < 16 ? 4 : 8;
  dim3 blocks(s, b);
  dim3 threads(C10_WARP_SIZE, warps_per_block);

  fused_rope_cached_backward<<<blocks, threads, 0, stream>>>(h, d, d2, stride_s, stride_b, stride_h, stride_d,
                                                             o_stride_s, o_stride_b, o_stride_h, o_stride_d,
                                                             output_grads, cos, sin, input_grads);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

template <typename scalar_t>
void dispatch_fused_rope_thd_forward(const int max_s, const int b, const int h, const int d, const int d2,
                                     const int stride_t, const int stride_h, const int stride_d, const int o_stride_t,
                                     const int o_stride_h, const int o_stride_d, const scalar_t* input,
                                     const int* cu_seqlens, const float* freqs, scalar_t* output) {
  auto stream = at::cuda::getCurrentCUDAStream();

  int warps_per_block = h < 16 ? 4 : 8;
  dim3 blocks(max_s, b);
  dim3 threads(C10_WARP_SIZE, warps_per_block);

  fused_rope_thd_forward<<<blocks, threads, 0, stream>>>(h, d, d2, stride_t, stride_h, stride_d, o_stride_t, o_stride_h,
                                                         o_stride_d, input, cu_seqlens, freqs, output);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

template <typename scalar_t>
void dispatch_fused_rope_thd_backward(const int max_s, const int b, const int h, const int d, const int d2,
                                      const int stride_t, const int stride_h, const int stride_d, const int o_stride_t,
                                      const int o_stride_h, const int o_stride_d, const scalar_t* output_grads,
                                      const int* cu_seqlens, const float* freqs, scalar_t* input_grads) {
  auto stream = at::cuda::getCurrentCUDAStream();

  int warps_per_block = h < 16 ? 4 : 8;
  dim3 blocks(max_s, b);
  dim3 threads(C10_WARP_SIZE, warps_per_block);

  fused_rope_thd_backward<<<blocks, threads, 0, stream>>>(h, d, d2, stride_t, stride_h, stride_d, o_stride_t,
                                                          o_stride_h, o_stride_d, output_grads, cu_seqlens, freqs,
                                                          input_grads);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

template <typename scalar_t_0, typename scalar_t_1>
void dispatch_fused_rope_2d_forward(const int b, const int ih, const int iw, const int h, const int d,
                                    const int stride_b, const int stride_ih, const int stride_iw, const int stride_h,
                                    const int stride_d, const int o_stride_b, const int o_stride_s,
                                    const int o_stride_h, const int o_stride_d, const scalar_t_0* input,
                                    const scalar_t_1* cos_h, const scalar_t_1* sin_h, const scalar_t_1* cos_w,
                                    const scalar_t_1* sin_w, scalar_t_0* output) {
  auto stream = at::cuda::getCurrentCUDAStream();

  int warps_per_block = h < 16 ? 4 : 8;
  dim3 blocks(ih, iw, b);
  dim3 threads(C10_WARP_SIZE, warps_per_block);

  fused_rope_2d_forward<<<blocks, threads, 0, stream>>>(ih, iw, h, d, stride_b, stride_ih, stride_iw, stride_h,
                                                        stride_d, o_stride_b, o_stride_s, o_stride_h, o_stride_d, input,
                                                        cos_h, sin_h, cos_w, sin_w, output);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

template <typename scalar_t_0, typename scalar_t_1>
void dispatch_fused_rope_2d_backward(const int b, const int ih, const int iw, const int h, const int d,
                                     const int stride_b, const int stride_ih, const int stride_iw, const int stride_h,
                                     const int stride_d, const int o_stride_b, const int o_stride_s,
                                     const int o_stride_h, const int o_stride_d, const scalar_t_0* output_grads,
                                     const scalar_t_1* cos_h, const scalar_t_1* sin_h, const scalar_t_1* cos_w,
                                     const scalar_t_1* sin_w, scalar_t_0* input_grads) {
  auto stream = at::cuda::getCurrentCUDAStream();

  int warps_per_block = h < 16 ? 4 : 8;
  dim3 blocks(ih, iw, b);
  dim3 threads(C10_WARP_SIZE, warps_per_block);

  fused_rope_2d_backward<<<blocks, threads, 0, stream>>>(ih, iw, h, d, stride_b, stride_ih, stride_iw, stride_h,
                                                         stride_d, o_stride_b, o_stride_s, o_stride_h, o_stride_d,
                                                         output_grads, cos_h, sin_h, cos_w, sin_w, input_grads);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}


================================================
FILE: csrc/megatron/fused_rotary_positional_embedding_cuda.cu
================================================
/* coding=utf-8
 * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <ATen/ATen.h>

#include "fused_rotary_positional_embedding.h"
#include "type_shim.h"

namespace fused_rope {

torch::Tensor fwd_cuda(const torch::Tensor& input, const torch::Tensor& freqs, const bool transpose_output) {
  // input sizes: (s, b, h, d)
  // s: sequence length
  // b: batch size
  // h: head num
  // d: dim of each head
  const int s = input.size(0);
  const int b = input.size(1);
  const int h = input.size(2);
  const int d = input.size(3);
  // input strides
  const int stride_s = input.stride(0);
  const int stride_b = input.stride(1);
  const int stride_h = input.stride(2);
  const int stride_d = input.stride(3);
  // freqs' shape is always (s, 1, 1, d2), so the strides are same under
  // different memory formats
  const int d2 = freqs.size(3);

  // output
  auto act_options = input.options().requires_grad(false);
  torch::Tensor output;
  if (transpose_output) {
    output = torch::empty({b, s, h, d}, act_options).transpose(0, 1);
  } else {
    output = torch::empty({s, b, h, d}, act_options);
  }
  // output strides
  const int o_stride_s = output.stride(0);
  const int o_stride_b = output.stride(1);
  const int o_stride_h = output.stride(2);
  const int o_stride_d = output.stride(3);

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      input.scalar_type(), 0, "dispatch_fused_rope_forward",
      dispatch_fused_rope_forward(s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b,
                                  o_stride_h, o_stride_d, input.data_ptr<scalar_t_0>(), freqs.data_ptr<float>(),
                                  output.data_ptr<scalar_t_0>()););
  return output;
}

torch::Tensor bwd_cuda(const torch::Tensor& output_grads, const torch::Tensor& freqs, const bool transpose_output) {
  // output_grads sizes: (s, b, h, d)
  // s: sequence length
  // b: batch size
  // h: head num
  // d: dim of each head
  const int s = output_grads.size(0);
  const int b = output_grads.size(1);
  const int h = output_grads.size(2);
  const int d = output_grads.size(3);
  // output_grads strides
  const int stride_s = output_grads.stride(0);
  const int stride_b = output_grads.stride(1);
  const int stride_h = output_grads.stride(2);
  const int stride_d = output_grads.stride(3);
  // freqs' shape is always (s, 1, 1, d2), so the strides are same under
  // different memory formats
  const int d2 = freqs.size(3);

  auto act_options = output_grads.options().requires_grad(false);
  torch::Tensor input_grads;
  if (transpose_output) {
    input_grads = torch::empty({b, s, h, d}, act_options).transpose(0, 1);
  } else {
    input_grads = torch::empty({s, b, h, d}, act_options);
  }
  const int o_stride_s = input_grads.stride(0);
  const int o_stride_b = input_grads.stride(1);
  const int o_stride_h = input_grads.stride(2);
  const int o_stride_d = input_grads.stride(3);

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      output_grads.scalar_type(), 0, "dispatch_fused_rope_backward",
      dispatch_fused_rope_backward(s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b,
                                   o_stride_h, o_stride_d, output_grads.data_ptr<scalar_t_0>(), freqs.data_ptr<float>(),
                                   input_grads.data_ptr<scalar_t_0>()););
  return input_grads;
}

#define DISPATCH_FUSED_ROPE_TYPES(TYPE1, TYPE2, NAME, ...)                                                      \
  switch (TYPE1) {                                                                                              \
    case at::ScalarType::Float: {                                                                               \
      using scalar_t_0 = float;                                                                                 \
      switch (TYPE2) {                                                                                          \
        case at::ScalarType::Float: {                                                                           \
          using scalar_t_1 = float;                                                                             \
          __VA_ARGS__;                                                                                          \
          break;                                                                                                \
        }                                                                                                       \
        default:                                                                                                \
          TORCH_CHECK(false, #NAME, " not supported for '", toString(TYPE1), "' with '", toString(TYPE2), "'"); \
      }                                                                                                         \
      break;                                                                                                    \
    }                                                                                                           \
    case at::ScalarType::Half: {                                                                                \
      using scalar_t_0 = at::Half;                                                                              \
      switch (TYPE2) {                                                                                          \
        case at::ScalarType::Float: {                                                                           \
          using scalar_t_1 = float;                                                                             \
          __VA_ARGS__;                                                                                          \
          break;                                                                                                \
        }                                                                                                       \
        case at::ScalarType::Half: {                                                                            \
          using scalar_t_1 = at::Half;                                                                          \
          __VA_ARGS__;                                                                                          \
          break;                                                                                                \
        }                                                                                                       \
        default:                                                                                                \
          TORCH_CHECK(false, #NAME, " not supported for '", toString(TYPE1), "' with '", toString(TYPE2), "'"); \
      }                                                                                                         \
      break;                                                                                                    \
    }                                                                                                           \
    case at::ScalarType::BFloat16: {                                                                            \
      using scalar_t_0 = at::BFloat16;                                                                          \
      switch (TYPE2) {                                                                                          \
        case at::ScalarType::Float: {                                                                           \
          using scalar_t_1 = float;                                                                             \
          __VA_ARGS__;                                                                                          \
          break;                                                                                                \
        }                                                                                                       \
        case at::ScalarType::BFloat16: {                                                                        \
          using scalar_t_1 = at::BFloat16;                                                                      \
          __VA_ARGS__;                                                                                          \
          break;                                                                                                \
        }                                                                                                       \
        default:                                                                                                \
          TORCH_CHECK(false, #NAME, " not supported for '", toString(TYPE1), "' with '", toString(TYPE2), "'"); \
      }                                                                                                         \
      break;                                                                                                    \
    }                                                                                                           \
    default:                                                                                                    \
      TORCH_CHECK(false, #NAME, " not supported for '", toString(TYPE1), "' with '", toString(TYPE2), "'");     \
  }

torch::Tensor fwd_cached_cuda(const torch::Tensor& input, const torch::Tensor& cos, const torch::Tensor& sin,
                              const bool transpose_output) {
  // input sizes: (s, b, h, d)
  // s: sequence length
  // b: batch size
  // h: head num
  // d: dim of each head
  const int s = input.size(0);
  const int b = input.size(1);
  const int h = input.size(2);
  const int d = input.size(3);
  // input strides
  const int stride_s = input.stride(0);
  const int stride_b = input.stride(1);
  const int stride_h = input.stride(2);
  const int stride_d = input.stride(3);
  // cos/sin's shape is always (s, 1, 1, d2), so the strides are same under
  // different memory formats
  const int d2 = cos.size(3);

  // output
  auto act_options = input.options().requires_grad(false);
  torch::Tensor output;
  if (transpose_output) {
    output = torch::empty({b, s, h, d}, act_options).transpose(0, 1);
  } else {
    output = torch::empty({s, b, h, d}, act_options);
  }
  // output strides
  const int o_stride_s = output.stride(0);
  const int o_stride_b = output.stride(1);
  const int o_stride_h = output.stride(2);
  const int o_stride_d = output.stride(3);

  DISPATCH_FUSED_ROPE_TYPES(input.scalar_type(), cos.scalar_type(), "dispatch_fused_rope_cached_forward",
                            dispatch_fused_rope_cached_forward(
                                s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b,
                                o_stride_h, o_stride_d, input.data_ptr<scalar_t_0>(), cos.data_ptr<scalar_t_1>(),
                                sin.data_ptr<scalar_t_1>(), output.data_ptr<scalar_t_0>()););
  return output;
}

torch::Tensor bwd_cached_cuda(const torch::Tensor& output_grads, const torch::Tensor& cos, const torch::Tensor& sin,
                              const bool transpose_output) {
  // output_grads sizes: (s, b, h, d)
  // s: sequence length
  // b: batch size
  // h: head num
  // d: dim of each head
  const int s = output_grads.size(0);
  const int b = output_grads.size(1);
  const int h = output_grads.size(2);
  const int d = output_grads.size(3);
  // output_grads strides
  const int stride_s = output_grads.stride(0);
  const int stride_b = output_grads.stride(1);
  const int stride_h = output_grads.stride(2);
  const int stride_d = output_grads.stride(3);
  // cos/sin's shape is always (s, 1, 1, d2), so the strides are same under
  // different memory formats
  const int d2 = cos.size(3);

  auto act_options = output_grads.options().requires_grad(false);
  torch::Tensor input_grads;
  if (transpose_output) {
    input_grads = torch::empty({b, s, h, d}, act_options).transpose(0, 1);
  } else {
    input_grads = torch::empty({s, b, h, d}, act_options);
  }
  const int o_stride_s = input_grads.stride(0);
  const int o_stride_b = input_grads.stride(1);
  const int o_stride_h = input_grads.stride(2);
  const int o_stride_d = input_grads.stride(3);

  DISPATCH_FUSED_ROPE_TYPES(output_grads.scalar_type(), cos.scalar_type(), "dispatch_fused_rope_cached_backward",
                            dispatch_fused_rope_cached_backward(
                                s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b,
                                o_stride_h, o_stride_d, output_grads.data_ptr<scalar_t_0>(), cos.data_ptr<scalar_t_1>(),
                                sin.data_ptr<scalar_t_1>(), input_grads.data_ptr<scalar_t_0>()););
  return input_grads;
}

torch::Tensor fwd_thd_cuda(const torch::Tensor& input, const torch::Tensor& cu_seqlens, const torch::Tensor& freqs) {
  // input sizes: (t, h, d)
  // t: cumulative sum of sequence lengths
  // h: head num
  // d: dim of each head
  const int t = input.size(0);
  const int h = input.size(1);
  const int d = input.size(2);
  // input strides
  const int stride_t = input.stride(0);
  const int stride_h = input.stride(1);
  const int stride_d = input.stride(2);
  // batch size
  const int b = cu_seqlens.size(0) - 1;
  // freqs' shape is (max_s, 1, 1, d2)
  const int max_s = freqs.size(0);
  const int d2 = freqs.size(3);

  // output
  auto act_options = input.options().requires_grad(false);
  auto output = torch::empty({t, h, d}, act_options);
  // output strides
  const int o_stride_t = output.stride(0);
  const int o_stride_h = output.stride(1);
  const int o_stride_d = output.stride(2);

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      input.scalar_type(), 0, "dispatch_fused_rope_thd_forward",
      dispatch_fused_rope_thd_forward(max_s, b, h, d, d2, stride_t, stride_h, stride_d, o_stride_t, o_stride_h,
                                      o_stride_d, input.data_ptr<scalar_t_0>(), cu_seqlens.data_ptr<int>(),
                                      freqs.data_ptr<float>(), output.data_ptr<scalar_t_0>()););
  return output;
}

torch::Tensor bwd_thd_cuda(const torch::Tensor& output_grads, const torch::Tensor& cu_seqlens,
                           const torch::Tensor& freqs) {
  // output_grads sizes: (t, h, d)
  // t: cumulative sum of sequence lengths
  // h: head num
  // d: dim of each head
  const int t = output_grads.size(0);
  const int h = output_grads.size(1);
  const int d = output_grads.size(2);
  // output_grads strides
  const int stride_t = output_grads.stride(0);
  const int stride_h = output_grads.stride(1);
  const int stride_d = output_grads.stride(2);
  // batch size
  const int b = cu_seqlens.size(0) - 1;
  // freqs' shape is (max_s, 1, 1, d2)
  const int max_s = freqs.size(0);
  const int d2 = freqs.size(3);

  auto act_options = output_grads.options().requires_grad(false);
  auto input_grads = torch::empty({t, h, d}, act_options);
  const int o_stride_t = input_grads.stride(0);
  const int o_stride_h = input_grads.stride(1);
  const int o_stride_d = input_grads.stride(2);

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      output_grads.scalar_type(), 0, "dispatch_fused_rope_thd_backward",
      dispatch_fused_rope_thd_backward(max_s, b, h, d, d2, stride_t, stride_h, stride_d, o_stride_t, o_stride_h,
                                       o_stride_d, output_grads.data_ptr<scalar_t_0>(), cu_seqlens.data_ptr<int>(),
                                       freqs.data_ptr<float>(), input_grads.data_ptr<scalar_t_0>()););
  return input_grads;
}

torch::Tensor fwd_2d_cuda(const torch::Tensor& input, const torch::Tensor& cos_h, const torch::Tensor& sin_h,
                          const torch::Tensor& cos_w, const torch::Tensor& sin_w) {
  // input sizes: (b, ih, iw, h, d)
  // b: batch size
  // ih: image height
  // iw: image width
  // h: head num
  // d: dim of each head
  const int b = input.size(0);
  const int ih = input.size(1);
  const int iw = input.size(2);
  const int h = input.size(3);
  const int d = input.size(4);
  // input strides
  const int stride_b = input.stride(0);
  const int stride_ih = input.stride(1);
  const int stride_iw = input.stride(2);
  const int stride_h = input.stride(3);
  const int stride_d = input.stride(4);

  // output
  auto act_options = input.options().requires_grad(false);
  auto output = torch::empty({b, ih * iw, h, d}, act_options);
  // output strides
  const int o_stride_b = output.stride(0);
  const int o_stride_s = output.stride(1);
  const int o_stride_h = output.stride(2);
  const int o_stride_d = output.stride(3);

  DISPATCH_FUSED_ROPE_TYPES(
      input.scalar_type(), cos_h.scalar_type(), "dispatch_fused_rope_2d_forward",
      dispatch_fused_rope_2d_forward(
          b, ih, iw, h, d, stride_b, stride_ih, stride_iw, stride_h, stride_d, o_stride_b, o_stride_s, o_stride_h,
          o_stride_d, input.data_ptr<scalar_t_0>(), cos_h.data_ptr<scalar_t_1>(), sin_h.data_ptr<scalar_t_1>(),
          cos_w.data_ptr<scalar_t_1>(), sin_w.data_ptr<scalar_t_1>(), output.data_ptr<scalar_t_0>()););
  return output;
}

torch::Tensor bwd_2d_cuda(const torch::Tensor& output_grads, const torch::Tensor& cos_h, const torch::Tensor& sin_h,
                          const torch::Tensor& cos_w, const torch::Tensor& sin_w) {
  // output_grads sizes: (b, ih, iw, h, d)
  // b: batch size
  // ih: image height
  // iw: image width
  // h: head num
  // d: dim of each head
  const int b = output_grads.size(0);
  const int ih = output_grads.size(1);
  const int iw = output_grads.size(2);
  const int h = output_grads.size(3);
  const int d = output_grads.size(4);
  // output_grads strides
  const int stride_b = output_grads.stride(0);
  const int stride_ih = output_grads.stride(1);
  const int stride_iw = output_grads.stride(2);
  const int stride_h = output_grads.stride(3);
  const int stride_d = output_grads.stride(4);

  auto act_options = output_grads.options().requires_grad(false);
  auto input_grads = torch::empty({b, ih * iw, h, d}, act_options);
  const int o_stride_b = input_grads.stride(0);
  const int o_stride_s = input_grads.stride(1);
  const int o_stride_h = input_grads.stride(2);
  const int o_stride_d = input_grads.stride(3);

  DISPATCH_FUSED_ROPE_TYPES(
      output_grads.scalar_type(), cos_h.scalar_type(), "dispatch_fused_rope_2d_backward",
      dispatch_fused_rope_2d_backward(
          b, ih, iw, h, d, stride_b, stride_ih, stride_iw, stride_h, stride_d, o_stride_b, o_stride_s, o_stride_h,
          o_stride_d, output_grads.data_ptr<scalar_t_0>(), cos_h.data_ptr<scalar_t_1>(), sin_h.data_ptr<scalar_t_1>(),
          cos_w.data_ptr<scalar_t_1>(), sin_w.data_ptr<scalar_t_1>(), input_grads.data_ptr<scalar_t_0>()););
  return input_grads;
}

}  // end namespace fused_rope


================================================
FILE: csrc/megatron/fused_weight_gradient_dense.cpp
================================================
#include <torch/extension.h>

#include <cstdio>
#include <vector>

void wgrad_gemm_accum_fp32_cuda_stub(at::Tensor& input_2d, at::Tensor& d_output_2d, at::Tensor& d_weight);

void wgrad_gemm_accum_fp16_cuda_stub(at::Tensor& input_2d, at::Tensor& d_output_2d, at::Tensor& d_weight);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("wgrad_gemm_accum_fp32", &wgrad_gemm_accum_fp32_cuda_stub, "wgrad gemm accum in fp32",
        py::call_guard<py::gil_scoped_release>());
  m.def("wgrad_gemm_accum_fp16", &wgrad_gemm_accum_fp16_cuda_stub, "wgrad gemm accum in fp16",
        py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "type_shim.h"

// BF16 inputs and BF16 accumulation
void gemmex_wrapper_fp16(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, at::BFloat16* A, int lda, at::BFloat16* B, int ldb, const float* beta,
                         at::BFloat16* C, int ldc) {
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_16BF, lda, B, CUDA_R_16BF, ldb,
                                    beta, C, CUDA_R_16BF, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

// FP16 inputs and FP16 accumulation
void gemmex_wrapper_fp16(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                         const float* alpha, at::Half* A, int lda, at::Half* B, int ldb, const float* beta, at::Half* C,
                         int ldc) {
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_16F, lda, B, CUDA_R_16F, ldb,
                                    beta, C, CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

template <typename T>
void wgrad_gemm_accum_fp16_cuda(T* input, T* d_output, T* d_weight, int in_dim, int hidden_dim, int out_dim) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta = 1.0;

  gemmex_wrapper_fp16(handle, CUBLAS_OP_N, CUBLAS_OP_T, in_dim, out_dim, hidden_dim, &alpha, input, in_dim, d_output,
                      out_dim, &beta, d_weight, in_dim);
}

template void wgrad_gemm_accum_fp16_cuda<at::Half>(at::Half* input, at::Half* d_output, at::Half* d_weight, int in_dim,
                                                   int hidden_dim, int out_dim);
template void wgrad_gemm_accum_fp16_cuda<at::BFloat16>(at::BFloat16* input, at::BFloat16* d_output,
                                                       at::BFloat16* d_weight, int in_dim, int hidden_dim, int out_dim);

void wgrad_gemm_accum_fp16_cuda_stub(at::Tensor& input, at::Tensor& d_output, at::Tensor& d_weight) {
  at::Tensor input_2d, d_output_2d;
  // input tensor: collapse to the first dim
  auto in_sizes = input.sizes();
  if (input.dim() > 2) {
    input_2d = input.view({-1, in_sizes[in_sizes.size() - 1]});
  } else {
    input_2d = input;
  }
  // d_output tensor: collapse to the first dim
  auto d_out_sizes = d_output.sizes();
  if (d_output.dim() > 2) {
    d_output_2d = d_output.view({-1, d_out_sizes[d_out_sizes.size() - 1]});
  } else {
    d_output_2d = d_output;
  }

  const int hidden_dim = input_2d.size(0);
  const int in_dim = input_2d.size(1);
  const int out_dim = d_weight.size(0);

  DISPATCH_HALF_AND_BFLOAT(
      input_2d.scalar_type(), "wgrad_gemm_accum_fp16",
      wgrad_gemm_accum_fp16_cuda<scalar_t>(input_2d.data_ptr<scalar_t>(), d_output_2d.data_ptr<scalar_t>(),
                                           d_weight.data_ptr<scalar_t>(), in_dim, hidden_dim, out_dim););
}


================================================
FILE: csrc/megatron/fused_weight_gradient_dense_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "type_shim.h"

// BF16 Tensor core wrapper around cublas GEMMEx
void gemmex_wrapper(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                    const float* alpha, at::BFloat16* A, int lda, at::BFloat16* B, int ldb, const float* beta, float* C,
                    int ldc) {
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_16BF, lda, B, CUDA_R_16BF, ldb,
                                    beta, C, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

// FP16 Tensor core wrapper around cublas GEMMEx
void gemmex_wrapper(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                    const float* alpha, at::Half* A, int lda, at::Half* B, int ldb, const float* beta, float* C,
                    int ldc) {
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_16F, lda, B, CUDA_R_16F, ldb,
                                    beta, C, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

// FP32 wrapper around cublas GEMMEx
void gemmex_wrapper(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                    const float* alpha, float* A, int lda, float* B, int ldb, const float* beta, float* C, int ldc) {
  TORCH_CUDABLAS_CHECK(cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_32F, lda, B, CUDA_R_32F, ldb,
                                    beta, C, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

template <typename T>
void wgrad_gemm_accum_fp32_cuda(T* input, T* d_output, float* d_weight, int in_dim, int hidden_dim, int out_dim) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta = 1.0;

  gemmex_wrapper(handle, CUBLAS_OP_N, CUBLAS_OP_T, in_dim, out_dim, hidden_dim, &alpha, input, in_dim, d_output,
                 out_dim, &beta, d_weight, in_dim);
}

template void wgrad_gemm_accum_fp32_cuda<at::Half>(at::Half* input, at::Half* d_output, float* d_weight, int in_dim,
                                                   int hidden_dim, int out_dim);
template void wgrad_gemm_accum_fp32_cuda<at::BFloat16>(at::BFloat16* input, at::BFloat16* d_output, float* d_weight,
                                                       int in_dim, int hidden_dim, int out_dim);
template void wgrad_gemm_accum_fp32_cuda<float>(float* input, float* d_output, float* d_weight, int in_dim,
                                                int hidden_dim, int out_dim);

void wgrad_gemm_accum_fp32_cuda_stub(at::Tensor& input, at::Tensor& d_output, at::Tensor& d_weight) {
  at::Tensor input_2d, d_output_2d;
  // input tensor: collapse to the first dim
  auto in_sizes = input.sizes();
  if (input.dim() > 2) {
    input_2d = input.view({-1, in_sizes[in_sizes.size() - 1]});
  } else {
    input_2d = input;
  }
  // d_output tensor: collapse to the first dim
  auto d_out_sizes = d_output.sizes();
  if (d_output.dim() > 2) {
    d_output_2d = d_output.view({-1, d_out_sizes[d_out_sizes.size() - 1]});
  } else {
    d_output_2d = d_output;
  }

  const int hidden_dim = input_2d.size(0);
  const int in_dim = input_2d.size(1);
  const int out_dim = d_weight.size(0);

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      input_2d.scalar_type(), 0, "wgrad_gemm_accum_fp32",
      wgrad_gemm_accum_fp32_cuda<scalar_t_0>(input_2d.data_ptr<scalar_t_0>(), d_output_2d.data_ptr<scalar_t_0>(),
                                             d_weight.data_ptr<float>(), in_dim, hidden_dim, out_dim););
}


================================================
FILE: csrc/megatron/generic_scaled_masked_softmax.cpp
================================================
/* coding=utf-8
 * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cuda_fp16.h>
#include <torch/extension.h>

#include <vector>

namespace multihead_attn {
namespace fused_softmax {
namespace generic_scaled_masked_softmax {

torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor);

torch::Tensor bwd_cuda(torch::Tensor const& output_grads, torch::Tensor const& softmax_results, float scale_factor);

torch::Tensor fwd(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor) {
  TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
  TORCH_CHECK((input.scalar_type() == at::ScalarType::Half) || (input.scalar_type() == at::ScalarType::BFloat16),
              "Only fp16 and bf16 are supported");
  TORCH_CHECK(mask.dim() == 4, "expected 4D tensor");

  return fwd_cuda(input, mask, scale_factor);
}

torch::Tensor bwd(torch::Tensor const& output_grads, torch::Tensor const& softmax_results, float scale_factor) {
  TORCH_CHECK(output_grads.dim() == 4, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 4, "expected 3D tensor");

  TORCH_CHECK(
      (output_grads.scalar_type() == at::ScalarType::Half) || (output_grads.scalar_type() == at::ScalarType::BFloat16),
      "Only fp16 and bf16 are supported");
  TORCH_CHECK((softmax_results.scalar_type() == at::ScalarType::Half) ||
                  (softmax_results.scalar_type() == at::ScalarType::BFloat16),
              "Only fp16 and bf16 are supported");

  return bwd_cuda(output_grads, softmax_results, scale_factor);
}

}  // end namespace generic_scaled_masked_softmax
}  // end namespace fused_softmax
}  // end namespace multihead_attn

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &multihead_attn::fused_softmax::generic_scaled_masked_softmax::fwd,
        "Self Multihead Attention scaled, time masked softmax -- Forward.", py::call_guard<py::gil_scoped_release>());

  m.def("backward", &multihead_attn::fused_softmax::generic_scaled_masked_softmax::bwd,
        "Self Multihead Attention scaled, time masked softmax -- Backward.", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/megatron/generic_scaled_masked_softmax.h
================================================
/* coding=utf-8
 * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <c10/macros/Macros.h>
#include <cuda_fp16.h>
#include <stdint.h>

#include <cfloat>
#include <limits>

namespace {

template <typename T>
struct Add {
  __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
};

template <typename T>
struct Max {
  __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; }
};

template <typename T>
__device__ __forceinline__ T WARP_SHFL_DOWN_NATIVE(T value, int laneMask, int width = warpSize,
                                                   unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, laneMask, width);
#else
  return __shfl_down(value, laneMask, width);
#endif
}

template <typename acc_t, int WARP_SIZE, template <typename> class ReduceOp>
__device__ __forceinline__ acc_t warp_reduce_new(acc_t val) {
  ReduceOp<acc_t> r;
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    val = r(val, WARP_SHFL_DOWN_NATIVE(val, offset, WARP_SIZE));
  }
  return val;
}

template <typename input_t, typename output_t, typename acc_t, int log2_elements>
__global__ void scaled_masked_softmax_warp_backward_new(output_t* gradInput,  //[batches, attn_heads, q_len, k_len]
                                                        input_t* grad,
                                                        const input_t* output,  //[batches, attn_heads, q_len, k_len]
                                                        acc_t scale, int element_count) {
  int threads_per_block = blockDim.x;
  // the first element_count*2 elements are used for cache, the last 128 is used for reduction
  extern __shared__ acc_t shared_data[];
  input_t* local_data = (input_t*)shared_data;
  input_t* output_data = &local_data[element_count];
  // maximum shared cached 128, enough for 4096 elements reduction into 4096/32= 128 elements
  acc_t* shared = (acc_t*)(&(local_data[element_count * 2]));

  int num_reductions = (element_count - 1) / threads_per_block + 1;

  int offset = blockIdx.x * element_count;

  int local_idx = threadIdx.x;
  int lane = threadIdx.x % C10_WARP_SIZE;
  int wid = threadIdx.x / C10_WARP_SIZE;
  int warps_per_thread_block = threads_per_block / C10_WARP_SIZE;

  // load the data to local data
  acc_t val = 0.0;
  for (int i = 0; i < num_reductions; i++) {
    if (i * threads_per_block + local_idx < element_count) {
      val = output[offset + i * threads_per_block + local_idx];
      output_data[i * threads_per_block + local_idx] = val;
      local_data[i * threads_per_block + local_idx] = val * grad[offset + i * threads_per_block + local_idx];
    }
    __syncthreads();
  }

  // find the sum
  for (int i = local_idx; i < (element_count - 1) / C10_WARP_SIZE + 1; i += threads_per_block) {
    shared[i] = 0.0;
  }
  __syncthreads();

#pragma unroll
  for (int i = 0; i < num_reductions; i++) {
    if (i * threads_per_block + local_idx < element_count) {
      val = local_data[i * threads_per_block + local_idx];
    } else {
      val = 0.0;
    }
    __syncthreads();
    val = warp_reduce_new<acc_t, C10_WARP_SIZE, Add>(val);
    if (lane == 0 && wid + warps_per_thread_block * i < (element_count - 1) / C10_WARP_SIZE + 1) {
      shared[wid + warps_per_thread_block * i] = val;
    }
    __syncthreads();
  }

  // final shared reduction

  int shared_mem_len = (element_count - 1) / C10_WARP_SIZE + 1;
  int num_warps = (shared_mem_len - 1) / C10_WARP_SIZE + 1;
  while (shared_mem_len > 1) {
#pragma unroll
    for (int i = 0; i < num_reductions; i++) {
      if (i * threads_per_block + local_idx < shared_mem_len) {
        val = shared[i * threads_per_block + local_idx];
      } else {
        val = 0.0;
      }
      __syncthreads();
      val = warp_reduce_new<acc_t, C10_WARP_SIZE, Add>(val);
      if (lane == 0) {
        shared[wid + warps_per_thread_block * i] = val;
      }
      __syncthreads();
    }
    shared_mem_len = num_warps;
    num_warps = (shared_mem_len - 1) / C10_WARP_SIZE + 1;
  }
  val = shared[0];
#pragma unroll
  for (int i = local_idx; i < element_count; i += threads_per_block) {
    gradInput[offset + i] = (output_t)(scale * (local_data[i] - output_data[i] * val));
  }
}

}  // end of anonymous namespace

template <typename input_t, typename output_t, typename acc_t>
void dispatch_scaled_masked_softmax_backward_new(output_t* grad_input, input_t* grad, const input_t* output,
                                                 const acc_t scale, int query_seq_len, int key_seq_len, int batches,
                                                 int attn_heads) {
  if (key_seq_len == 0) {
    return;
  } else {
    int batch_count = batches * attn_heads * query_seq_len;
    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    int num_warps = (key_seq_len - 1) / C10_WARP_SIZE + 1;
    dim3 blocks(batch_count, 1, 1);
    dim3 threads(threads_per_block, 1, 1);

    scaled_masked_softmax_warp_backward_new<input_t, output_t, acc_t, 12>
        <<<blocks, threads, sizeof(input_t) * key_seq_len * 2 + sizeof(acc_t) * num_warps,
           at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, key_seq_len);
  }
}

/*
 * Extended softmax (from native aten pytorch) with following additional features
 * 1) input scaling
 * 2) Explicit masking
 */
template <typename input_t, typename output_t, typename acc_t>
__global__ void scaled_masked_softmax_warp_forward_new(output_t* dst, const input_t* src, const uint8_t* mask,
                                                       const acc_t scale,
                                                       int query_len,  // query_len
                                                       int attn_heads,
                                                       int element_count,  // key_len
                                                       int pad_batches)    // mask batch size
{
  // min threawds_per_block has to be bigger than 128
  int threads_per_block = blockDim.x;
  //  the first element_count is used for cache, the last 128 is used for reduction
  extern __shared__ acc_t local_data[];
  // maximum shared cached 128, enough for 4096 elements reduction into 4096/32= 128 elements
  acc_t* shared = &(local_data[element_count]);
  // number of 1024 threads reductions
  int num_reductions = (element_count - 1) / threads_per_block + 1;

  int offset = blockIdx.x * element_count;
  int mask_offset;
  int query_id = blockIdx.x % query_len;
  if (pad_batches == 1) {
    // broadcaste the mask tensor
    mask_offset = query_id * element_count;
  } else {
    int mask_batch_id = blockIdx.x / attn_heads / query_len;
    mask_offset = (mask_batch_id * query_len + query_id) * element_count;
  }

  int local_idx = threadIdx.x;
  int lane = threadIdx.x % C10_WARP_SIZE;
  int wid = threadIdx.x / C10_WARP_SIZE;
  int warps_per_thread_block = threads_per_block / C10_WARP_SIZE;

  // load the data to local data
  for (int i = local_idx; i < element_count; i += threads_per_block) {
    // TODO, use the copy vector method
    if (mask[mask_offset + i] == 1) {
      local_data[i] = -10000.0;
    } else {
      local_data[i] = src[offset + i] * scale;
    }
  }

  // first find the max value
  for (int i = local_idx; i < (element_count - 1) / C10_WARP_SIZE + 1; i += threads_per_block) {
    shared[i] = -10000.0;
  }
  __syncthreads();
  acc_t val = -10000.0;
#pragma unroll
  for (int i = 0; i < num_reductions; i++) {
    if (i * threads_per_block + local_idx < element_count) {
      val = local_data[i * threads_per_block + local_idx];
    } else {
      val = -10000.0;
    }
    __syncthreads();
    val = warp_reduce_new<acc_t, C10_WARP_SIZE, Max>(val);

    if (lane == 0 && wid + warps_per_thread_block * i < (element_count - 1) / C10_WARP_SIZE + 1) {
      shared[wid + warps_per_thread_block * i] = val;
    }
    __syncthreads();
  }

  // final shared reduction
  int shared_mem_len = (element_count - 1) / C10_WARP_SIZE + 1;
  int num_warps = (shared_mem_len - 1) / C10_WARP_SIZE + 1;
  while (shared_mem_len > 1) {
#pragma unroll
    for (int i = 0; i < num_reductions; i++) {
      if (i * threads_per_block + local_idx < shared_mem_len) {
        val = shared[i * threads_per_block + local_idx];
      } else {
        val = -10000.0;
      }
      __syncthreads();
      val = warp_reduce_new<acc_t, C10_WARP_SIZE, Max>(val);
      if (lane == 0) {
        shared[wid + warps_per_thread_block * i] = val;
      }
      __syncthreads();
    }
    shared_mem_len = num_warps;
    num_warps = (shared_mem_len - 1) / C10_WARP_SIZE + 1;
  }

  acc_t reduced_val = shared[0];
  if (reduced_val < -10000.0 + 0.1) {
// if everything is masked, pay attention to nothing
#pragma unroll
    for (int i = local_idx; i < element_count; i += threads_per_block) {
      dst[offset + i] = 0.0;
    }
    return;
  }

// update the values
#pragma unroll
  for (int i = local_idx; i < element_count; i += threads_per_block) {
    local_data[i] = std::exp(local_data[i] - reduced_val);
  }

  // find the sum
  for (int i = local_idx; i < (element_count - 1) / C10_WARP_SIZE + 1; i += threads_per_block) {
    shared[i] = 0.0;
  }
  __syncthreads();

#pragma unroll
  for (int i = 0; i < num_reductions; i++) {
    if (i * threads_per_block + local_idx < element_count) {
      val = local_data[i * threads_per_block + local_idx];
    } else {
      val = 0.0;
    }
    __syncthreads();

    val = warp_reduce_new<acc_t, C10_WARP_SIZE, Add>(val);
    if (lane == 0 && wid + warps_per_thread_block * i < (element_count - 1) / C10_WARP_SIZE + 1) {
      shared[wid + warps_per_thread_block * i] = val;
    }
    __syncthreads();
  }

  shared_mem_len = (element_count - 1) / C10_WARP_SIZE + 1;
  num_warps = (shared_mem_len - 1) / C10_WARP_SIZE + 1;
  while (shared_mem_len > 1) {
#pragma unroll
    for (int i = 0; i < num_reductions; i++) {
      if (i * threads_per_block + local_idx < shared_mem_len) {
        val = shared[i * threads_per_block + local_idx];
      } else {
        val = 0.0;
      }
      __syncthreads();
      val = warp_reduce_new<acc_t, C10_WARP_SIZE, Add>(val);
      if (lane == 0) {
        shared[wid + warps_per_thread_block * i] = val;
      }
      __syncthreads();
    }
    shared_mem_len = num_warps;
    num_warps = (shared_mem_len - 1) / C10_WARP_SIZE + 1;
  }

  reduced_val = shared[0];

#pragma unroll
  for (int i = local_idx; i < element_count; i += threads_per_block) {
    dst[offset + i] = local_data[i] / reduced_val;
  }
}

template <typename input_t, typename output_t, typename acc_t>
void dispatch_scaled_masked_softmax_forward_new(output_t* dst, const input_t* src, const uint8_t* mask,
                                                const input_t scale, int query_seq_len, int key_seq_len, int batches,
                                                int attn_heads, int pad_batches) {
  if (key_seq_len == 0) {
    return;
  } else {
    int batch_count = batches * attn_heads * query_seq_len;

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    // calculate the needed shared memory
    int num_warps = (key_seq_len - 1) / C10_WARP_SIZE + 1;

    dim3 blocks(batch_count, 1, 1);
    dim3 threads(threads_per_block, 1, 1);
    scaled_masked_softmax_warp_forward_new<input_t, output_t, acc_t>
        <<<blocks, threads, sizeof(acc_t) * (key_seq_len + num_warps), at::cuda::getCurrentCUDAStream()>>>(
            dst, src, mask, scale, query_seq_len, attn_heads, key_seq_len, pad_batches);
  }
}


================================================
FILE: csrc/megatron/generic_scaled_masked_softmax_cuda.cu
================================================
/* coding=utf-8
 * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <torch/extension.h>

#include "generic_scaled_masked_softmax.h"
#include "type_shim.h"

namespace multihead_attn {
namespace fused_softmax {
namespace generic_scaled_masked_softmax {

torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor) {
  // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
  const int batches = input.size(0);
  const int pad_batches = mask.size(0);
  const int attn_heads = input.size(1);
  const int query_seq_len = input.size(2);
  const int key_seq_len = input.size(3);
  TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches);
  TORCH_INTERNAL_ASSERT(mask.size(1) == 1);
  TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len);
  TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len);

  // Output
  auto act_options = input.options().requires_grad(false);
  torch::Tensor softmax_results = torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);

  // Softmax Intermediate Result Ptr
  void* input_ptr = static_cast<void*>(input.data_ptr());
  void* mask_ptr = static_cast<void*>(mask.data_ptr());
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  DISPATCH_HALF_AND_BFLOAT(input.scalar_type(), "dispatch_scaled_masked_softmax_forward",
                           dispatch_scaled_masked_softmax_forward_new<scalar_t, scalar_t, float>(
                               reinterpret_cast<scalar_t*>(softmax_results_ptr),
                               reinterpret_cast<const scalar_t*>(input_ptr), reinterpret_cast<const uint8_t*>(mask_ptr),
                               scale_factor, query_seq_len, key_seq_len, batches, attn_heads, pad_batches););
  return softmax_results;
}

torch::Tensor bwd_cuda(torch::Tensor const& output_grads_, torch::Tensor const& softmax_results_, float scale_factor) {
  auto output_grads = output_grads_.contiguous();
  auto softmax_results = softmax_results_.contiguous();

  // output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
  const int batches = output_grads.size(0);
  const int attn_heads = output_grads.size(1);
  const int query_seq_len = output_grads.size(2);
  const int key_seq_len = output_grads.size(3);

  auto act_options = output_grads.options();
  torch::Tensor input_grad = torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);

  void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());

  // Softmax Grad
  DISPATCH_HALF_AND_BFLOAT(
      output_grads_.scalar_type(), "dispatch_scaled_masked_softmax_backward",
      dispatch_scaled_masked_softmax_backward_new<scalar_t, scalar_t, float>(
          reinterpret_cast<scalar_t*>(static_cast<void*>(input_grad.data_ptr())),
          reinterpret_cast<scalar_t*>(output_grads_ptr), reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
          scale_factor, query_seq_len, key_seq_len, batches, attn_heads););

  // backward pass is completely in-place
  return input_grad;
}
}  // namespace generic_scaled_masked_softmax
}  // namespace fused_softmax
}  // namespace multihead_attn


================================================
FILE: csrc/megatron/scaled_masked_softmax.cpp
================================================
/* coding=utf-8
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cuda_fp16.h>
#include <torch/extension.h>

#include <vector>

namespace multihead_attn {
namespace fused_softmax {
namespace scaled_masked_softmax {

torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor);

torch::Tensor bwd_cuda(torch::Tensor const& output_grads, torch::Tensor const& softmax_results, float scale_factor);

int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads);

torch::Tensor fwd(torch::Tensor& input, torch::Tensor& mask, float scale_factor) {
  TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
  TORCH_CHECK((input.scalar_type() == at::ScalarType::Half) || (input.scalar_type() == at::ScalarType::BFloat16),
              "Only fp16 and bf16 are supported");
  TORCH_CHECK(mask.dim() == 4, "expected 4D tensor");
  if (!input.is_contiguous()) input = input.contiguous();
  if (!mask.is_contiguous()) mask = mask.contiguous();

  return fwd_cuda(input, mask, scale_factor);
}

torch::Tensor bwd(torch::Tensor& output_grads, torch::Tensor& softmax_results, float scale_factor) {
  TORCH_CHECK(output_grads.dim() == 4, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 4, "expected 3D tensor");

  TORCH_CHECK(
      (output_grads.scalar_type() == at::ScalarType::Half) || (output_grads.scalar_type() == at::ScalarType::BFloat16),
      "Only fp16 and bf16 are supported");
  TORCH_CHECK((softmax_results.scalar_type() == at::ScalarType::Half) ||
                  (softmax_results.scalar_type() == at::ScalarType::BFloat16),
              "Only fp16 and bf16 are supported");
  if (!output_grads.is_contiguous()) output_grads = output_grads.contiguous();
  if (!softmax_results.is_contiguous()) softmax_results = softmax_results.contiguous();

  return bwd_cuda(output_grads, softmax_results, scale_factor);
}

int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads) {
  return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
}

}  // end namespace scaled_masked_softmax
}  // end namespace fused_softmax
}  // end namespace multihead_attn

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &multihead_attn::fused_softmax::scaled_masked_softmax::fwd,
        "Self Multihead Attention scaled, time masked softmax -- Forward.", py::call_guard<py::gil_scoped_release>());

  m.def("backward", &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
        "Self Multihead Attention scaled, time masked softmax -- Backward.", py::call_guard<py::gil_scoped_release>());

  m.def("get_batch_per_block", &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
        "Return Batch per block size.", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/megatron/scaled_masked_softmax.h
================================================
/* coding=utf-8
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <c10/macros/Macros.h>
#include <cuda_fp16.h>
#include <stdint.h>

#include <cfloat>
#include <limits>

namespace {

template <typename Datatype, int ELEMENTS_PER_LDG>
__device__ __inline__ void copy_vector(Datatype* dst, const Datatype* src);

template <>
__device__ __inline__ void copy_vector<c10::BFloat16, 1>(c10::BFloat16* dst, const c10::BFloat16* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<c10::BFloat16, 4>(c10::BFloat16* dst, const c10::BFloat16* src) {
  *((float2*)dst) = *((float2*)src);
}

template <>
__device__ __inline__ void copy_vector<c10::Half, 1>(c10::Half* dst, const c10::Half* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<c10::Half, 4>(c10::Half* dst, const c10::Half* src) {
  *((float2*)dst) = *((float2*)src);
}

template <>
__device__ __inline__ void copy_vector<uint8_t, 1>(uint8_t* dst, const uint8_t* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<uint8_t, 4>(uint8_t* dst, const uint8_t* src) {
  *((half2*)dst) = *((half2*)src);
}

int log2_ceil(int value) {
  int log2_value = 0;
  while ((1 << log2_value) < value) ++log2_value;
  return log2_value;
}

template <typename T>
struct Add {
  __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
};

template <typename T>
struct Max {
  __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; }
};

template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize,
                                                  unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  return __shfl_xor(value, laneMask, width);
#endif
}

template <typename acc_t, int WARP_BATCH, int WARP_SIZE, template <typename> class ReduceOp>
__device__ __forceinline__ void warp_reduce(acc_t* sum) {
  ReduceOp<acc_t> r;
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE);
      sum[i] = r(sum[i], b);
    }
  }
}

/*
 * Extended softmax (from native aten pytorch) with following additional features
 * 1) input scaling
 */
template <typename input_t, typename output_t, typename acc_t, int log2_elements>
__global__ void scaled_softmax_warp_forward(output_t* dst, const input_t* src, const acc_t scale, int micro_batch_size,
                                            int element_count) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
  // warp_size of method warp_softmax_forward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
  constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
  constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;

  // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, )
  // gridDim/blockIdx = (seq_len, attn_heads, batches)
  long int first_batch =
      (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + threadIdx.y) * WARP_BATCH;

  // micro_batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = micro_batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the batch
  int local_idx = threadIdx.x;

  long int thread_offset = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
  src += thread_offset;
  dst += thread_offset;

  // load data from global memory
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
  input_t temp_data[ELEMENTS_PER_LDG_STG];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;

#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;

      if (element_index < batch_element_count) {
        int itr_idx = i * element_count + it * WARP_SIZE;
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_data, src + itr_idx);

#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          elements[i][it + element] = (acc_t)temp_data[element] * scale;
        }
      } else {
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
        }
      }
    }
  }

  // compute max_value
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);

  acc_t sum[WARP_BATCH]{0.0f};
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = std::exp((elements[i][it] - max_value[i]));
      sum[i] += elements[i][it];
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);

  // store result
  output_t out[ELEMENTS_PER_LDG_STG];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = elements[i][it + element] / sum[i];
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);
      } else {
        break;
      }
    }
  }
}

/*
 * Extended softmax (from native aten pytorch) with following additional features
 * 1) input scaling
 * 2) Explicit masking
 */
template <typename input_t, typename output_t, typename acc_t, int log2_elements>
__global__ void scaled_masked_softmax_warp_forward(output_t* dst, const input_t* src, const uint8_t* mask,
                                                   const acc_t scale, int micro_batch_size, int element_count,
                                                   int pad_batches) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
  // warp_size of method warp_softmax_forward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
  constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
  constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;

  // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, )
  // gridDim/blockIdx = (seq_len, attn_heads, batches)
  long int first_batch =
      (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + threadIdx.y) * WARP_BATCH;
  long int pad_first_batch = 0;
  if (pad_batches != 1) {  // bert style
    pad_first_batch = (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH;
  } else {  // gpt2 style
    pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
  }

  // micro_batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = micro_batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the batch
  int local_idx = threadIdx.x;

  long int thread_offset_src_dst = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
  long int thread_offset_mask = pad_first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
  src += thread_offset_src_dst;
  dst += thread_offset_src_dst;
  mask += thread_offset_mask;

  // load data from global memory
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
  input_t temp_data[ELEMENTS_PER_LDG_STG];
  uint8_t temp_mask[ELEMENTS_PER_LDG_STG];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;

#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;

      if (element_index < batch_element_count) {
        int itr_idx = i * element_count + it * WARP_SIZE;
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_data, src + itr_idx);
        copy_vector<uint8_t, ELEMENTS_PER_LDG_STG>(temp_mask, mask + itr_idx);

#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          if (temp_mask[element] != 1) {
            elements[i][it + element] = (acc_t)temp_data[element] * scale;
          } else {
            elements[i][it + element] = -10000.0;
          }
        }
      } else {
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
        }
      }
    }
  }

  // compute max_value
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);

  // compute scale value to account for full mask
  acc_t scale_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    scale_value[i] = (max_value[i] == -10000.0) ? 0.0 : 1.0;
  }

  acc_t sum[WARP_BATCH]{0.0f};
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      elements[i][it] = std::exp((elements[i][it] - max_value[i]));
      sum[i] += elements[i][it];
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);

  // store result
  output_t out[ELEMENTS_PER_LDG_STG];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = elements[i][it + element] * scale_value[i] / sum[i];
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);
      } else {
        break;
      }
    }
  }
}

template <typename input_t, typename output_t, typename acc_t, int log2_elements>
__global__ void scaled_masked_softmax_warp_backward(output_t* gradInput, input_t* grad, const input_t* output,
                                                    acc_t scale, int micro_batch_size, int element_count) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
  // warp_size of method warp_softmax_backward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
  constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
  constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;

  // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, )
  // gridDim/blockIdx = (seq_len, attn_heads, batches)
  long int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;

  // micro_batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = micro_batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the batch
  int local_idx = threadIdx.x;

  // the first element to process by the current thread
  long int thread_offset = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
  grad += thread_offset;
  output += thread_offset;
  gradInput += thread_offset;

  // load data from global memory
  acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]{0.0f};
  acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]{0.0f};
  input_t temp_grad[ELEMENTS_PER_LDG_STG];
  input_t temp_output[ELEMENTS_PER_LDG_STG];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : element_count;

#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < batch_element_count) {
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_grad, grad + i * element_count + it * WARP_SIZE);
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_output, output + i * element_count + it * WARP_SIZE);

#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          output_reg[i][it + element] = (acc_t)temp_output[element];
        }
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element];
        }
      }
    }
  }

  acc_t sum[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    sum[i] = grad_reg[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      sum[i] += grad_reg[i][it];
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // compute gradients
        output_t out[ELEMENTS_PER_LDG_STG];
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i]));
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(gradInput + i * element_count + it * WARP_SIZE, out);
      }
    }
  }
}
}  // end of anonymous namespace

int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads) {
  int log2_elements = log2_ceil(key_seq_len);
  const int next_power_of_two = 1 << log2_elements;

  int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

  constexpr int threads_per_block = 128;
  int warps_per_block = (threads_per_block / warp_size);
  int batches_per_block = warps_per_block * batches_per_warp;

  return batches_per_block;
}

template <typename input_t, typename output_t, typename acc_t>
void dispatch_scaled_softmax_forward(output_t* dst, const input_t* src, const input_t scale, int query_seq_len,
                                     int key_seq_len, int batches, int attn_heads) {
  TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 16384);
  if (key_seq_len == 0) {
    return;
  } else {
    int log2_elements = log2_ceil(key_seq_len);
    const int next_power_of_two = 1 << log2_elements;
    int batch_count = batches * attn_heads * query_seq_len;

    // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    TORCH_INTERNAL_ASSERT(query_seq_len % batches_per_block == 0);
    dim3 blocks(query_seq_len / batches_per_block, attn_heads, batches);
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 0>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 1:  // 2
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 1>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 2:  // 4
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 2>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 3:  // 8
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 3>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 4:  // 16
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 4>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 5:  // 32
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 5>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 6:  // 64
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 6>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 7:  // 128
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 7>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 8:  // 256
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 8>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 9:  // 512
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 9>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 10:  // 1024
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 10>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 11:  // 2048
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 11>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 12:  // 4096
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 12>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 13:  // 8192
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 13>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      case 14:  // 16384
        scaled_softmax_warp_forward<input_t, output_t, acc_t, 14>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
        break;
      default:
        break;
    }
  }
}

template <typename input_t, typename output_t, typename acc_t>
void dispatch_scaled_masked_softmax_forward(output_t* dst, const input_t* src, const uint8_t* mask, const input_t scale,
                                            int query_seq_len, int key_seq_len, int batches, int attn_heads,
                                            int pad_batches) {
  TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096);
  if (key_seq_len == 0) {
    return;
  } else {
    int log2_elements = log2_ceil(key_seq_len);
    const int next_power_of_two = 1 << log2_elements;
    int batch_count = batches * attn_heads * query_seq_len;

    // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    TORCH_INTERNAL_ASSERT(query_seq_len % batches_per_block == 0);
    dim3 blocks(query_seq_len / batches_per_block, attn_heads, batches);
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 0>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 1:  // 2
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 1>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 2:  // 4
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 2>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 3:  // 8
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 3>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 4:  // 16
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 4>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 5:  // 32
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 5>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 6:  // 64
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 6>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 7:  // 128
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 7>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 8:  // 256
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 8>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 9:  // 512
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 9>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 10:  // 1024
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 10>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 11:  // 2048
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 11>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      case 12:  // 4096
        scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 12>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len,
                                                                       pad_batches);
        break;
      default:
        break;
    }
  }
}

template <typename input_t, typename output_t, typename acc_t>
void dispatch_scaled_masked_softmax_backward(output_t* grad_input, input_t* grad, const input_t* output,
                                             const acc_t scale, int query_seq_len, int key_seq_len, int batches,
                                             int attn_heads) {
  TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096);
  if (key_seq_len == 0) {
    return;
  } else {
    int log2_elements = log2_ceil(key_seq_len);
    const int next_power_of_two = 1 << log2_elements;
    int batch_count = batches * attn_heads * query_seq_len;

    // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    int blocks = batch_count / batches_per_block;
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 0>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 1:  // 2
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 1>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 2:  // 4
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 2>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 3:  // 8
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 3>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 4:  // 16
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 4>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 5:  // 32
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 5>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 6:  // 64
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 6>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 7:  // 128
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 7>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 8:  // 256
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 8>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 9:  // 512
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 9>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 10:  // 1024
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 10>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 11:  // 2048
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 11>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;
      case 12:  // 4096
        scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 12>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       key_seq_len);
        break;

      default:
        break;
    }
  }
}


================================================
FILE: csrc/megatron/scaled_masked_softmax_cuda.cu
================================================
/* coding=utf-8
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <torch/extension.h>

#include "scaled_masked_softmax.h"
#include "type_shim.h"

namespace multihead_attn {
namespace fused_softmax {
namespace scaled_masked_softmax {

int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads) {
  return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads);
}

torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask, float scale_factor) {
  // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
  const int batches = input.size(0);
  const int pad_batches = mask.size(0);
  const int attn_heads = input.size(1);
  const int query_seq_len = input.size(2);
  const int key_seq_len = input.size(3);
  TORCH_INTERNAL_ASSERT(key_seq_len <= 16384);
  TORCH_INTERNAL_ASSERT(query_seq_len > 1);
  TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches);
  TORCH_INTERNAL_ASSERT(mask.size(1) == 1);
  TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len);
  TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len);

  // Output
  auto act_options = input.options().requires_grad(false);
  torch::Tensor softmax_results = torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);

  // Softmax Intermediate Result Ptr
  void* input_ptr = static_cast<void*>(input.data_ptr());
  void* mask_ptr = static_cast<void*>(mask.data_ptr());
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  DISPATCH_HALF_AND_BFLOAT(input.scalar_type(), "dispatch_scaled_masked_softmax_forward",
                           dispatch_scaled_masked_softmax_forward<scalar_t, scalar_t, float>(
                               reinterpret_cast<scalar_t*>(softmax_results_ptr),
                               reinterpret_cast<const scalar_t*>(input_ptr), reinterpret_cast<const uint8_t*>(mask_ptr),
                               scale_factor, query_seq_len, key_seq_len, batches, attn_heads, pad_batches););
  return softmax_results;
}

torch::Tensor bwd_cuda(torch::Tensor const& output_grads_, torch::Tensor const& softmax_results_, float scale_factor) {
  auto output_grads = output_grads_.contiguous();
  auto softmax_results = softmax_results_.contiguous();

  // output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
  const int batches = output_grads.size(0);
  const int attn_heads = output_grads.size(1);
  const int query_seq_len = output_grads.size(2);
  const int key_seq_len = output_grads.size(3);

  auto act_options = output_grads.options().requires_grad(false);
  torch::Tensor input_grads = torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);
  void* input_grads_ptr = static_cast<void*>(input_grads.data_ptr());
  void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());

  // Softmax Grad
  DISPATCH_HALF_AND_BFLOAT(
      output_grads_.scalar_type(), "dispatch_scaled_masked_softmax_backward",
      dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>(
          reinterpret_cast<scalar_t*>(input_grads_ptr), reinterpret_cast<scalar_t*>(output_grads_ptr),
          reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()), scale_factor, query_seq_len, key_seq_len,
          batches, attn_heads););
  return input_grads;
}
}  // namespace scaled_masked_softmax
}  // namespace fused_softmax
}  // namespace multihead_attn


================================================
FILE: csrc/megatron/scaled_softmax.cpp
================================================
/* coding=utf-8
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cuda_fp16.h>
#include <torch/extension.h>

#include <vector>

namespace multihead_attn {
namespace fused_softmax {
namespace scaled_softmax {

torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor);

torch::Tensor bwd_cuda(torch::Tensor const& output_grads, torch::Tensor const& softmax_results, float scale_factor);

torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
  TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
  TORCH_CHECK((input.scalar_type() == at::ScalarType::Half) || (input.scalar_type() == at::ScalarType::BFloat16),
              "Only fp16 and bf16 are supported");

  return fwd_cuda(input, scale_factor);
}

torch::Tensor bwd(torch::Tensor const& output_grads, torch::Tensor const& softmax_results, float scale_factor) {
  TORCH_CHECK(output_grads.dim() == 4, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 4, "expected 3D tensor");

  TORCH_CHECK(
      (output_grads.scalar_type() == at::ScalarType::Half) || (output_grads.scalar_type() == at::ScalarType::BFloat16),
      "Only fp16 and bf16 are supported");
  TORCH_CHECK((softmax_results.scalar_type() == at::ScalarType::Half) ||
                  (softmax_results.scalar_type() == at::ScalarType::BFloat16),
              "Only fp16 and bf16 are supported");

  return bwd_cuda(output_grads, softmax_results, scale_factor);
}

}  // end namespace scaled_softmax
}  // end namespace fused_softmax
}  // end namespace multihead_attn

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &multihead_attn::fused_softmax::scaled_softmax::fwd,
        "Self Multihead Attention scaled, softmax -- Forward.", py::call_guard<py::gil_scoped_release>());
  m.def("backward", &multihead_attn::fused_softmax::scaled_softmax::bwd,
        "Self Multihead Attention scaled, softmax -- Backward.", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/megatron/scaled_softmax_cuda.cu
================================================
/* coding=utf-8
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <torch/extension.h>

#include "scaled_masked_softmax.h"
#include "type_shim.h"

namespace multihead_attn {
namespace fused_softmax {
namespace scaled_softmax {

torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor) {
  // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
  const int batches = input.size(0);
  const int attn_heads = input.size(1);
  const int query_seq_len = input.size(2);
  const int key_seq_len = input.size(3);
  TORCH_INTERNAL_ASSERT(key_seq_len <= 16384);
  TORCH_INTERNAL_ASSERT(query_seq_len > 1);

  // Output
  auto act_options = input.options().requires_grad(false);
  torch::Tensor softmax_results = torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);

  // Softmax Intermediate Result Ptr
  void* input_ptr = static_cast<void*>(input.data_ptr());
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  DISPATCH_HALF_AND_BFLOAT(
      input.scalar_type(), "dispatch_scaled_softmax_forward",
      dispatch_scaled_softmax_forward<scalar_t, scalar_t, float>(
          reinterpret_cast<scalar_t*>(softmax_results_ptr), reinterpret_cast<const scalar_t*>(input_ptr), scale_factor,
          query_seq_len, key_seq_len, batches, attn_heads););
  return softmax_results;
}

torch::Tensor bwd_cuda(torch::Tensor const& output_grads_, torch::Tensor const& softmax_results_, float scale_factor) {
  auto output_grads = output_grads_.contiguous();
  auto softmax_results = softmax_results_.contiguous();

  // output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
  const int batches = output_grads.size(0);
  const int attn_heads = output_grads.size(1);
  const int query_seq_len = output_grads.size(2);
  const int key_seq_len = output_grads.size(3);

  void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());

  // Softmax Grad
  DISPATCH_HALF_AND_BFLOAT(
      output_grads_.scalar_type(), "dispatch_scaled_masked_softmax_backward",
      dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>(
          reinterpret_cast<scalar_t*>(output_grads_ptr), reinterpret_cast<scalar_t*>(output_grads_ptr),
          reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()), scale_factor, query_seq_len, key_seq_len,
          batches, attn_heads););

  // backward pass is completely in-place
  return output_grads;
}
}  // namespace scaled_softmax
}  // namespace fused_softmax
}  // namespace multihead_attn


================================================
FILE: csrc/megatron/scaled_upper_triang_masked_softmax.cpp
================================================
/* coding=utf-8
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cuda_fp16.h>
#include <torch/extension.h>

#include <vector>

namespace multihead_attn {
namespace fused_softmax {
namespace scaled_upper_triang_masked_softmax {

torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor);

torch::Tensor bwd_cuda(torch::Tensor const& output_grads, torch::Tensor const& softmax_results, float scale_factor);

torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
  TORCH_CHECK(input.dim() == 3, "expected 3D tensor");
  TORCH_CHECK((input.scalar_type() == at::ScalarType::Half) || (input.scalar_type() == at::ScalarType::BFloat16),
              "Only fp16 and bf16 are supported");

  return fwd_cuda(input, scale_factor);
}

torch::Tensor bwd(torch::Tensor const& output_grads, torch::Tensor const& softmax_results, float scale_factor) {
  TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor");
  TORCH_CHECK(softmax_results.dim() == 3, "expected 3D tensor");

  TORCH_CHECK(
      (output_grads.scalar_type() == at::ScalarType::Half) || (output_grads.scalar_type() == at::ScalarType::BFloat16),
      "Only fp16 and bf16 are supported");
  TORCH_CHECK((softmax_results.scalar_type() == at::ScalarType::Half) ||
                  (softmax_results.scalar_type() == at::ScalarType::BFloat16),
              "Only fp16 and bf16 are supported");

  return bwd_cuda(output_grads, softmax_results, scale_factor);
}

}  // end namespace scaled_upper_triang_masked_softmax
}  // end namespace fused_softmax
}  // end namespace multihead_attn

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
        "Self Multihead Attention scaled, time masked softmax -- Forward.", py::call_guard<py::gil_scoped_release>());
  m.def("backward", &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
        "Self Multihead Attention scaled, time masked softmax -- Backward.", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/megatron/scaled_upper_triang_masked_softmax.h
================================================
/* coding=utf-8
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <c10/macros/Macros.h>
#include <cuda_fp16.h>
#include <stdint.h>

#include <cfloat>
#include <limits>

namespace {

template <typename Datatype, int ELEMENTS_PER_LDG>
__device__ __inline__ void copy_vector(Datatype* dst, const Datatype* src);

template <>
__device__ __inline__ void copy_vector<c10::BFloat16, 1>(c10::BFloat16* dst, const c10::BFloat16* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<c10::BFloat16, 4>(c10::BFloat16* dst, const c10::BFloat16* src) {
  *((float2*)dst) = *((float2*)src);
}

template <>
__device__ __inline__ void copy_vector<c10::Half, 1>(c10::Half* dst, const c10::Half* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<c10::Half, 4>(c10::Half* dst, const c10::Half* src) {
  *((float2*)dst) = *((float2*)src);
}

template <>
__device__ __inline__ void copy_vector<uint8_t, 1>(uint8_t* dst, const uint8_t* src) {
  *dst = *src;
}

template <>
__device__ __inline__ void copy_vector<uint8_t, 4>(uint8_t* dst, const uint8_t* src) {
  *((half2*)dst) = *((half2*)src);
}

template <typename Datatype, int ELEMENTS_PER_LDG>
__device__ __inline__ void copy_zero_vector(Datatype* dst);

template <>
__device__ __inline__ void copy_zero_vector<c10::BFloat16, 1>(c10::BFloat16* dst) {
  *dst = 0.0;
}

template <>
__device__ __inline__ void copy_zero_vector<c10::BFloat16, 4>(c10::BFloat16* dst) {
  *((float2*)dst) = make_float2(0.0f, 0.0f);
}

template <>
__device__ __inline__ void copy_zero_vector<c10::Half, 1>(c10::Half* dst) {
  *dst = 0.0;
}

template <>
__device__ __inline__ void copy_zero_vector<c10::Half, 4>(c10::Half* dst) {
  *((float2*)dst) = make_float2(0.0f, 0.0f);
}

int log2_ceil(int value) {
  int log2_value = 0;
  while ((1 << log2_value) < value) ++log2_value;
  return log2_value;
}

template <typename T>
struct Add {
  __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
};

template <typename T>
struct Max {
  __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; }
};

template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize,
                                                  unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  return __shfl_xor(value, laneMask, width);
#endif
}

template <typename acc_t, int WARP_BATCH, int WARP_SIZE, template <typename> class ReduceOp>
__device__ __forceinline__ void warp_reduce(acc_t* sum) {
  ReduceOp<acc_t> r;
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int i = 0; i < WARP_BATCH; ++i) {
      acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE);
      sum[i] = r(sum[i], b);
    }
  }
}

/*
 * Extended softmax (from native aten pytorch) with following additional features
 * 1) input scaling
 * 2) Implicit time (diagonal masking)
 */
template <typename input_t, typename output_t, typename acc_t, int log2_elements>
__global__ void scaled_upper_triang_masked_softmax_warp_forward(output_t* dst, const input_t* src, const acc_t scale,
                                                                int micro_batch_size, int stride, int element_count) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
  // warp_size of method warp_softmax_forward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
  constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
  constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;

  long int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x;
  int local_seq = blockIdx.x + 1;
  int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1) / WARP_SIZE;

  // micro_batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = micro_batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the batch
  int local_idx = threadIdx.x;

  long int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  src += thread_offset;
  dst += thread_offset;

  // load data from global memory
  acc_t elements[WARP_BATCH][WARP_ITERATIONS];
  input_t temp_data[ELEMENTS_PER_LDG_STG];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : local_seq;

#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;

      if (element_index < batch_element_count) {
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_data, src + i * element_count * stride + it * WARP_SIZE);

#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          if ((element_index + element) < batch_element_count) {
            elements[i][it + element] = (acc_t)temp_data[element] * scale;
          } else {
            elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
          }
        }
      } else {
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
        }
      }
    }
  }

  // compute max_value
  acc_t max_value[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    max_value[i] = elements[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);

  acc_t sum[WARP_BATCH]{0.0f};
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; ++it) {
      if (it < warp_iteration_limit) {
        elements[i][it] = std::exp((elements[i][it] - max_value[i]));
        sum[i] += elements[i][it];
      }
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);

  // store result
  output_t out[ELEMENTS_PER_LDG_STG];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;

      if (element_index < local_seq) {
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          if (element_index + element < local_seq) {
            out[element] = elements[i][it + element] / sum[i];
          } else {
            out[element] = 0;
          }
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count * stride + it * WARP_SIZE, out);
      } else if (element_index < element_count) {
        copy_zero_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count * stride + it * WARP_SIZE);
      } else {
        break;
      }
    }
  }
}

template <typename input_t, typename output_t, typename acc_t, int log2_elements>
__global__ void scaled_upper_triang_masked_softmax_warp_backward(output_t* gradInput, input_t* grad,
                                                                 const input_t* output, acc_t scale,
                                                                 int micro_batch_size, int stride, int element_count) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
  // warp_size of method warp_softmax_backward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
  constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
  constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;

  long int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x;
  int local_seq = blockIdx.x + 1;

  // micro_batch_size might not be a multiple of WARP_BATCH. Check how
  // many batches have to computed within this WARP.
  int local_batches = micro_batch_size - first_batch;
  if (local_batches > WARP_BATCH) local_batches = WARP_BATCH;

  // there might be multiple batches per warp. compute the index within the batch
  int local_idx = threadIdx.x;

  // the first element to process by the current thread
  long int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
  grad += thread_offset;
  output += thread_offset;
  gradInput += thread_offset;

  // load data from global memory
  acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]{0.0f};
  acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]{0.0f};
  input_t temp_grad[ELEMENTS_PER_LDG_STG];
  input_t temp_output[ELEMENTS_PER_LDG_STG];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    int batch_element_count = (i >= local_batches) ? 0 : local_seq;

#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < batch_element_count) {
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_grad, grad + i * element_count * stride + it * WARP_SIZE);
        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_output, output + i * element_count * stride + it * WARP_SIZE);

#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          if (element_index + element < batch_element_count) {
            output_reg[i][it + element] = (acc_t)temp_output[element];
          }
        }
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          if (element_index + element < batch_element_count) {
            grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element];
          }
        }
      }
    }
  }

  acc_t sum[WARP_BATCH];
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    sum[i] = grad_reg[i][0];
#pragma unroll
    for (int it = 1; it < WARP_ITERATIONS; ++it) {
      sum[i] += grad_reg[i][it];
    }
  }
  warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);

// store result
#pragma unroll
  for (int i = 0; i < WARP_BATCH; ++i) {
    if (i >= local_batches) break;
#pragma unroll
    for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
      int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
      if (element_index < element_count) {
        // compute gradients
        output_t out[ELEMENTS_PER_LDG_STG];
#pragma unroll
        for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
          out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i]));
        }
        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(gradInput + i * element_count * stride + it * WARP_SIZE, out);
      }
    }
  }
}

}  // end of anonymous namespace

template <typename input_t, typename output_t, typename acc_t>
void dispatch_scaled_upper_triang_masked_softmax_forward(output_t* dst, const input_t* src, const input_t scale,
                                                         int softmax_elements, int softmax_elements_stride,
                                                         int attn_batches) {
  TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 16384);
  if (softmax_elements == 0) {
    return;
  } else {
    int log2_elements = log2_ceil(softmax_elements);
    const int next_power_of_two = 1 << log2_elements;
    int seq_len = softmax_elements;
    int batch_count = attn_batches * seq_len;

    // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0);

    int blocks_per_seq = attn_batches / batches_per_block;
    dim3 blocks(seq_len, blocks_per_seq, 1);
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 0>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 1:  // 2
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 1>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 2:  // 4
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 2>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 3:  // 8
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 3>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 4:  // 16
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 4>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 5:  // 32
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 5>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 6:  // 64
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 6>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 7:  // 128
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 7>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 8:  // 256
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 8>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 9:  // 512
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 9>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 10:  // 1024
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 10>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 11:  // 2048
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 11>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 12:  // 4096
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 12>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 13:  // 8192
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 13>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 14:  // 16384
        scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 14>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      default:
        break;
    }
  }
}

template <typename input_t, typename output_t, typename acc_t>
void dispatch_scaled_upper_triang_masked_softmax_backward(output_t* grad_input, input_t* grad, const input_t* output,
                                                          const acc_t scale, int softmax_elements,
                                                          int softmax_elements_stride, int attn_batches) {
  TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 16384);
  if (softmax_elements == 0) {
    return;
  } else {
    int log2_elements = log2_ceil(softmax_elements);
    const int next_power_of_two = 1 << log2_elements;
    int seq_len = softmax_elements;
    int batch_count = attn_batches * seq_len;

    // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

    // use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;

    int warps_per_block = (threads_per_block / warp_size);
    int batches_per_block = warps_per_block * batches_per_warp;
    TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0);

    int blocks_per_seq = attn_batches / batches_per_block;
    dim3 blocks(seq_len, blocks_per_seq, 1);
    dim3 threads(warp_size, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR
    switch (log2_elements) {
      case 0:  // 1
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 0>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 1:  // 2
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 1>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 2:  // 4
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 2>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 3:  // 8
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 3>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 4:  // 16
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 4>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 5:  // 32
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 5>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 6:  // 64
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 6>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 7:  // 128
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 7>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 8:  // 256
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 8>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 9:  // 512
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 9>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 10:  // 1024
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 10>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 11:  // 2048
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 11>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 12:  // 4096
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 12>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 13:  // 8192
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 13>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      case 14:  // 16384
        scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 14>
            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count,
                                                                       softmax_elements_stride, softmax_elements);
        break;
      default:
        break;
    }
  }
}


================================================
FILE: csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu
================================================
/* coding=utf-8
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <torch/extension.h>

#include "scaled_upper_triang_masked_softmax.h"
#include "type_shim.h"

namespace multihead_attn {
namespace fused_softmax {
namespace scaled_upper_triang_masked_softmax {

torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor) {
  // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
  const int attn_batches = input.size(0);
  const int seq_len = input.size(1);
  TORCH_INTERNAL_ASSERT(seq_len <= 16384);

  // Output
  auto act_options = input.options().requires_grad(false);
  torch::Tensor softmax_results = torch::empty({attn_batches, seq_len, seq_len}, act_options);

  // Softmax Intermediate Result Ptr
  void* input_ptr = static_cast<void*>(input.data_ptr());
  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());

  DISPATCH_HALF_AND_BFLOAT(
      input.scalar_type(), "dispatch_scaled_upper_triang_masked_softmax_forward",
      dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
          reinterpret_cast<scalar_t*>(softmax_results_ptr), reinterpret_cast<const scalar_t*>(input_ptr), scale_factor,
          seq_len, seq_len, attn_batches););
  return softmax_results;
}

torch::Tensor bwd_cuda(torch::Tensor const& output_grads_, torch::Tensor const& softmax_results_, float scale_factor) {
  auto output_grads = output_grads_.contiguous();
  auto softmax_results = softmax_results_.contiguous();

  // output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
  const int attn_batches = output_grads.size(0);
  const int seq_len = output_grads.size(1);
  TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));

  void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());

  // Softmax Grad
  DISPATCH_HALF_AND_BFLOAT(
      output_grads_.scalar_type(), "dispatch_scaled_upper_triang_masked_softmax_backward",
      dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
          reinterpret_cast<scalar_t*>(output_grads_ptr), reinterpret_cast<scalar_t*>(output_grads_ptr),
          reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()), scale_factor, seq_len, seq_len,
          attn_batches););

  // backward pass is completely in-place
  return output_grads;
}
}  // namespace scaled_upper_triang_masked_softmax
}  // namespace fused_softmax
}  // namespace multihead_attn


================================================
FILE: csrc/mlp.cpp
================================================
#include <stdio.h>
#include <torch/extension.h>
#include <torch/torch.h>

#include <vector>

size_t get_mlp_reserved_space(int64_t batch_size, int num_layers, const int* output_features);

template <typename T>
size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features);

template <typename T>
int mlp_fp(T* X, int input_features, int batch_size, T** WPtr, int num_layers, int* output_features, T** BPtr, T* Y,
           T* reserved_space, int use_bias, int activation, void* lt_workspace);

template <typename T>
int mlp_bp(T* X, T* Y, int input_features, int batch_size, T** WPtr, int num_layers, int* output_features, T* dY,
           T* reserved_space, T* work_space, T* dX, T** dwPtr, T** dbPtr, bool requires_grad, int use_bias,
           int activation);

std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
  auto num_layers = inputs.size() - 1;
  if (use_bias) {
    // inputs contains (input, weights, biases)
    num_layers /= 2;
  }
  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

  std::vector<int> output_features;
  for (int i = 0; i < num_layers; i++) {
    output_features.push_back(inputs[i + 1].size(0));
  }

  auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());

  // create output/workspace tensor
  auto out = at::empty({batch_size, output_features.back()}, inputs[0].type());
  auto reserved_space = at::empty({static_cast<long>(reserved_size)}, inputs[0].type());
  // allocate fixed 4MB workspace for cublaslt for now, and this gets at least 4 MB
  auto lt_workspace = at::empty({1 << 22}, inputs[0].type());

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].scalar_type(), "mlp_forward", [&] {
    std::vector<scalar_t*> w_ptr;
    std::vector<scalar_t*> b_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
      if (use_bias) {
        b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
      }
    }
    [[maybe_unused]] auto result = mlp_fp<scalar_t>(inputs[0].data_ptr<scalar_t>(), input_features, batch_size,
                                                    w_ptr.data(), num_layers, output_features.data(), b_ptr.data(),
                                                    out.data_ptr<scalar_t>(), reserved_space.data_ptr<scalar_t>(),
                                                    use_bias, activation, (void*)(lt_workspace.data_ptr<scalar_t>()));
  });

  return {out, reserved_space};
}

std::vector<at::Tensor> mlp_backward(int use_bias, int activation, at::Tensor grad_o,
                                     std::vector<at::Tensor> fprop_outputs, std::vector<at::Tensor> inputs) {
  auto num_layers = inputs.size() - 1;
  if (use_bias) {
    // inputs contains (input, weights, biases)
    num_layers /= 2;
  }

  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

  bool requires_grad = inputs[0].requires_grad();

  std::vector<int> output_features;
  for (int i = 0; i < num_layers; i++) {
    output_features.push_back(inputs[i + 1].size(0));
  }
  // create outputs, length of inputs
  std::vector<at::Tensor> outputs;
  for (int i = 0; i < inputs.size(); i++) {
    outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type()));  // clone for testing now
  }

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].scalar_type(), "mlp_backward", [&] {
    std::vector<scalar_t*> w_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
    }
    std::vector<scalar_t*> outputs_ptr;
    for (int i = 0; i < inputs.size(); i++) {
      outputs_ptr.push_back(outputs[i].data_ptr<scalar_t>());
    }

    auto work_size = get_mlp_bp_workspace_in_bytes<scalar_t>(batch_size, num_layers, output_features.data());

    // auto work_space = at::empty({work_size*4}, at::kByte);
    auto work_space = at::empty({static_cast<long>(work_size / sizeof(scalar_t))}, inputs[0].type());

    [[maybe_unused]] auto result = mlp_bp<scalar_t>(
        inputs[0].data_ptr<scalar_t>(), fprop_outputs[0].data_ptr<scalar_t>(), input_features, batch_size, w_ptr.data(),
        num_layers, output_features.data(), grad_o.contiguous().data_ptr<scalar_t>(),
        fprop_outputs[1].data_ptr<scalar_t>(), work_space.data_ptr<scalar_t>(), outputs_ptr[0], outputs_ptr.data() + 1,
        outputs_ptr.data() + 1 + num_layers, requires_grad, use_bias, activation);
  });

  return outputs;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &mlp_forward, "MLP forward", py::call_guard<py::gil_scoped_release>());
  m.def("backward", &mlp_backward, "MLP backward", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/mlp_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <torch/torch.h>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
// includes cublaslt
#include <cublasLt.h>
#endif
// constants for fused bias+relu kernel
#define BIAS_RELU_FW_NTHREADS 128    // forward number of thread per block
#define BIAS_RELU_BW_NTHREADS_X 32   // backward number of thread in feature dim
#define BIAS_RELU_BW_NTHREADS_Y 16   // backward number of thread in batch dim
#define BIAS_RELU_RED_PER_THREAD 16  // backward minimal reduction length per thread

// move to a header later on
#define ILP 4
template <typename T>
__host__ __device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template <typename T>
__device__ __forceinline__ void load_store(T* dst, volatile T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template <typename T>
__device__ __forceinline__ void load_store(volatile T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

// Keep ReLU in float only. When using half, cast to float before calling.
__device__ __inline__ float relu(float a) {
  float retf = max(a, 0.f);
  return (retf);
}

// Keep Sigmoid in float only. When using half, cast to float before calling.
__device__ __inline__ float sigmoid(float a) {
  float retf = 1.f / (1.f + expf(-a));
  return (retf);
}

// FP64 Wrapper around cublas GEMMEx
cublasStatus_t mlp_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                        float* alpha, const double* A, int lda, const double* B, int ldb, const float* beta, double* C,
                        int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_64F, lda, B, CUDA_R_64F, ldb, beta, C,
                      CUDA_R_64F, ldc, CUDA_R_64F, CUBLAS_GEMM_DEFAULT);
}

// FP32 Wrapper around cublas GEMMEx
cublasStatus_t mlp_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                        float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C,
                        int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_32F, lda, B, CUDA_R_32F, ldb, beta, C,
                      CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}

// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t mlp_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                        float* alpha, const at::Half* A, int lda, const at::Half* B, int ldb, float* beta, at::Half* C,
                        int ldc) {
  return cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, CUDA_R_16F, lda, B, CUDA_R_16F, ldb, beta, C,
                      CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
int mlp_gemm_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                float* alpha,                                                        /* host pointer */
                const at::Half* A, int lda, const at::Half* B, int ldb, float* beta, /* host pointer */
                at::Half* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                bool use_relu, const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    if (use_relu) {
      epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
    } else {
      epilogue = CUBLASLT_EPILOGUE_BIAS;
    }
  } else {
    if (use_relu) {
      epilogue = CUBLASLT_EPILOGUE_RELU;
    }
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          &heuristicResult.algo, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int mlp_gemm_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                float* alpha,                                                    /* host pointer */
                const double* A, int lda, const double* B, int ldb, float* beta, /* host pointer */
                double* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                bool use_relu, const void* bias) {
  return 1;
}

int mlp_gemm_lt(cublasLtHandle_t ltHandle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
                float* alpha,                                                  /* host pointer */
                const float* A, int lda, const float* B, int ldb, float* beta, /* host pointer */
                float* C, int ldc, void* workspace, size_t workspaceSize, cudaStream_t stream, bool use_bias,
                bool use_relu, const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    if (use_relu) {
      epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
    } else {
      epilogue = CUBLASLT_EPILOGUE_BIAS;
    }
  } else {
    if (use_relu) {
      epilogue = CUBLASLT_EPILOGUE_RELU;
    }
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status =
      cublasLtMatrixLayoutInit(&Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status =
      cublasLtMatrixLayoutInit(&Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize,
                                                sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1,
                                          &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc, alpha, A, &Adesc, B, &Bdesc, beta, C, &Cdesc, C, &Cdesc,
                          &heuristicResult.algo, workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
#endif

// Bias ADD. Assume input X is [features x batch size], column major.
// Bias is one 'features' long vector, with implicit broadcast.
template <typename T>
__global__ void biasAdd_fprop(T* X, T* b, uint batch_size, uint features) {
  T r_x[ILP];
  T r_b[ILP];
  if (is_aligned(X) && is_aligned(b) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      int row = tid % (features / ILP);
      load_store(r_x, X, 0, tid);
      load_store(r_b, b, 0, row);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
        r_x[ii] = bias_sum;
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          int row = tid % features;
          r_x[ii] = X[idx];
          r_b[ii] = b[row];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
        r_x[ii] = bias_sum;
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}

// Bias ADD + ReLU. Assume input X is [features x batch size], column major.
// Activation support fuesed ReLU. Safe to call in-place.
template <typename T>
__global__ void biasAddRelu_fprop(T* X, T* b, uint batch_size, uint features) {
  T r_x[ILP];
  T r_b[ILP];
  if (is_aligned(X) && is_aligned(b) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      int row = tid % (features / ILP);
      load_store(r_x, X, 0, tid);
      load_store(r_b, b, 0, row);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
        r_x[ii] = relu(bias_sum);
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          int row = tid % features;
          r_x[ii] = X[idx];
          r_b[ii] = b[row];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
        r_x[ii] = relu(bias_sum);
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}

// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Relu_fprop(T* X, uint batch_size, uint features) {
  T r_x[ILP];
  if (is_aligned(X) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_x, X, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = relu(static_cast<float>(r_x[ii]));
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_x[ii] = X[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = relu(static_cast<float>(r_x[ii]));
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}

// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Sigmoid_fprop(T* X, uint batch_size, uint features) {
  T r_x[ILP];
  if (is_aligned(X) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_x, X, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_x[ii] = X[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}

// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Relu_bprop(T* dY, T* Y, uint batch_size, uint features, T* dX) {
  T r_dy[ILP];
  T r_y[ILP];
  if (is_aligned(dY) && is_aligned(Y) && is_aligned(dX) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_dy, dY, 0, tid);
      load_store(r_y, Y, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f) r_dy[ii] = 0;
      }
      load_store(dX, r_dy, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_dy[ii] = dY[idx];
          r_y[ii] = Y[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f) r_dy[ii] = 0;
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          dX[idx] = r_dy[ii];
        }
      }
    }
  }
}

// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Sigmoid_bprop(T* dY, T* Y, uint batch_size, uint features, T* dX) {
  T r_dy[ILP];
  T r_y[ILP];
  if (is_aligned(dY) && is_aligned(Y) && is_aligned(dX) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_dy, dY, 0, tid);
      load_store(r_y, Y, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float grad_out = r_dy[ii];
        float out = r_y[ii];
        float grad_i = out * (1.f - out) * grad_out;
        r_dy[ii] = grad_i;
      }
      load_store(dX, r_dy, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_dy[ii] = dY[idx];
          r_y[ii] = Y[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float grad_out = r_dy[ii];
        float out = r_y[ii];
        float grad_i = out * (1.f - out) * grad_out;
        r_dy[ii] = grad_i;
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          dX[idx] = r_dy[ii];
        }
      }
    }
  }
}

// Compute grid size for pointwise backward kernel.
// block_x/y is total elment being handled per block, not number of threads
void get_biasAddRelu_bprop_grid_size(int yfeat, int batch_size, int block_x, int block_y, int* grid_x, int* grid_y) {
  *grid_x = (yfeat + block_x - 1) / block_x;
  // Get number of SMs for efficient reduction.
  int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
  // can switch to occupancy calculation. use 4 below now for sm_70
  int max_blocks_y = (num_SMs * 4 + (*grid_x) - 1) / (*grid_x);
  // block_y should be from minimal work per thread
  int nRedSplits = (batch_size + block_y - 1) / block_y;
  // increase number of elem per thread redcution to not launch more than enough
  // kernel adjust work, so here we just launch max block
  *grid_y = std::min(nRedSplits, max_blocks_y);
  return;
}

// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAdd_bprop(T* dY, int features, int batch_size, volatile float* intermediate, int* semaphores,
                              T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;

  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;
  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y];

  // Accumulate db in FP32 always
  float db_local = 0;
  if (f < features) {
    int nidx = 0;
    // Handle non-multiple of UNROLL_FACTOR residue
    for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
      int64_t row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
      db_local += (float)dY[flat_idx];
    }

    // Handle meat of work
    for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
      int64_t row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
#pragma unroll 4
      for (int u = 0; u < UNROLL_FACTOR; u++) {
        db_local += (float)dY[flat_idx];
        flat_idx += features;
      }
    }

    // naive block reduction on y-dim
    int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
    smem[linear_idx] = db_local;
  }
  __syncthreads();
  if (f < features) {
    if (threadIdx.y == 0) {
      for (int yidx = 1; yidx < blockDim.y; yidx++) {
        db_local += smem[yidx * blockDim.x + threadIdx.x];
      }

      // block result is in db_local now for all threadIdx.y == 0
      // Write out partial result
      out[f] = db_local;
    }
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

  db_local = 0;
  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock && f < features) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        db_local += (float)(intermediate[col * features + row]);
      }
      db[f] = (T)db_local;
    }
  }
}

// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAddRelu_bprop(T* Y, T* dY, int features, int batch_size, T* dX, volatile float* intermediate,
                                  int* semaphores, T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;

  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;
  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y];

  // Accumulate db in FP32 always
  float db_local = 0;
  if (f < features) {
    int nidx = 0;
    // Handle non-multiple of UNROLL_FACTOR residue
    for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
      T y_val = Y[flat_idx];
      T dy_val = dY[flat_idx];
      T dx_val;
      if ((float)y_val > 0.f)
        dx_val = dy_val;
      else
        dx_val = 0;
      dX[flat_idx] = dx_val;
      db_local += (float)dx_val;
    }

    // Handle meat of work
    for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
#pragma unroll 4
      for (int u = 0; u < UNROLL_FACTOR; u++) {
        T y_val = Y[flat_idx];
        T dy_val = dY[flat_idx];
        T dx_val;
        if ((float)y_val > 0.f)
          dx_val = dy_val;
        else
          dx_val = 0;
        dX[flat_idx] = dx_val;
        db_local += (float)dx_val;
        flat_idx += features;
      }
    }

    // naive block reduction on y-dim
    int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
    smem[linear_idx] = db_local;
  }
  __syncthreads();
  if (f < features) {
    if (threadIdx.y == 0) {
      for (int yidx = 1; yidx < blockDim.y; yidx++) {
        db_local += smem[yidx * blockDim.x + threadIdx.x];
      }

      // block result is in db_local now for all threadIdx.y == 0
      // Write out partial result
      out[f] = db_local;
    }
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

  db_local = 0;
  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock && f < features) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        db_local += (float)(intermediate[col * features + row]);
      }
      db[f] = (T)db_local;
    }
  }
}

// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAddRelu_bprop_aligned(T* Y, T* dY, int features, int batch_size, T* dX,
                                          volatile float* intermediate, int* semaphores, T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;

  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;

  // Accumulate db in FP32 always
  float db_local[ILP];
  T r_y[ILP];
  T r_dy[ILP];
#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    db_local[ii] = 0.f;
  }

  // f always <= features in this case
  // if (f < features) {
  int nidx = 0;

  // Handle non-multiple of UNROLL_FACTOR residue
  for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
    int row, col, flat_idx;
    row = f;
    col = nStart + nidx;
    flat_idx = col * features / ILP + row;

    load_store(r_y, Y, 0, flat_idx);
    load_store(r_dy, dY, 0, flat_idx);
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      if ((float)r_y[ii] <= 0.f) r_dy[ii] = 0;
      db_local[ii] += (float)r_dy[ii];
    }
    load_store(dX, r_dy, flat_idx, 0);
  }

  // Handle meat of work
  for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
    int row, col, flat_idx;
    row = f;
    col = nStart + nidx;
    flat_idx = col * features / ILP + row;  // total threads in x == features/ILP
#pragma unroll
    for (int u = 0; u < UNROLL_FACTOR; u++) {
      load_store(r_y, Y, 0, flat_idx);
      load_store(r_dy, dY, 0, flat_idx);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f) r_dy[ii] = 0;
        db_local[ii] += (float)r_dy[ii];
      }
      load_store(dX, r_dy, flat_idx, 0);
      flat_idx += features / ILP;
    }
  }

  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y * ILP];
  // naive block reduction on y-dim
  int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
  float* smem_out = smem + ILP * linear_idx;
#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    smem_out[ii] = db_local[ii];  // reuse local dy buffer
  }
  __syncthreads();
  if (threadIdx.y == 0) {
    for (int yidx = 1; yidx < blockDim.y; yidx++) {
      float* smem_in = smem + ILP * (yidx * blockDim.x + threadIdx.x);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        db_local[ii] += smem_in[ii];  // reuse local dy buffer
      }
    }

    // block result is in db_local now for all threadIdx.y == 0
    if (gridDim.y == 1) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_dy[ii] = db_local[ii];  // reuse local dy buffer
      }
      load_store(db, r_dy, f, 0);
      return;
    }

    // Write out partial result
    load_store(out, db_local, f, 0);
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    db_local[ii] = 0.f;
  }
  float r_db[ILP];

  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        load_store(r_db, intermediate, 0, col * features / ILP + row);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          db_local[ii] += r_db[ii];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_dy[ii] = db_local[ii];  // reuse local dy buffer
      }
      load_store(db, r_dy, f, 0);
    }
  }
}

// Lists where the num_layers-1 intermediate Y buffers start in reserved space on fprop, starting
// offset 0. The last Y value is, of course, stored in the user provided output buffer.
void get_y_offsets(int batch_size, int num_layers, const int* output_features, int* y_start_offsets) {
  y_start_offsets[0] = 0;
  for (int i = 1; i < num_layers; i++) {
    y_start_offsets[i] = y_start_offsets[i - 1] + batch_size * output_features[i - 1];
  }
}

// Returns the reserved space (in elements) needed for the MLP
size_t get_mlp_reserved_space(int64_t batch_size, int num_layers, const int* output_features) {
  size_t res_space = 0;
  // Need to store output of every intermediate MLP - size equal to output_features[i] * batch_size
  // for all 'i' in [0, num_layers-1)
  for (int l = 0; l < num_layers; l++) {
    res_space += output_features[l] * batch_size;
  }
  return res_space;
}

// Returns the size of all fprop activations combined
size_t get_all_activations_size(int64_t batch_size, int num_layers, const int* output_features) {
  size_t acts_size = 0;
  for (int l = 0; l < num_layers; l++) {
    acts_size += output_features[l] * batch_size;
  }
  return acts_size;
}

#if 0
// Returns the work space (in elements) needed for the MLP bprop.
size_t get_mlp_bp_workspace (int batch_size, int num_layers, const int* output_features) {
    /*
       Workspace is partitioned as
       DY_GEMMs : DX_GEMMs
    */
    size_t work_space = 0;

    // Store each intermediate dY explicitly. Need 2 dYs per MLP layer (one for o/p
    // of biasReLU_bp and one for o/p of dgrad GEMM).
    work_space += 2*get_all_activations_size(batch_size, num_layers, output_features);

    return work_space;
}
#endif

// Scratch space needed for reductions in number of elements
size_t get_reduction_scratch_space(int batch_size, int num_layers, const int* output_features) {
  size_t max_scratch_space = 0;
  // Loop over all layers to see which one needs the max scratch space
  for (int l = 0; l < num_layers; l++) {
    // need to find max(aligned, not_aligned)
    int tmp, res0, res1;

    int block_x = BIAS_RELU_BW_NTHREADS_X;
    int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
    get_biasAddRelu_bprop_grid_size(output_features[l], batch_size, block_x, block_y, &tmp, &res0);

    block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
    get_biasAddRelu_bprop_grid_size(output_features[l], batch_size, block_x, block_y, &tmp, &res1);

    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res0));
    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res1));
  }

  return max_scratch_space;
}

// Buffer for semaphores
size_t get_semaphores_size(int num_layers, const int* output_features) {
  // Upper bound on semaphores is one per feature for the layer
  // with the most features.
  int max_features = 0;
  for (int l = 0; l < num_layers; l++) {
    max_features = std::max(max_features, output_features[l]);
  }
  return (size_t)max_features;
}

// Returns the work space (in elements) needed for the MLP bprop.
template <typename T>
size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features) {
  size_t work_space = 0;

  // Store each intermediate dY explicitly. Need 2 dYs per MLP layer (one for o/p
  // of biasReLU_bp and one for o/p of dgrad GEMM).
  work_space += 2 * get_all_activations_size(batch_size, num_layers, output_features) * sizeof(T);
  work_space += get_reduction_scratch_space(batch_size, num_layers, output_features) * sizeof(float);
  work_space += get_semaphores_size(num_layers, output_features) * sizeof(int);

  return work_space;
}

// Returns pointers to each segment of the workspace
template <typename T>
void partition_mlp_bp_workspace(int batch_size, int num_layers, const int* output_features, void* work_space,
                                T** dy_gemms, T** dx_gemms, float** db_scratch, int** semaphores) {
  /*
     Workspace is partitioned as
     DY_GEMMs : DX_GEMMs : DB_SCRATCH : SEMAPHORES
  */
  // Start address where dy_gemm tensors are stored
  *dy_gemms = reinterpret_cast<T*>(work_space);
  // Start address where dx_gemm tensors are stored
  *dx_gemms = *dy_gemms + get_all_activations_size(batch_size, num_layers, output_features);
  // Start address where db intermediate tensors are stored
  *db_scratch = reinterpret_cast<float*>(*dx_gemms + get_all_activations_size(batch_size, num_layers, output_features));
  // Start address of semaphores
  *semaphores =
      reinterpret_cast<int*>(*db_scratch + get_reduction_scratch_space(batch_size, num_layers, output_features));

  return;
}

// Does a simple MLP fprop (GEMM+bias+ReLU).
// Can handle num_layers number of layers, each with its own shape. Output of layer i is assumed
// to be input of layer i+1. output_features, WPtr and BPtr are arrays of length num_layers, and
// must be in the same order i.e. WPtr[i] and BPtr[i] are respectively the weight and bias of layer
// 'i'.
template <typename T>
int mlp_fp(T* X, int input_features, int batch_size, T** WPtr, int num_layers, int* output_features, T** BPtr, T* Y,
           T* reserved_space, int use_bias, int activation, void* lt_workspace) {
  T *weight, *input, *output, *bias;
  T *reserved_space_x, *reserved_space_y;
  reserved_space_x = NULL;
  reserved_space_y = reserved_space;

  // Get cublas handle from Pytorch
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);

  for (int layer = 0; layer < num_layers; layer++) {
    weight = WPtr[layer];
    input = (layer == 0) ? X : reserved_space_x;
    output = (layer == num_layers - 1) ? Y : reserved_space_y;
    if (use_bias) {
      bias = BPtr[layer];
    }
    int ifeat = (layer == 0) ? input_features : output_features[layer - 1];
    int ofeat = output_features[layer];

    float one = 1.f;
    float zero = 0.f;

    // try with cublaslt first for supported case with valid handle
    int cublaslt_status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
    if (activation < 1) {
      cublaslt_status = mlp_gemm_lt(
          // ltHandle,
          (cublasLtHandle_t)handle, CUBLAS_OP_T, CUBLAS_OP_N, ofeat, batch_size, ifeat, &one, weight, ifeat, input,
          ifeat, &zero, output, ofeat, lt_workspace, 1 << 22, stream, use_bias == 1, activation == 1, bias);
    }
#endif

    // if cublaslt failed or not executed, fallback to cublas
    if (cublaslt_status != 0) {
      cublasStatus_t cublas_status;
      // Call GEMM: fprop is Y = W'X
      cublas_status = mlp_gemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, ofeat, batch_size, ifeat, &one, weight, ifeat, input,
                               ifeat, &zero, output, ofeat);

      if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        printf("GEMM fprop failed with %d\n", cublas_status);
        return 1;
      }

      const uint& input_size = ofeat;
      int num_blocks = 0;
      int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
      // Call biasReLU
      if (use_bias == 1) {
        if (activation == 0) {  // no activation
          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
          biasAdd_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size,
                                                                                    input_size);
        } else if (activation == 1) {  // relu
          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAddRelu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
          biasAddRelu_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size,
                                                                                        input_size);
        } else if (activation == 2) {  // sigmoid
          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
          biasAdd_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size,
                                                                                    input_size);
          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
          Sigmoid_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
        }
      } else {
        // don't need to do anything in case of no activation and no bias
        if (activation == 1) {  // relu
          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Relu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
          Relu_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
        } else if (activation == 2) {  // sigmoid
          cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
          Sigmoid_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
        }
      }
    }
    // Set current output as next layer input
    reserved_space_x = reserved_space_y;
    // Set next layer output
    reserved_space_y += ofeat * batch_size;
  }

  return 0;
}

// Does a simple MLP bprop (GEMM+bias+ReLU).
// Needs reserved space to come back exactly as it was populated in fprop.
// Does dgrad and wgrad sequentially.
template <typename T>
int mlp_bp(T* X, T* Y, int input_features, int batch_size, T** WPtr, int num_layers, int* output_features, T* dY,
           T* reserved_space, T* work_space, T* dX, T** dwPtr, T** dbPtr, bool requires_grad, int use_bias,
           int activation) {
  T* weight;
  T *dweight, *dx, *dy, *dbias;
  T *x, *y;

  // Where the dx of the biasReLU (== dy of gemm) is stored. Can be thrown away
  // after bp call.
  T* dy_gemm_base;
  // Where the dx after GEMM is stored.
  T* dx_gemm_base;
  // Where partial reduction results are stored.
  float* db_scratch;
  // Semaphores for reduction.
  int* semaphores;

  partition_mlp_bp_workspace<T>(batch_size, num_layers, output_features, work_space, &dy_gemm_base, &dx_gemm_base,
                                &db_scratch, &semaphores);

  size_t semaphore_size = get_semaphores_size(num_layers, output_features) * sizeof(int);

  // Get cublas handle from Pytorch
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);

  int* y_offsets = (int*)malloc(num_layers * sizeof(int));
  get_y_offsets(batch_size, num_layers, output_features, y_offsets);

  for (int layer = num_layers - 1; layer >= 0; layer--) {
    weight = WPtr[layer];
    dweight = dwPtr[layer];

    // x is read from reserved space
    x = (layer == 0) ? X : reserved_space + y_offsets[layer - 1];
    // dx is written in workspace for all but layer==0
    dx = (layer == 0) ? dX : dx_gemm_base + y_offsets[layer - 1];

    // y is read from reserved space
    y = (layer == num_layers - 1) ? Y : reserved_space + y_offsets[layer];
    // dx from layer+1
    dy = (layer == num_layers - 1) ? dY : dx_gemm_base + y_offsets[layer];
    // dy_gemm is written to and read immediately
    T* dy_gemm = dy_gemm_base + y_offsets[layer];

    dbias = dbPtr[layer];
    int xfeat = (layer == 0) ? input_features : output_features[layer - 1];
    int yfeat = output_features[layer];

    float one = 1.f;
    float zero = 0.f;

    if (use_bias == 1) {
      if (activation == 0) {  // no acitvation
        // bgrad
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        int block_x = BIAS_RELU_BW_NTHREADS_X;
        int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
        get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
        dim3 grid(grid_x, grid_y);
        biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(dy, yfeat, batch_size, db_scratch, semaphores, dbias);
        // bypass dgrad through reset pointer
        dy_gemm = dy;
      } else if (activation == 1) {  // relu
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        if (yfeat % (ILP * BIAS_RELU_BW_NTHREADS_X) == 0 && is_aligned(y) && is_aligned(dy) && is_aligned(dy_gemm) &&
            is_aligned(dbias)) {
          int block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
          dim3 grid(grid_x, grid_y);
          biasAddRelu_bprop_aligned<T, 4>
              <<<grid, block, 0, stream>>>(y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
        } else {
          int block_x = BIAS_RELU_BW_NTHREADS_X;
          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
          dim3 grid(grid_x, grid_y);
          biasAddRelu_bprop<T, 4>
              <<<grid, block, 0, stream>>>(y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
        }
      } else if (activation == 2) {  // sigmoid
        // activation backward
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);

        // bgrad, from dy_gemm
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        int block_x = BIAS_RELU_BW_NTHREADS_X;
        int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
        get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
        dim3 grid(grid_x, grid_y);
        biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(dy_gemm, yfeat, batch_size, db_scratch, semaphores, dbias);
      }
    } else {  // no bias below
      if (activation == 0) {
        // bypass dgrad through reset pointer
        dy_gemm = dy;
      } else if (activation == 1) {  // relu
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Relu_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Relu_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);
      } else if (activation == 2) {  // sigmoid
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);
      }
    }

    cublasStatus_t cublas_status;
    // Call GEMM dgrad
    if (layer > 0 || requires_grad == 1) {
      cublas_status = mlp_gemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, xfeat, batch_size, yfeat, &one, weight, xfeat, dy_gemm,
                               yfeat, &zero, dx, xfeat);

      if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        printf("GEMM dgrad failed with %d\n", cublas_status);
        return 1;
      }
    }

    // Call GEMM wgrad
    cublas_status = mlp_gemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, xfeat, yfeat, batch_size, &one, x, xfeat, dy_gemm, yfeat,
                             &zero, dweight, xfeat);

    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
      printf("GEMM wgrad failed with %d\n", cublas_status);
      return 1;
    }
  }

  return 0;
}

// Instantiate for floating point types
template int mlp_fp<float>(float* X, int input_features, int batch_size, float** WPtr, int num_layers,
                           int* output_features, float** BPtr, float* Y, float* reserved_space, int use_bias,
                           int activation, void* lt_workspace);

template int mlp_bp<float>(float* X, float* Y, int input_features, int batch_size, float** WPtr, int num_layers,
                           int* output_features, float* dY, float* reserved_space, float* work_space, float* dX,
                           float** dwPtr, float** dbPtr, bool requires_grad, int use_bias, int activation);

template int mlp_fp<at::Half>(at::Half* X, int input_features, int batch_size, at::Half** WPtr, int num_layers,
                              int* output_features, at::Half** BPtr, at::Half* Y, at::Half* reserved_space,
                              int use_bias, int activation, void* lt_workspace);

template int mlp_bp<at::Half>(at::Half* X, at::Half* Y, int input_features, int batch_size, at::Half** WPtr,
                              int num_layers, int* output_features, at::Half* dY, at::Half* reserved_space,
                              at::Half* work_space, at::Half* dX, at::Half** dwPtr, at::Half** dbPtr,
                              bool requires_grad, int use_bias, int activation);

template int mlp_fp<double>(double* X, int input_features, int batch_size, double** WPtr, int num_layers,
                            int* output_features, double** BPtr, double* Y, double* reserved_space, int use_bias,
                            int activation, void* lt_workspace);

template int mlp_bp<double>(double* X, double* Y, int input_features, int batch_size, double** WPtr, int num_layers,
                            int* output_features, double* dY, double* reserved_space, double* work_space, double* dX,
                            double** dwPtr, double** dbPtr, bool requires_grad, int use_bias, int activation);

template size_t get_mlp_bp_workspace_in_bytes<float>(int batch_size, int num_layers, const int* output_features);
template size_t get_mlp_bp_workspace_in_bytes<at::Half>(int batch_size, int num_layers, const int* output_features);
template size_t get_mlp_bp_workspace_in_bytes<double>(int batch_size, int num_layers, const int* output_features);


================================================
FILE: csrc/multi_tensor_adagrad.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 1024
#define ILP 4

typedef enum {
  ADAGRAD_MODE_0 = 0,  // L2 regularization mode.
  ADAGRAD_MODE_1 = 1,  // AdamW-style weight decay.

} adagradMode_t;

using MATH_T = float;

template <typename T>
struct AdagradFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<3>& tl,
                                             const float epsilon, const float lr, adagradMode_t mode,
                                             const float weight_decay) {
    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    T* h = (T*)tl.addresses[2][tensor_loc];
    h += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      MATH_T r_g[ILP];
      MATH_T r_p[ILP];
      MATH_T r_h[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_g[ii] = g[i];
          r_p[ii] = p[i];
          r_h[ii] = h[i];
        } else {
          r_g[ii] = MATH_T(0);
          r_p[ii] = MATH_T(0);
          r_h[ii] = MATH_T(0);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if (mode == ADAGRAD_MODE_0) {  // L2
          r_g[ii] = r_g[ii] + weight_decay * r_p[ii];
          r_h[ii] = r_h[ii] + r_g[ii] * r_g[ii];
          r_p[ii] = r_p[ii] - lr * (r_g[ii] / (sqrtf(r_h[ii]) + epsilon));
        } else {  // AdamW-style
          r_h[ii] = r_h[ii] + r_g[ii] * r_g[ii];
          r_p[ii] = r_p[ii] - lr * (r_g[ii] / (sqrtf(r_h[ii]) + epsilon) + weight_decay * r_p[ii]);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          p[i] = r_p[ii];
          h[i] = r_h[ii];
        }
      }
    }
  }
};

void multi_tensor_adagrad_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                               const float lr, const float epsilon, const int mode, const float weight_decay) {
  using namespace at;

  // Assume single type across p,g,h now
  DISPATCH_DOUBLE_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "adagrad",
      multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, AdagradFunctor<scalar_t_0>(), epsilon, lr,
                            (adagradMode_t)mode, weight_decay);)

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: csrc/multi_tensor_adam.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

typedef enum {
  ADAM_MODE_0 = 0,  // L2 regularization mode
  ADAM_MODE_1 = 1   // Decoupled weight decay mode(AdamW)
} adamMode_t;

using MATH_T = float;

template <typename T, typename FULL_T, typename index_t>
struct AdamFunctor {
  __device__ __forceinline__ void operator()(index_t chunk_size, volatile int* noop_gmem, TensorListMetadata<4>& tl,
                                             const float beta1, const float beta2, const float beta1_correction,
                                             const float beta2_correction, const float epsilon, const float lr,
                                             adamMode_t mode, const float decay) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    index_t tensor_loc = tl.block_to_tensor[blockIdx.x];

    // potentially use to pass in list of scalar
    // int tensor_num = tl.start_tensor_this_launch + tensor_loc;

    index_t chunk_idx = tl.block_to_chunk[blockIdx.x];
    index_t n = tl.sizes[tensor_loc];

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    FULL_T* m = (FULL_T*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    FULL_T* v = (FULL_T*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for (index_t i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      MATH_T r_g[ILP];
      MATH_T r_p[ILP];
      MATH_T r_m[ILP];
      MATH_T r_v[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_g[ii] = g[i];
          r_p[ii] = p[i];
          r_m[ii] = m[i];
          r_v[ii] = v[i];
        } else {
          r_g[ii] = MATH_T(0);
          r_p[ii] = MATH_T(0);
          r_m[ii] = MATH_T(0);
          r_v[ii] = MATH_T(0);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if (mode == ADAM_MODE_0) {  // L2
          r_g[ii] = r_g[ii] + (decay * r_p[ii]);
          r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
          r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          MATH_T update = next_m_unbiased / denom;
          r_p[ii] = r_p[ii] - (lr * update);
        } else {  // weight decay
          r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
          r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
          r_p[ii] = r_p[ii] - (lr * update);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          p[i] = r_p[ii];
          m[i] = r_m[ii];
          v[i] = r_v[ii];
        }
      }
    }
  }
};

template <typename T, typename FULL_T>
struct AdamCapturableFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<4>& tl,
                                             const float beta1, const float beta2, const int* step,
                                             const int bias_correction, const float epsilon, const float* lr,
                                             adamMode_t mode, const float decay, const float* inv_scale) {
    if (*noop_gmem == 1) return;

    float beta1_correction = 1.0f, beta2_correction = 1.0f;
    if (bias_correction == 1) {
      beta1_correction = 1 - pow(beta1, *step);
      beta2_correction = 1 - pow(beta2, *step);
    }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];

    // potentially use to pass in list of scalar
    // int tensor_num = tl.start_tensor_this_launch + tensor_loc;

    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    FULL_T* m = (FULL_T*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    FULL_T* v = (FULL_T*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      MATH_T r_g[ILP];
      MATH_T r_p[ILP];
      MATH_T r_m[ILP];
      MATH_T r_v[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_g[ii] = static_cast<MATH_T>(g[i]) * (*inv_scale);
          g[i] = static_cast<T>(r_g[ii]);
          r_p[ii] = static_cast<MATH_T>(p[i]);
          r_m[ii] = static_cast<MATH_T>(m[i]);
          r_v[ii] = static_cast<MATH_T>(v[i]);
        } else {
          r_g[ii] = MATH_T(0);
          r_p[ii] = MATH_T(0);
          r_m[ii] = MATH_T(0);
          r_v[ii] = MATH_T(0);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if (mode == ADAM_MODE_0) {  // L2
          r_g[ii] = r_g[ii] + (decay * r_p[ii]);
          r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
          r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          MATH_T update = next_m_unbiased / denom;
          r_p[ii] = r_p[ii] - (*lr * update);
        } else {  // weight decay
          r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
          r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
          r_p[ii] = r_p[ii] - (*lr * update);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          p[i] = static_cast<T>(r_p[ii]);
          m[i] = static_cast<T>(r_m[ii]);
          v[i] = static_cast<T>(r_v[ii]);
        }
      }
    }
  }
};

template <typename T, typename FULL_T>
struct AdamCapturableMasterFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<5>& tl,
                                             const float beta1, const float beta2, const int* step,
                                             const int bias_correction, const float epsilon, const float* lr,
                                             adamMode_t mode, const float decay, const float* inv_scale) {
    if (*noop_gmem == 1) return;

    float beta1_correction = 1.0f, beta2_correction = 1.0f;
    if (bias_correction == 1) {
      beta1_correction = 1 - pow(beta1, *step);
      beta2_correction = 1 - pow(beta2, *step);
    }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];

    // potentially use to pass in list of scalar
    // int tensor_num = tl.start_tensor_this_launch + tensor_loc;

    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    FULL_T* m = (FULL_T*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    FULL_T* v = (FULL_T*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;

    FULL_T* p_master = (FULL_T*)tl.addresses[4][tensor_loc];
    p_master += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      MATH_T r_g[ILP];
      MATH_T r_p[ILP];
      MATH_T r_m[ILP];
      MATH_T r_v[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_g[ii] = static_cast<MATH_T>(g[i]) * (*inv_scale);
          g[i] = static_cast<T>(r_g[ii]);
          r_p[ii] = static_cast<MATH_T>(p_master[i]);
          r_m[ii] = static_cast<MATH_T>(m[i]);
          r_v[ii] = static_cast<MATH_T>(v[i]);
        } else {
          r_g[ii] = MATH_T(0);
          r_p[ii] = MATH_T(0);
          r_m[ii] = MATH_T(0);
          r_v[ii] = MATH_T(0);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if (mode == ADAM_MODE_0) {  // L2
          r_g[ii] = r_g[ii] + (decay * r_p[ii]);
          r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
          r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          MATH_T update = next_m_unbiased / denom;
          r_p[ii] = r_p[ii] - (*lr * update);
        } else {  // weight decay
          r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
          r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
          r_p[ii] = r_p[ii] - (*lr * update);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          p[i] = static_cast<T>(r_p[ii]);
          p_master[i] = static_cast<FULL_T>(r_p[ii]);
          m[i] = static_cast<FULL_T>(r_m[ii]);
          v[i] = static_cast<FULL_T>(r_v[ii]);
        }
      }
    }
  }
};

void multi_tensor_adam_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                            const float lr, const float beta1, const float beta2, const float epsilon, const int step,
                            const int mode, const int bias_correction, const float weight_decay) {
  using namespace at;

  // Handle bias correction mode
  float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  if (bias_correction == 1) {
    bias_correction1 = 1 - std::pow(beta1, step);
    bias_correction2 = 1 - std::pow(beta2, step);
  }

  size_t max_size = 0;
  bool requires_64bit_indexing = false;
  for (auto it = tensor_lists.begin(); it != tensor_lists.end(); it++) {
    for (auto it2 = it->begin(); it2 != it->end(); it2++) {
      if (it2->numel() > max_size) {
        max_size = it2->numel();
        if (max_size >= INT_MAX) {
          requires_64bit_indexing = true;
          break;
        }
      }
    }
    if (requires_64bit_indexing) {
      break;
    }
  }

  if (requires_64bit_indexing) {
    // Assume single type across p,g,m1,m2 now
    DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
        tensor_lists[0][0].scalar_type(), 0, "adam",
        multi_tensor_apply<4>((int64_t)BLOCK_SIZE, (int64_t)chunk_size, noop_flag, tensor_lists,
                              AdamFunctor<scalar_t_0, float, int64_t>(), beta1, beta2, bias_correction1,
                              bias_correction2, epsilon, lr, (adamMode_t)mode, weight_decay);)
  } else {
    // Assume single type across p,g,m1,m2 now
    DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
        tensor_lists[0][0].scalar_type(), 0, "adam",
        multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                              AdamFunctor<scalar_t_0, float, int32_t>(), beta1, beta2, bias_correction1,
                              bias_correction2, epsilon, lr, (adamMode_t)mode, weight_decay);)
  }
  AT_CUDA_CHECK(cudaGetLastError());
}

void multi_tensor_adam_capturable_cuda(int chunk_size, at::Tensor noop_flag,
                                       std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor lr,
                                       const float beta1, const float beta2, const float epsilon, at::Tensor step,
                                       const int mode, const int bias_correction, const float weight_decay,
                                       at::Tensor inv_scale) {
  using namespace at;

  DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
      tensor_lists[0][0].scalar_type(), 0, "adam",
      multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, AdamCapturableFunctor<scalar_t_0, float>(),
                            beta1, beta2, step.data_ptr<int>(), bias_correction, epsilon, lr.data_ptr<float>(),
                            (adamMode_t)mode, weight_decay, inv_scale.data_ptr<float>());)

  AT_CUDA_CHECK(cudaGetLastError());
}

void multi_tensor_adam_capturable_master_cuda(int chunk_size, at::Tensor noop_flag,
                                              std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor lr,
                                              const float beta1, const float beta2, const float epsilon,
                                              at::Tensor step, const int mode, const int bias_correction,
                                              const float weight_decay, at::Tensor inv_scale) {
  using namespace at;

  DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
      tensor_lists[0][0].scalar_type(), 0, "adam",
      multi_tensor_apply<5>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                            AdamCapturableMasterFunctor<scalar_t_0, float>(), beta1, beta2, step.data_ptr<int>(),
                            bias_correction, epsilon, lr.data_ptr<float>(), (adamMode_t)mode, weight_decay,
                            inv_scale.data_ptr<float>());)

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: csrc/multi_tensor_apply.cuh
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <assert.h>
#include <c10/cuda/CUDAGuard.h>

// #include <iostream>

// This header is the one-stop shop for all your multi-tensor apply needs.

// TODO:  Kernel arg size limit may be <4KB for some other cards (ie Jetson)
constexpr int depth_to_max_tensors[6] = {110, 64, 48, 36, 30, 24};
constexpr int depth_to_max_blocks[6] = {320, 320, 320, 320, 320, 320};

template <int n>
struct TensorListMetadata {
  void* addresses[n][depth_to_max_tensors[n - 1]];
  int64_t sizes[depth_to_max_tensors[n - 1]];
  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
  int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a full int.
  int start_tensor_this_launch;
};

template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(int64_t chunk_size, volatile int* noop_flag, T tl, U callable,
                                          ArgTypes... args) {
  // Hand the chunk information to the user-supplied functor to process however it likes.
  callable(chunk_size, noop_flag, tl, args...);
}

template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(int64_t block_size, int64_t chunk_size, const at::Tensor& noop_flag,
                        const std::vector<std::vector<at::Tensor>>& tensor_lists, T callable, ArgTypes... args) {
  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
  int len0 = tensor_lists[0].size();
  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
  auto ref_device = tensor_lists[0][0].device();
  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
  for (int l = 0; l < tensor_lists.size(); l++)  // No range-based for because I need indices
  {
    TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
    for (int t = 0; t < tensor_lists[l].size(); t++) {
      // TODO:  Print which tensor fails.
      bool contiguous_memory = tensor_lists[l][t].is_contiguous();
      contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) ||
                           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
      TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
      TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
    }
  }

  int ntensors = tensor_lists[0].size();

  TensorListMetadata<depth> tl;

  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
  auto stream = at::cuda::getCurrentCUDAStream();

  tl.start_tensor_this_launch = 0;
  int loc_block_info = 0;
  int loc_tensor_info = 0;
  for (int t = 0; t < ntensors; t++) {
    tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
    for (int d = 0; d < depth; d++) tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
    loc_tensor_info++;

    auto chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

    for (auto chunk = 0; chunk < chunks_this_tensor; chunk++) {
      // std::cout << chunks_this_tensor << std::endl;
      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
      tl.block_to_chunk[loc_block_info] = chunk;
      loc_block_info++;

      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] && chunk == chunks_this_tensor - 1);
      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
      bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
      if (tensors_full || blocks_full || last_chunk) {
        // using accscalar_t = acc_type<scalar_t, true>;
        multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(chunk_size, noop_flag.data_ptr<int>(), tl,
                                                                             callable, args...);

        AT_CUDA_CHECK(cudaGetLastError());

        // Reset.  The control flow possibilities here make my brain hurt.
        loc_block_info = 0;
        if (chunk == chunks_this_tensor - 1) {
          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
          loc_tensor_info = 0;
          tl.start_tensor_this_launch = t + 1;
        } else {
          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
          tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
          for (int d = 0; d < depth; d++) tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
          loc_tensor_info = 1;
          tl.start_tensor_this_launch = t;
        }
      }
    }
  }
}


================================================
FILE: csrc/multi_tensor_axpby_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template <typename x_t, typename y_t, typename out_t>
struct AxpbyFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<3>& tl,
                                             float a, float b, int arg_to_check) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx * chunk_size;

    y_t* y = (y_t*)tl.addresses[1][tensor_loc];
    y += chunk_idx * chunk_size;

    out_t* out = (out_t*)tl.addresses[2][tensor_loc];
    out += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    bool finite = true;
    x_t r_x[ILP];
    y_t r_y[ILP];
    out_t r_out[ILP];

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x) && is_aligned(y) && is_aligned(out)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_x, x, 0, i_start);
        load_store(r_y, y, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_out[ii] = a * static_cast<float>(r_x[ii]) + b * static_cast<float>(r_y[ii]);
          if (arg_to_check == -1) finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
          if (arg_to_check == 0) finite = finite && isfinite(r_x[ii]);
          if (arg_to_check == 1) finite = finite && isfinite(r_y[ii]);
        }
        // store
        load_store(out, r_out, i_start, 0);
      }
    } else {
      // Non-divergent exit condition for __syncthreads, not necessary here
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_x[ii] = 0;
          r_y[ii] = 0;
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            r_x[ii] = x[i];
            r_y[ii] = y[i];
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_out[ii] = a * static_cast<float>(r_x[ii]) + b * static_cast<float>(r_y[ii]);
          if (arg_to_check == -1) finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
          if (arg_to_check == 0) finite = finite && isfinite(r_x[ii]);
          if (arg_to_check == 1) finite = finite && isfinite(r_y[ii]);
        }
        // see note in multi_tensor_scale_kernel.cu
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) out[i] = r_out[ii];
        }
      }
    }
    if (!finite) *noop_gmem = 1;  // Blindly fire off a write.  These will race but that's ok.
  }
};

void multi_tensor_axpby_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                             float a, float b, int arg_to_check) {
  using namespace at;
  // The output (downscaled) type is always float.
  // If build times suffer, think about where to put this dispatch,
  // and what logic should be moved out of multi_tensor_apply.

  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "multi_tensor_axpby_cuda",
      DISPATCH_FLOAT_AND_HALF(
          tensor_lists[1][0].scalar_type(), 1, "multi_tensor_axpby_cuda",
          DISPATCH_FLOAT_AND_HALF(
              tensor_lists[2][0].scalar_type(), 2, "multi_tensor_axpby_cuda",
              multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                    AxpbyFunctor<scalar_t_0, scalar_t_1, scalar_t_2>(), a, b, arg_to_check);)))

  AT_CUDA_CHECK(cudaGetLastError());

  // AT_CUDA_CHECK(cudaDeviceSynchronize());
}


================================================
FILE: csrc/multi_tensor_l2norm_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template <typename x_t>
struct L2NormFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<1>& tl,
                                             float* output, float* output_per_tensor, bool per_tensor,
                                             int max_chunks_per_tensor) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    __shared__ float s_vals[512];

    float vals[ILP];  // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for (int i = 0; i < ILP; i++) {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] += next * next;
        }
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            float next = static_cast<float>(x[i]);
            vals[ii] += next * next;
          }
        }
      }
    }

    float val = 0.f;
    for (int i = 0; i < ILP; i++) val += vals[i];

    float final = reduce_block_into_lanes(s_vals, val);

    if (threadIdx.x == 0) {
      if (!isfinite(final)) *noop_gmem = 1;  // Blindly fire off a write.  These will race but that's ok.
      output[blockIdx.x] += final;
      if (per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc) * max_chunks_per_tensor + chunk_idx] = final;
    }
  }
};

template <typename x_t>
struct UnscaleL2NormFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<1>& tl,
                                             const float* inv_scale, float* output, float* output_per_tensor,
                                             bool per_tensor, int max_chunks_per_tensor) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    __shared__ float s_vals[512];

    float vals[ILP];  // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for (int i = 0; i < ILP; i++) {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          float next = static_cast<float>(r_x[ii]) * (*inv_scale);
          vals[ii] += next * next;
        }
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            float next = static_cast<float>(x[i]) * (*inv_scale);
            vals[ii] += next * next;
          }
        }
      }
    }

    float val = 0.f;
    for (int i = 0; i < ILP; i++) val += vals[i];

    float final = reduce_block_into_lanes(s_vals, val);

    if (threadIdx.x == 0) {
      if (!isfinite(final)) *noop_gmem = 1;  // Blindly fire off a write.  These will race but that's ok.
      output[blockIdx.x] += final;
      if (per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc) * max_chunks_per_tensor + chunk_idx] = final;
    }
  }
};

// Probably better to template, but since we are not likely to support other norm
template <typename x_t>
struct MaxNormFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<1>& tl,
                                             float* output, float* output_per_tensor, bool per_tensor,
                                             int max_chunks_per_tensor) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    __shared__ float s_vals[512];

    float vals[ILP];  // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for (int i = 0; i < ILP; i++) {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
        }
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            float next = static_cast<float>(x[i]);
            vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
          }
        }
      }
    }

    float val = 0.f;
    for (int i = 0; i < ILP; i++) val = fmaxf(fabsf(val), fabsf(vals[i]));

    float final = reduce_block_into_lanes_max_op(s_vals, val);

    if (threadIdx.x == 0) {
      if (!isfinite(final)) *noop_gmem = 1;  // Blindly fire off a write.  These will race but that's ok.
      output[blockIdx.x] = fmaxf(fabsf(output[blockIdx.x]), fabsf(final));
      if (per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc) * max_chunks_per_tensor + chunk_idx] = final;
    }
  }
};

__global__ void cleanup(float* output, float* output_per_tensor, float* ret, float* ret_per_tensor, bool per_tensor,
                        int max_chunks_per_tensor) {
  __shared__ float vals[512];

  if (blockIdx.x == 0) {
    float val = 0;
    if (threadIdx.x < 320) val = output[threadIdx.x];

    float final = reduce_block_into_lanes(vals, val);

    if (threadIdx.x == 0) *ret = sqrt(final);
  }

  if (per_tensor) {
    float* output_this_tensor = output_per_tensor + blockIdx.x * max_chunks_per_tensor;

    float val = 0;
    for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) val += output_this_tensor[i];

    float final = reduce_block_into_lanes(vals, val);

    if (threadIdx.x == 0) ret_per_tensor[blockIdx.x] = sqrt(final);
  }
}

__global__ void cleanup_v2(float* output, float* output_per_tensor, float* ret, float* ret_per_tensor, bool per_tensor,
                           int max_chunks_per_tensor, int norm_type, float alpha, float beta) {
  __shared__ float vals[512];

  if (blockIdx.x == 0) {
    float val = 0;
    if (threadIdx.x < 320) val = output[threadIdx.x];

    if (norm_type == 0) {
      float final = reduce_block_into_lanes_max_op(vals, val);
      if (threadIdx.x == 0) *ret = alpha * (*ret) + beta * final;
    } else {
      float final = reduce_block_into_lanes(vals, val);
      if (threadIdx.x == 0) *ret = sqrt(alpha * (*ret) * (*ret) + beta * final);
    }
  }

  if (per_tensor) {
    float* output_this_tensor = output_per_tensor + blockIdx.x * max_chunks_per_tensor;

    if (norm_type == 0) {
      float val = 0;
      for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
        val = fmaxf(fabsf(val), fabsf(output_this_tensor[i]));

      float final = reduce_block_into_lanes_max_op(vals, val);

      if (threadIdx.x == 0) ret_per_tensor[blockIdx.x] = alpha * ret_per_tensor[blockIdx.x] + beta * final;
    } else {
      float val = 0;
      for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) val += output_this_tensor[i];

      float final = reduce_block_into_lanes(vals, val);

      if (threadIdx.x == 0)
        ret_per_tensor[blockIdx.x] =
            sqrt(alpha * ret_per_tensor[blockIdx.x] * ret_per_tensor[blockIdx.x] + beta * final);
    }
  }
}

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(int chunk_size, at::Tensor noop_flag,
                                                            std::vector<std::vector<at::Tensor>> tensor_lists,
                                                            at::optional<bool> per_tensor_python) {
  bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;

  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
  auto output = at::zeros({320}, float_options);

  at::Tensor output_per_tensor;
  at::Tensor ret_per_tensor;

  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

  if (per_tensor) {
    for (int t = 0; t < ntensors; t++) {
      int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
      if (max_chunks_this_tensor > max_chunks_per_tensor) max_chunks_per_tensor = max_chunks_this_tensor;
    }
    output_per_tensor = at::zeros({ntensors * max_chunks_per_tensor}, float_options);
    ret_per_tensor = at::empty({ntensors}, float_options);
  } else {
    ret_per_tensor = at::empty({0}, float_options);
  }

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
      multi_tensor_apply<1>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, L2NormFunctor<scalar_t_0>(),
                            output.data_ptr<float>(), per_tensor ? output_per_tensor.data_ptr<float>() : nullptr,
                            per_tensor, max_chunks_per_tensor);)

  AT_CUDA_CHECK(cudaGetLastError());
  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launches, but will be negligible end to end.
  // I could get rid of these by hacking the functor + multi tensor harness with persistence
  // logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
  cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
      output.data_ptr<float>(), per_tensor ? output_per_tensor.data_ptr<float>() : nullptr, ret.data_ptr<float>(),
      per_tensor ? ret_per_tensor.data_ptr<float>() : nullptr, per_tensor, max_chunks_per_tensor);

  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}

std::tuple<at::Tensor, at::Tensor> multi_tensor_unscale_l2norm_cuda(int chunk_size, at::Tensor noop_flag,
                                                                    std::vector<std::vector<at::Tensor>> tensor_lists,
                                                                    at::Tensor inv_scale,
                                                                    at::optional<bool> per_tensor_python) {
  bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;

  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
  auto output = at::zeros({320}, float_options);

  at::Tensor output_per_tensor;
  at::Tensor ret_per_tensor;

  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

  if (per_tensor) {
    for (int t = 0; t < ntensors; t++) {
      int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
      if (max_chunks_this_tensor > max_chunks_per_tensor) max_chunks_per_tensor = max_chunks_this_tensor;
    }
    output_per_tensor = at::zeros({ntensors * max_chunks_per_tensor}, float_options);
    ret_per_tensor = at::empty({ntensors}, float_options);
  } else {
    ret_per_tensor = at::empty({0}, float_options);
  }

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      tensor_lists[0][0].scalar_type(), 0, "multi_tensor_unscale_l2norm_cuda",
      multi_tensor_apply<1>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, UnscaleL2NormFunctor<scalar_t_0>(),
                            inv_scale.data_ptr<float>(), output.data_ptr<float>(),
                            per_tensor ? output_per_tensor.data_ptr<float>() : nullptr, per_tensor,
                            max_chunks_per_tensor);)

  AT_CUDA_CHECK(cudaGetLastError());
  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launches, but will be negligible end to end.
  // I could get rid of these by hacking the functor + multi tensor harness with persistence
  // logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
  cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
      output.data_ptr<float>(), per_tensor ? output_per_tensor.data_ptr<float>() : nullptr, ret.data_ptr<float>(),
      per_tensor ? ret_per_tensor.data_ptr<float>() : nullptr, per_tensor, max_chunks_per_tensor);

  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}

// Compute and update grad norm
// Here use a per tensor norm, and blend new norm(n) and old norm(gn) by
// L-2: gn = sqrt(a * gn^2 + b * n^2)
// L-inf: gn = a * gn + b * n
void multi_tensor_norm_out_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                                at::Tensor out, const float alpha, const float beta, const int norm_type) {
  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
  TORCH_CHECK(tensor_lists[0][0].device() == noop_flag.device(), "noop flag should be on the same device as tensors");
  // we don't need global thus uses empty here
  auto output = at::empty({320}, float_options);

  at::Tensor output_per_tensor;
  at::Tensor ret_per_tensor;

  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

  for (int t = 0; t < ntensors; t++) {
    int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
    if (max_chunks_this_tensor > max_chunks_per_tensor) max_chunks_per_tensor = max_chunks_this_tensor;
  }

  // Although it is single write then read, still need to be zero
  // Since tailing element also participate cleanup
  output_per_tensor = at::zeros({ntensors * max_chunks_per_tensor}, float_options);

  if (norm_type == 0) {
    DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_maxnorm_cuda",
                            multi_tensor_apply<1>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                                  MaxNormFunctor<scalar_t_0>(), output.data_ptr<float>(),
                                                  output_per_tensor.data_ptr<float>(), true, max_chunks_per_tensor);)
  } else {
    DISPATCH_FLOAT_HALF_AND_BFLOAT(
        tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
        multi_tensor_apply<1>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, L2NormFunctor<scalar_t_0>(),
                              output.data_ptr<float>(), output_per_tensor.data_ptr<float>(), true,
                              max_chunks_per_tensor);)
  }
  AT_CUDA_CHECK(cudaGetLastError());

  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launches, but will be negligible end to end.
  // I could get rid of these by hacking the functor + multi tensor harness with persistence
  // logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());

  // Adding the following device guard since it happens sometimes that the
  // tensors are on one device and the cuda stream is on another device which
  // results in ILLEGAL MEM ACCESS error.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
  cleanup_v2<<<ntensors, 512, 0, stream>>>(output.data_ptr<float>(), output_per_tensor.data_ptr<float>(),
                                           ret.data_ptr<float>(), out.data_ptr<float>(), true, max_chunks_per_tensor,
                                           norm_type, alpha, beta);

  return;
}


================================================
FILE: csrc/multi_tensor_l2norm_kernel_mp.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template <typename x_t>
struct L2NormFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<1>& tl,
                                             float* output, float* output_per_tensor, bool per_tensor,
                                             int max_chunks_per_tensor) {
    if (*noop_gmem) {
      return;
    }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    __shared__ float s_vals[512];

    float vals[ILP];  // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for (int i = 0; i < ILP; i++) {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] += next * next;
        }
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            float next = static_cast<float>(x[i]);
            vals[ii] += next * next;
          }
        }
      }
    }

    float val = 0.f;
    for (int i = 0; i < ILP; i++) val += vals[i];

    float final = reduce_block_into_lanes(s_vals, val);

    if (threadIdx.x == 0) {
      if (!isfinite(final)) *noop_gmem = 1;  // Blindly fire off a write.  These will race but that's ok.
      output[blockIdx.x] += final;
      if (per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc) * max_chunks_per_tensor + chunk_idx] = final;
    }
  }
};

__global__ void cleanup(float* output, float* output_per_tensor, float* ret, float* ret_per_tensor, bool per_tensor,
                        int max_chunks_per_tensor, volatile int* noop_gmem) {
  if (*noop_gmem) {
    return;
  }
  __shared__ float vals[512];

  if (blockIdx.x == 0) {
    float val = 0;
    if (threadIdx.x < 320) val = output[threadIdx.x];

    float final = reduce_block_into_lanes(vals, val);

    if (threadIdx.x == 0) *ret = sqrt(final);
  }

  if (per_tensor) {
    float* output_this_tensor = output_per_tensor + blockIdx.x * max_chunks_per_tensor;

    float val = 0;
    for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) val += output_this_tensor[i];

    float final = reduce_block_into_lanes(vals, val);

    if (threadIdx.x == 0) ret_per_tensor[blockIdx.x] = sqrt(final);
  }
}

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_mp_cuda(int chunk_size, at::Tensor noop_flag,
                                                               std::vector<std::vector<at::Tensor>> tensor_lists,
                                                               at::optional<bool> per_tensor_python) {
  bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;

  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
  auto output = at::zeros({320}, float_options);

  at::Tensor output_per_tensor;
  at::Tensor ret_per_tensor;

  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

  if (per_tensor) {
    for (int t = 0; t < ntensors; t++) {
      int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
      if (max_chunks_this_tensor > max_chunks_per_tensor) max_chunks_per_tensor = max_chunks_this_tensor;
    }
    output_per_tensor = at::zeros({ntensors * max_chunks_per_tensor}, float_options);
    ret_per_tensor = at::empty({ntensors}, float_options);
  } else {
    ret_per_tensor = at::empty({0}, float_options);
  }

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_mp_cuda",
      multi_tensor_apply<1>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, L2NormFunctor<scalar_t_0>(),
                            output.data_ptr<float>(), per_tensor ? output_per_tensor.data_ptr<float>() : nullptr,
                            per_tensor, max_chunks_per_tensor);)

  AT_CUDA_CHECK(cudaGetLastError());
  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launches, but will be negligible end to end.
  // I could get rid of these by hacking the functor + multi tensor harness with persistence
  // logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
  cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
      output.data_ptr<float>(), per_tensor ? output_per_tensor.data_ptr<float>() : nullptr, ret.data_ptr<float>(),
      per_tensor ? ret_per_tensor.data_ptr<float>() : nullptr, per_tensor, max_chunks_per_tensor,
      noop_flag.data_ptr<int>());

  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}


================================================
FILE: csrc/multi_tensor_l2norm_scale_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template <typename in_t, typename out_t>
struct L2NormScaleFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<2>& tl,
                                             float* output, float* output_per_tensor, float scale, bool per_tensor,
                                             int max_chunks_per_tensor) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    in_t* in = (in_t*)tl.addresses[0][tensor_loc];
    in += chunk_idx * chunk_size;

    out_t* out = (out_t*)tl.addresses[1][tensor_loc];
    out += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    __shared__ float s_vals[512];

    float vals[ILP];  // = {0}; // this probably works too but I want to be sure...
    in_t r_in[ILP];
    for (int i = 0; i < ILP; i++) {
      vals[i] = 0.f;
      r_in[i] = 0;
    }
    // bool finite = true;
    out_t r_out[ILP];

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(in) && is_aligned(out)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_in, in, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          float next = static_cast<float>(r_in[ii]);
          r_out[ii] = next * scale;
          vals[ii] += next * next;
          // finite = finite && isfinite(r_in[ii]);
        }
        load_store(out, r_out, i_start, 0);
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_in[ii] = 0;
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            r_in[ii] = in[i];
            float next = static_cast<float>(in[i]);
            vals[ii] += next * next;
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_out[ii] = static_cast<float>(r_in[ii]) * scale;
          // finite = finite && isfinite(r_in[ii]);
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) out[i] = r_out[ii];
        }
      }
    }

    float val = 0.f;
    for (int i = 0; i < ILP; i++) val += vals[i];

    float final = reduce_block_into_lanes(s_vals, val);

    if (threadIdx.x == 0) {
      if (!isfinite(final)) *noop_gmem = 1;  // Blindly fire off a write.  These will race but that's ok.
      output[blockIdx.x] += final;
      if (per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc) * max_chunks_per_tensor + chunk_idx] = final;
    }
  }
};
// Probably better to template, but since we are not likely to support other norm
template <typename x_t>
struct MaxNormFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<1>& tl,
                                             float* output, float* output_per_tensor, bool per_tensor,
                                             int max_chunks_per_tensor) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    __shared__ float s_vals[512];

    float vals[ILP];  // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for (int i = 0; i < ILP; i++) {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
        }
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            float next = static_cast<float>(x[i]);
            vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
          }
        }
      }
    }

    float val = 0.f;
    for (int i = 0; i < ILP; i++) val = fmaxf(fabsf(val), fabsf(vals[i]));

    float final = reduce_block_into_lanes_max_op(s_vals, val);

    if (threadIdx.x == 0) {
      if (!isfinite(final)) *noop_gmem = 1;  // Blindly fire off a write.  These will race but that's ok.
      output[blockIdx.x] = fmaxf(fabsf(output[blockIdx.x]), fabsf(final));
      if (per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc) * max_chunks_per_tensor + chunk_idx] = final;
    }
  }
};

__global__ void cleanup_v3(float* output, float* output_per_tensor, float* ret, float* ret_per_tensor, bool per_tensor,
                           int max_chunks_per_tensor) {
  __shared__ float vals[512];

  if (blockIdx.x == 0) {
    float val = 0;
    if (threadIdx.x < 320) val = output[threadIdx.x];

    float final = reduce_block_into_lanes(vals, val);

    if (threadIdx.x == 0) *ret = sqrt(final);
  }

  if (per_tensor) {
    float* output_this_tensor = output_per_tensor + blockIdx.x * max_chunks_per_tensor;

    float val = 0;
    for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) val += output_this_tensor[i];

    float final = reduce_block_into_lanes(vals, val);

    if (threadIdx.x == 0) ret_per_tensor[blockIdx.x] = sqrt(final);
  }
}

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_scale_cuda(int chunk_size, at::Tensor noop_flag,
                                                                  std::vector<std::vector<at::Tensor>> tensor_lists,
                                                                  float scale, at::optional<bool> per_tensor_python) {
  bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;

  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
  auto output = at::zeros({320}, float_options);

  at::Tensor output_per_tensor;
  at::Tensor ret_per_tensor;

  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

  if (per_tensor) {
    for (int t = 0; t < ntensors; t++) {
      int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
      if (max_chunks_this_tensor > max_chunks_per_tensor) max_chunks_per_tensor = max_chunks_this_tensor;
    }
    output_per_tensor = at::zeros({ntensors * max_chunks_per_tensor}, float_options);
    ret_per_tensor = at::empty({ntensors}, float_options);
  } else {
    ret_per_tensor = at::empty({0}, float_options);
  }

  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_scale_cuda",
      DISPATCH_FLOAT_AND_HALF(
          tensor_lists[1][0].scalar_type(), 1, "multi_tensor_l2norm_scale_cuda",
          multi_tensor_apply<2>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                L2NormScaleFunctor<scalar_t_0, scalar_t_1>(), output.data_ptr<float>(),
                                per_tensor ? output_per_tensor.data_ptr<float>() : nullptr, scale, per_tensor,
                                max_chunks_per_tensor);))

  AT_CUDA_CHECK(cudaGetLastError());
  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launches, but will be negligible end to end.
  // I could get rid of these by hacking the functor + multi tensor harness with persistence
  // logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
  cleanup_v3<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
      output.data_ptr<float>(), per_tensor ? output_per_tensor.data_ptr<float>() : nullptr, ret.data_ptr<float>(),
      per_tensor ? ret_per_tensor.data_ptr<float>() : nullptr, per_tensor, max_chunks_per_tensor);

  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}


================================================
FILE: csrc/multi_tensor_lamb.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

typedef enum {
  MOMENT_MODE_0 = 0,  // L2 regularization mode
  MOMENT_MODE_1 = 1   // Decoupled weight decay mode
} adamMode_t;

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(int chunk_size, at::Tensor noop_flag,
                                                            std::vector<std::vector<at::Tensor>> tensor_lists,
                                                            at::optional<bool> per_tensor_python);

using MATH_T = float;

template <typename T>
struct LAMBStage1Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<4>& tl,
                                             const float beta1, const float beta2, const float beta3,
                                             const float beta1_correction, const float beta2_correction,
                                             const float epsilon, adamMode_t mode, const float decay,
                                             const float* global_grad_norm, const float max_global_grad_norm) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float clipped_global_grad_norm =
        (*global_grad_norm) > max_global_grad_norm ? (*global_grad_norm) / max_global_grad_norm : 1.0f;

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    T* m = (T*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    T* v = (T*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    MATH_T r_g[ILP];
    MATH_T r_p[ILP];
    MATH_T r_m[ILP];
    MATH_T r_v[ILP];
    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(g) && is_aligned(p) && is_aligned(m) && is_aligned(v)) {
      T l_g[ILP];
      T l_p[ILP];
      T l_m[ILP];
      T l_v[ILP];
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(l_g, g, 0, i_start);
        if (decay != 0) load_store(l_p, p, 0, i_start);
        load_store(l_m, m, 0, i_start);
        load_store(l_v, v, 0, i_start);
        // unpack
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_g[ii] = l_g[ii];
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          } else {
            r_p[ii] = l_p[ii];
          }
          r_m[ii] = l_m[ii];
          r_v[ii] = l_v[ii];
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay * r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          } else {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          l_p[ii] = r_p[ii];
          l_m[ii] = r_m[ii];
          l_v[ii] = r_v[ii];
        }
        // store
        load_store(g, l_p, i_start, 0);
        load_store(m, l_m, i_start, 0);
        load_store(v, l_v, i_start, 0);
      }
    } else {
      // see note in multi_tensor_scale_kernel.cu
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
        MATH_T r_g[ILP];
        MATH_T r_p[ILP];
        MATH_T r_m[ILP];
        MATH_T r_v[ILP];
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            r_g[ii] = g[i];
            // special ?optimization? for lamb stage 1
            if (decay == 0) {
              r_p[ii] = MATH_T(0);
            } else {
              r_p[ii] = p[i];
            }
            r_m[ii] = m[i];
            r_v[ii] = v[i];
          } else {
            r_g[ii] = MATH_T(0);
            r_p[ii] = MATH_T(0);
            r_m[ii] = MATH_T(0);
            r_v[ii] = MATH_T(0);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay * r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          } else {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            g[i] = r_p[ii];
            m[i] = r_m[ii];
            v[i] = r_v[ii];
          }
        }
      }
    }
  }
};

// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
template <typename T>
struct LAMBStage2Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<2>& tl,
                                             const float* per_tensor_param_norm, const float* per_tensor_update_norm,
                                             const float learning_rate, const float decay, bool use_nvlamb) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    MATH_T ratio = learning_rate;
    // nvlamb: apply adaptive learning rate to all parameters
    // otherwise, only apply to those with non-zero weight decay
    if (use_nvlamb || (decay != 0.0)) {
      float param_norm = per_tensor_param_norm[tensor_num];
      float update_norm = per_tensor_update_norm[tensor_num];
      ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
    }

    T* update = (T*)tl.addresses[0][tensor_loc];
    update += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(update)) {
      T r_p[ILP];
      T r_update[ILP];
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_p, p, 0, i_start);
        load_store(r_update, update, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
        }
        load_store(p, r_p, i_start, 0);
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
        MATH_T r_p[ILP];
        MATH_T r_update[ILP];
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            r_p[ii] = p[i];
            r_update[ii] = update[i];
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            p[i] = r_p[ii];
          }
        }
      }
    }
  }
};

void multi_tensor_lamb_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                            const float lr, const float beta1, const float beta2, const float epsilon, const int step,
                            const int bias_correction, const float weight_decay, const int grad_averaging,
                            const int mode, at::Tensor global_grad_norm, const float max_grad_norm,
                            at::optional<bool> use_nvlamb_python) {
  using namespace at;
  // Master weight and 32bit momentum(potentially changing) is not handled by this
  // So we assume every tensor are all in the same type

  bool use_nvlamb = use_nvlamb_python.has_value() ? use_nvlamb_python.value() : false;

  // Handle bias correction mode
  float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  if (bias_correction == 1) {
    bias_correction1 = 1 - std::pow(beta1, step);
    bias_correction2 = 1 - std::pow(beta2, step);
  }

  // Handle grad averaging mode
  float beta3 = 1.0f;
  if (grad_averaging == 1) beta3 = 1 - beta1;

  std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin() + 1);
  std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin() + 1, tensor_lists.begin() + 2);

  // Compute per tensor param norm
  auto param_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, param_list, true);

  // We now in-place modify grad to store update before compute its norm
  // Generally this is not a issue since people modify grad in step() method all the time
  // We can also grab list of empty tensor to avoid this, but I'd like to save space/cpu code
  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
                          multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                                LAMBStage1Functor<scalar_t_0>(), beta1, beta2,
                                                beta3,  // 1-beta1 or 1 depends on averaging mode
                                                bias_correction1, bias_correction2, epsilon, (adamMode_t)mode,
                                                weight_decay, global_grad_norm.data_ptr<float>(), max_grad_norm);)

  // Compute update norms
  auto update_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, grad_list, true);

  std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin() + 2);

  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
      multi_tensor_apply<2>(BLOCK_SIZE, chunk_size, noop_flag, grad_param_list, LAMBStage2Functor<scalar_t_0>(),
                            std::get<1>(param_norm_tuple).data_ptr<float>(),
                            std::get<1>(update_norm_tuple).data_ptr<float>(), lr, weight_decay, use_nvlamb);)

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: csrc/multi_tensor_lamb_mp.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

typedef enum {
  MOMENT_MODE_0 = 0,  // L2 regularization mode
  MOMENT_MODE_1 = 1   // Decoupled weight decay mode
} adamMode_t;

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_mp_cuda(int chunk_size, at::Tensor noop_flag,
                                                               std::vector<std::vector<at::Tensor>> tensor_lists,
                                                               at::optional<bool> per_tensor_python);

using MATH_T = float;

template <typename T, typename param_t>
struct LAMBStage1Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<4>& tl,
                                             const float beta1, const float beta2, const float beta3,
                                             const int* step_ptr, const int bias_correction, const float epsilon,
                                             adamMode_t mode, const float decay, const float* global_grad_norm,
                                             const float* max_global_grad_norm, const float* found_inf,
                                             const float* inv_scale) {
    if (*noop_gmem) {
      return;
    }

    float beta1_correction = 1.0f;
    float beta2_correction = 1.0f;
    if (bias_correction == 1) {
      int step = *step_ptr;
      beta1_correction = 1 - std::pow(beta1, step);
      beta2_correction = 1 - std::pow(beta2, step);
    }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float clipped_global_grad_norm =
        (*global_grad_norm) > (*max_global_grad_norm) ? (*global_grad_norm) / (*max_global_grad_norm) : 1.0f;

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    param_t* p = (param_t*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    param_t* m = (param_t*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    param_t* v = (param_t*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    MATH_T r_g[ILP];
    MATH_T r_p[ILP];
    MATH_T r_m[ILP];
    MATH_T r_v[ILP];
    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(g) && is_aligned(p) && is_aligned(m) && is_aligned(v)) {
      T l_g[ILP];
      param_t l_p[ILP];
      param_t l_m[ILP];
      param_t l_v[ILP];
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(l_g, g, 0, i_start);
        if (decay != 0) load_store(l_p, p, 0, i_start);
        load_store(l_m, m, 0, i_start);
        load_store(l_v, v, 0, i_start);
        // unpack
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_g[ii] = l_g[ii] * (*inv_scale);
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          } else {
            r_p[ii] = l_p[ii];
          }
          r_m[ii] = l_m[ii];
          r_v[ii] = l_v[ii];
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay * r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          } else {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          l_p[ii] = r_p[ii];
          // Difference from APEX's LAMB kernel. `g` and `p` can be different dtypes.
          l_g[ii] = r_p[ii];
          l_m[ii] = r_m[ii];
          l_v[ii] = r_v[ii];
        }
        // store
        load_store(g, l_g, i_start, 0);
        load_store(m, l_m, i_start, 0);
        load_store(v, l_v, i_start, 0);
      }
    } else {
      // see note in multi_tensor_scale_kernel.cu
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
        MATH_T r_g[ILP];
        MATH_T r_p[ILP];
        MATH_T r_m[ILP];
        MATH_T r_v[ILP];
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            r_g[ii] = g[i] * (*inv_scale);
            // special ?optimization? for lamb stage 1
            if (decay == 0) {
              r_p[ii] = MATH_T(0);
            } else {
              r_p[ii] = p[i];
            }
            r_m[ii] = m[i];
            r_v[ii] = v[i];
          } else {
            r_g[ii] = MATH_T(0);
            r_p[ii] = MATH_T(0);
            r_m[ii] = MATH_T(0);
            r_v[ii] = MATH_T(0);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          if (mode == MOMENT_MODE_0) {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            // L2 on scaled grad
            scaled_grad = scaled_grad + decay * r_p[ii];
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = next_m_unbiased / denom;
          } else {
            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
            r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
            r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            g[i] = r_p[ii];
            m[i] = r_m[ii];
            v[i] = r_v[ii];
          }
        }
      }
    }
  }
};

// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
// N == 2: FP32 params, no master params
// N == 3: FP16 params, FP32 master params.
template <typename T, int N, typename param_t>
struct LAMBStage2Functor {
  static_assert((N == 2 && std::is_same<T, param_t>::value) || (N == 3 && std::is_same<param_t, float>::value), "");
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<N>& tl,
                                             const float* per_tensor_param_norm, const float* per_tensor_update_norm,
                                             const float* learning_rate, const float decay, bool use_nvlamb) {
    if (*noop_gmem) {
      return;
    }

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    MATH_T ratio = *learning_rate;
    // nvlamb: apply adaptive learning rate to all parameters
    // otherwise, only apply to those with non-zero weight decay
    if (use_nvlamb || (decay != 0.0)) {
      float param_norm = per_tensor_param_norm[tensor_num];
      float update_norm = per_tensor_update_norm[tensor_num];
      ratio =
          (update_norm != 0.0f && param_norm != 0.0f) ? *learning_rate * (param_norm / update_norm) : *learning_rate;
    }

    T* update = (T*)tl.addresses[0][tensor_loc];
    update += chunk_idx * chunk_size;

    param_t* p = (param_t*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    T* out_p;
    if (N == 3) {
      out_p = (T*)tl.addresses[2][tensor_loc];
      out_p += chunk_idx * chunk_size;
    }

    n -= chunk_idx * chunk_size;

    // to make things simple, we put aligned case in a different code path
    bool can_use_aligned_path = n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(p) && is_aligned(update);
    if (N == 3) {
      can_use_aligned_path = can_use_aligned_path && is_aligned(out_p);
    }
    if (can_use_aligned_path) {
      param_t r_p[ILP];
      T r_update[ILP];
      T r_out_p[ILP];
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_p, p, 0, i_start);
        load_store(r_update, update, 0, i_start);
        if (N == 3) {
          load_store(r_out_p, out_p, 0, i_start);
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
          if (N == 3) {
            r_out_p[ii] = r_p[ii];
          }
        }
        load_store(p, r_p, i_start, 0);
        if (N == 3) {
          load_store(out_p, r_out_p, i_start, 0);
        }
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
        MATH_T r_p[ILP];
        MATH_T r_update[ILP];
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            r_p[ii] = p[i];
            r_update[ii] = update[i];
          }
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            p[i] = r_p[ii];
            if (N == 3) {
              out_p[i] = r_p[ii];
            }
          }
        }
      }
    }
  }
};

void multi_tensor_lamb_mp_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                               at::Tensor lr, const float beta1, const float beta2, const float epsilon,
                               at::Tensor step, const int bias_correction, const float weight_decay,
                               const int grad_averaging, const int mode, at::Tensor global_grad_norm,
                               at::Tensor max_grad_norm, at::optional<bool> use_nvlamb_python, at::Tensor found_inf,
                               at::Tensor inv_scale) {
  // n_tensors == 5: FP16 model params & FP32 master params
  // n_tensors == 4: FP32 model params & NO FP32 master params
  const auto n_tensors = tensor_lists.size();
  assert(n_tensors == 4 || n_tensors == 5);
  using namespace at;

  bool use_nvlamb = use_nvlamb_python.has_value() ? use_nvlamb_python.value() : false;

  // note(mkozuki): move bias handling below to functor
  // Handle bias correction mode
  // float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  // if (bias_correction == 1) {
  //   bias_correction1 = 1 - std::pow(beta1, step);
  //   bias_correction2 = 1 - std::pow(beta2, step);
  // }

  // Handle grad averaging mode
  float beta3 = 1.0f;
  if (grad_averaging == 1) beta3 = 1 - beta1;

  std::vector<std::vector<at::Tensor>> stage1_tensor_lists(tensor_lists.begin(), tensor_lists.begin() + 4);
  std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin() + 1);
  std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin() + 1, tensor_lists.begin() + 2);

  // Compute per tensor param norm
  auto param_norm_tuple = multi_tensor_l2norm_mp_cuda(chunk_size, noop_flag, param_list, true);

  // We now in-place modify grad to store update before compute its norm
  // Generally this is not a issue since people modify grad in step() method all the time
  // We can also grab list of empty tensor to avoid this, but I'd like to save space/cpu code
  if (n_tensors == 4) {
    DISPATCH_FLOAT_AND_HALF(
        tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
        multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, stage1_tensor_lists,
                              LAMBStage1Functor<scalar_t_0, scalar_t_0>(), beta1, beta2,
                              beta3,  // 1-beta1 or 1 depends on averaging mode
                              // bias_correction1,
                              // bias_correction2,
                              step.data_ptr<int>(), bias_correction, epsilon, (adamMode_t)mode, weight_decay,
                              global_grad_norm.data_ptr<float>(), max_grad_norm.data_ptr<float>(),
                              found_inf.data_ptr<float>(), inv_scale.data_ptr<float>());)
  } else {
    DISPATCH_FLOAT_HALF_AND_BFLOAT(
        tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
        multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, stage1_tensor_lists,
                              LAMBStage1Functor<scalar_t_0, float>(), beta1, beta2,
                              beta3,  // 1-beta1 or 1 depends on averaging mode
                              // bias_correction1,
                              // bias_correction2,
                              step.data_ptr<int>(), bias_correction, epsilon, (adamMode_t)mode, weight_decay,
                              global_grad_norm.data_ptr<float>(), max_grad_norm.data_ptr<float>(),
                              found_inf.data_ptr<float>(), inv_scale.data_ptr<float>());)
  }

  // Compute update norms
  auto update_norm_tuple = multi_tensor_l2norm_mp_cuda(chunk_size, noop_flag, grad_list, true);

  std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin() + 2);
  if (n_tensors == 4) {
    DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
                            multi_tensor_apply<2>(BLOCK_SIZE, chunk_size, noop_flag, grad_param_list,
                                                  LAMBStage2Functor<scalar_t_0, 2, scalar_t_0>(),
                                                  std::get<1>(param_norm_tuple).data_ptr<float>(),
                                                  std::get<1>(update_norm_tuple).data_ptr<float>(),
                                                  lr.data_ptr<float>(), weight_decay, use_nvlamb);)
  } else {
    grad_param_list.push_back(tensor_lists[4]);
    DISPATCH_FLOAT_HALF_AND_BFLOAT(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
                                   multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, grad_param_list,
                                                         LAMBStage2Functor<scalar_t_0, 3, float>(),
                                                         std::get<1>(param_norm_tuple).data_ptr<float>(),
                                                         std::get<1>(update_norm_tuple).data_ptr<float>(),
                                                         lr.data_ptr<float>(), weight_decay, use_nvlamb);)
  }
  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: csrc/multi_tensor_lamb_stage_1.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

// Step 1 computes the 'update' value of regular Adam optimizer.
template <typename GRAD_T, typename T, typename UPD_T>
struct LAMBStage1Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<5>& tl,
                                             const float* per_tensor_decay, const float beta1, const float beta2,
                                             const float beta1_correction, const float beta2_correction,
                                             const float epsilon, const float clipped_global_grad_norm) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float decay = per_tensor_decay[tensor_num];

    GRAD_T* g = (GRAD_T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    T* m = (T*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    T* v = (T*)tl.addresses[3][tensor_loc];
    v += chunk_idx * chunk_size;

    UPD_T* update = (UPD_T*)tl.addresses[4][tensor_loc];
    update += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      GRAD_T r_g[ILP];
      T r_p[ILP];
      T r_m[ILP];
      T r_v[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_g[ii] = g[i];
          r_p[ii] = p[i];
          r_m[ii] = m[i];
          r_v[ii] = v[i];
        } else {
          r_g[ii] = GRAD_T(0);
          r_p[ii] = T(0);
          r_m[ii] = T(0);
          r_v[ii] = T(0);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        T scaled_grad = r_g[ii] / clipped_global_grad_norm;
        r_m[ii] = r_m[ii] * beta1 + (1 - beta1) * scaled_grad;
        r_v[ii] = r_v[ii] * beta2 + (1 - beta2) * scaled_grad * scaled_grad;
        T next_m_unbiased = r_m[ii] / beta1_correction;
        T next_v_unbiased = r_v[ii] / beta2_correction;
        T denom = std::sqrt(next_v_unbiased) + epsilon;
        r_p[ii] = (next_m_unbiased / denom) + (decay * r_p[ii]);
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          update[i] = (UPD_T)r_p[ii];
          m[i] = r_m[ii];
          v[i] = r_v[ii];
        }
      }
    }
  }
};

void multi_tensor_lamb_stage1_cuda(int chunk_size, at::Tensor noop_flag,
                                   std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor per_tensor_decay,
                                   const int step, const float beta1, const float beta2, const float epsilon,
                                   at::Tensor global_grad_norm, const float max_global_grad_norm) {
  using namespace at;

  const float* g_grad_norm = global_grad_norm.data_ptr<float>();
  float clipped_global_grad_norm = *(g_grad_norm) > max_global_grad_norm ? *(g_grad_norm) / max_global_grad_norm : 1.0f;
  float next_step = float(step + 1);
  float beta1_correction = 1.0f - std::pow(beta1, next_step);
  float beta2_correction = 1.0f - std::pow(beta2, next_step);
  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
      DISPATCH_FLOAT_AND_HALF(
          tensor_lists[1][0].scalar_type(), 1, "lamb_stage_1",
          DISPATCH_FLOAT_AND_HALF(
              tensor_lists[4][0].scalar_type(), 2, "lamb_stage_1",
              multi_tensor_apply<5>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                    LAMBStage1Functor<scalar_t_0, scalar_t_1, scalar_t_2>(),
                                    per_tensor_decay.data_ptr<float>(), beta1, beta2, beta1_correction,
                                    beta2_correction, epsilon, clipped_global_grad_norm);)))

  AT_CUDA_CHECK(cudaGetLastError());

  // AT_CUDA_CHECK(cudaDeviceSynchronize());
}


================================================
FILE: csrc/multi_tensor_lamb_stage_2.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

using MATH_T = float;

// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
template <typename T, typename UPD_T>
struct LAMBStage2Functor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<2>& tl,
                                             const float* per_tensor_param_norm, const float* per_tensor_update_norm,
                                             const float learning_rate, const float decay, bool use_nvlamb) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    MATH_T ratio = learning_rate;
    // nvlamb: apply adaptive learning rate to all parameters
    // otherwise, only apply to those with non-zero weight decay
    if (use_nvlamb || (decay != 0.0)) {
      float param_norm = per_tensor_param_norm[tensor_num];
      float update_norm = per_tensor_update_norm[tensor_num];
      ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
    }

    T* p = (T*)tl.addresses[0][tensor_loc];
    p += chunk_idx * chunk_size;

    UPD_T* update = (UPD_T*)tl.addresses[1][tensor_loc];
    update += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      T r_p[ILP];
      UPD_T r_update[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_p[ii] = p[i];
          r_update[ii] = update[i];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_p[ii] = r_p[ii] - (ratio * (T)r_update[ii]);
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          p[i] = r_p[ii];
        }
      }
    }
  }
};

void multi_tensor_lamb_stage2_cuda(int chunk_size, at::Tensor noop_flag,
                                   std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor per_tensor_param_norm,
                                   at::Tensor per_tensor_update_norm, const float lr, const float weight_decay,
                                   at::optional<bool> use_nvlamb_python) {
  bool use_nvlamb = use_nvlamb_python.has_value() ? use_nvlamb_python.value() : false;

  using namespace at;

  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
      DISPATCH_FLOAT_AND_HALF(
          tensor_lists[1][0].scalar_type(), 1, "lamb_stage_2",
          multi_tensor_apply<2>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                LAMBStage2Functor<scalar_t_0, scalar_t_1>(), per_tensor_param_norm.data_ptr<float>(),
                                per_tensor_update_norm.data_ptr<float>(), lr, weight_decay, use_nvlamb);))

  AT_CUDA_CHECK(cudaGetLastError());

  // AT_CUDA_CHECK(cudaDeviceSynchronize());
}


================================================
FILE: csrc/multi_tensor_novograd.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

typedef enum {
  MOMENT_MODE_0 = 0,  // Novograd paper mode, momentum caculation with denom then decay inside
  MOMENT_MODE_1 = 1   // Decoupled weight decay mode
} momentMode_t;

void multi_tensor_norm_out_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                                at::Tensor out, const float alpha, const float beta, const int norm_type);

using MATH_T = float;

template <typename T>
struct NovoGradFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<3>& tl,
                                             const float beta1, const float beta2, const float beta3,
                                             const float beta1_correction, const float beta2_correction,
                                             const float epsilon, const float lr, momentMode_t m_mode,
                                             const float decay, const float* per_tensor_grad_norm) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float grad_norm = per_tensor_grad_norm[tensor_num];

    T* g = (T*)tl.addresses[0][tensor_loc];
    g += chunk_idx * chunk_size;

    T* p = (T*)tl.addresses[1][tensor_loc];
    p += chunk_idx * chunk_size;

    T* m = (T*)tl.addresses[2][tensor_loc];
    m += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
      MATH_T r_g[ILP];
      MATH_T r_p[ILP];
      MATH_T r_m[ILP];
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          r_g[ii] = g[i];
          r_p[ii] = p[i];
          r_m[ii] = m[i];
        } else {
          r_g[ii] = MATH_T(0);
          r_p[ii] = MATH_T(0);
          r_m[ii] = MATH_T(0);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if (m_mode == MOMENT_MODE_0) {
          MATH_T next_v_unbiased = grad_norm / beta2_correction;
          MATH_T denom = next_v_unbiased + epsilon;
          r_g[ii] = (r_g[ii] / denom) + (decay * r_p[ii]);
          r_m[ii] = beta1 * r_m[ii] + beta3 * r_g[ii];
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          r_p[ii] = r_p[ii] - (lr * next_m_unbiased);
        } else {
          r_m[ii] = beta1 * r_m[ii] + beta3 * r_g[ii];
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = grad_norm / beta2_correction;
          MATH_T denom = next_v_unbiased + epsilon;
          MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
          r_p[ii] = r_p[ii] - (lr * update);
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          p[i] = r_p[ii];
          m[i] = r_m[ii];
        }
      }
    }
  }
};

void multi_tensor_novograd_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                                at::Tensor grad_norms, const float lr, const float beta1, const float beta2,
                                const float epsilon, const int step, const int bias_correction,
                                const float weight_decay, const int grad_averaging, const int moment_mode,
                                const int norm_type) {
  using namespace at;

  // Handle bias correction mode
  float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  if (bias_correction == 1) {
    bias_correction1 = 1 - std::pow(beta1, step);
    bias_correction2 = std::sqrt(1 - std::pow(beta2, step));
  }

  // Handle grad averaging mode
  float beta3 = 1;
  if (grad_averaging == 1) beta3 = 1 - beta1;

  std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin() + 1);

  // Compute and update grad norm
  // Here use a per tensor norm, and blend new norm(n) and old norm(gn) by
  // L-2: gn = sqrt(a * gn^2 + b * n^2)
  // L-inf: gn = a * gn + b * n
  multi_tensor_norm_out_cuda(chunk_size, noop_flag, grad_list, grad_norms, beta2, (1.0f - beta2), norm_type);

  // Assume single type across p,g,m1,m2 now
  DISPATCH_DOUBLE_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "novograd",
      multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, NovoGradFunctor<scalar_t_0>(), beta1,
                            beta2,
                            beta3,  // 1-beta1 or 1 depends on averaging mode
                            bias_correction1, bias_correction2, epsilon, lr, (momentMode_t)moment_mode, weight_decay,
                            grad_norms.data_ptr<float>());)

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: csrc/multi_tensor_scale_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>
// Stringstream is a big hammer, but I want to rely on operator<< for dtype.
#include <sstream>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

template <typename T>
__device__ __forceinline__ bool is_aligned(T* p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset) {
  typedef typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

template <typename in_t, typename out_t>
struct ScaleFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<2>& tl,
                                             float scale) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    in_t* in = (in_t*)tl.addresses[0][tensor_loc];
    in += chunk_idx * chunk_size;

    out_t* out = (out_t*)tl.addresses[1][tensor_loc];
    out += chunk_idx * chunk_size;

    n -= chunk_idx * chunk_size;

    bool finite = true;
    in_t r_in[ILP];
    out_t r_out[ILP];

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(in) && is_aligned(out)) {
      for (int i_start = threadIdx.x; i_start * ILP < n && i_start * ILP < chunk_size; i_start += blockDim.x) {
        // load
        load_store(r_in, in, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_out[ii] = static_cast<float>(r_in[ii]) * scale;
          finite = finite && isfinite(r_in[ii]);
        }
        // store
        load_store(out, r_out, i_start, 0);
      }
    } else {
      // Non-divergent exit condition for __syncthreads, not necessary here
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_in[ii] = 0;
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) r_in[ii] = in[i];
        }
        // note for clarification to future michael:
        // From a pure memory dependency perspective, there's likely no point unrolling
        // the write loop, since writes just fire off once their LDGs arrive.
        // Put another way, the STGs are dependent on the LDGs, but not on each other.
        // There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          r_out[ii] = static_cast<float>(r_in[ii]) * scale;
          finite = finite && isfinite(r_in[ii]);
        }
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) out[i] = r_out[ii];
        }
      }
    }
    if (!finite) *noop_gmem = 1;  // Blindly fire off a write.  These will race but that's ok.
  }
};

void multi_tensor_scale_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                             float scale) {
  using namespace at;
  // The output (downscaled) type is always float.
  // If build times suffer, think about where to put this dispatch,
  // and what logic should be moved out of multi_tensor_apply.

  DISPATCH_FLOAT_HALF_AND_BFLOAT(
      tensor_lists[0][0].scalar_type(), 0, "multi_tensor_scale_cuda",
      DISPATCH_FLOAT_HALF_AND_BFLOAT(tensor_lists[1][0].scalar_type(), 1, "multi_tensor_scale_cuda",
                                     multi_tensor_apply<2>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                                                           ScaleFunctor<scalar_t_0, scalar_t_1>(), scale);))
  AT_CUDA_CHECK(cudaGetLastError());

  // AT_CUDA_CHECK(cudaDeviceSynchronize());
}


================================================
FILE: csrc/multi_tensor_sgd_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <assert.h>
#include <cuda_runtime.h>

#include "multi_tensor_apply.cuh"

#define BLOCK_SIZE 512
#define ILP 4

/**
 * Perform fused SGD on multiple buffers
 * N: number of tensors
 * tl[0] : gradients
 * tl[1] : weights
 * tl[2] : momentum buffers
 * tl[3] : fp16 weights (if appropriate)
 * wd : weight_decay (scalar)
 * momentum : momentum (scalar)
 * dampening : momentum dampening (scalar)
 * lr : learning rate (scalar)
 * nesterov : enable nesterov (bool)
 * first run : necessary for proper momentum handling & init
 * wd_after_momentum : apply weight decay _after_ momentum instead of before
 **/
template <int N, typename T_grad, typename T_weight>
struct SGDFunctor {
  __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, TensorListMetadata<N>& tl,
                                             float wd, float momentum, float dampening, float lr, bool nesterov,
                                             bool first_run, bool wd_after_momentum, float scale) {
    // Early exit if we don't need to do anything
    if (*noop_gmem) return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    T_grad* grad_in = (T_grad*)tl.addresses[0][tensor_loc];
    grad_in += chunk_idx * chunk_size;

    T_weight* weight_in = (T_weight*)tl.addresses[1][tensor_loc];
    weight_in += chunk_idx * chunk_size;

    T_weight* mom_in = (T_weight*)tl.addresses[2][tensor_loc];
    mom_in += chunk_idx * chunk_size;

    at::Half* model_weights_out = nullptr;
    if (N == 4) {
      model_weights_out = (at::Half*)tl.addresses[3][tensor_loc];
      model_weights_out += chunk_idx * chunk_size;
    }

    n -= chunk_idx * chunk_size;

    // Non-divergent exit condition for the __syncthreads
    float incoming_grads[ILP];
    float incoming_weights[ILP];
    float incoming_moms[ILP];
    for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        incoming_grads[ii] = 0;
        incoming_weights[ii] = 0;
        incoming_moms[ii] = 0;
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          incoming_grads[ii] = static_cast<float>(grad_in[i]) * scale;
          incoming_weights[ii] = static_cast<float>(weight_in[i]);
          incoming_moms[ii] = static_cast<float>(mom_in[i]);
        }
      }

// note for clarification to future michael:
// From a pure memory dependency perspective, there's likely no point unrolling
// the write loop, since writes just fire off once their LDGs arrive.
// Put another way, the STGs are dependent on the LDGs, but not on each other.
// There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii * blockDim.x;
        if (i < n && i < chunk_size) {
          // apply weight decay before momentum if necessary
          if (wd != 0.f && !wd_after_momentum) incoming_grads[ii] += wd * incoming_weights[ii];

          if (momentum != 0.f) {
            if (!first_run)
              incoming_moms[ii] = incoming_moms[ii] * momentum + (1.f - dampening) * incoming_grads[ii];
            else  // initialize momentums to current incoming grads
              incoming_moms[ii] = incoming_grads[ii];

            if (nesterov)
              incoming_grads[ii] += momentum * incoming_moms[ii];
            else
              incoming_grads[ii] = incoming_moms[ii];
          }

          // Apply WD after momentum if desired
          if (wd != 0.f && wd_after_momentum) incoming_grads[ii] += wd * incoming_weights[ii];

          // adjust the weight and write out
          weight_in[i] += (-lr * incoming_grads[ii]);

          // if necessary, write out an fp16 copy of the weights
          if (N == 4) model_weights_out[i] = static_cast<at::Half>(weight_in[i]);

          // also write out the new momentum
          if (momentum != 0.f) mom_in[i] = incoming_moms[ii];
        }
      }
    }
  }
};

void multi_tensor_sgd_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists,
                           float wd, float momentum, float dampening, float lr, bool nesterov, bool first_run,
                           bool wd_after_momentum, float scale) {
  auto num_tensors = tensor_lists.size();
  auto grad_type = tensor_lists[0][0].scalar_type();
  auto weight_type = tensor_lists[1][0].scalar_type();

  if (num_tensors == 4)
    for (int i = 0; i < tensor_lists[3].size(); i++)
      TORCH_CHECK(tensor_lists[3][i].scalar_type() == at::ScalarType::Half,
                  "Additional output tensors should always be fp16.");

  TORCH_CHECK(noop_flag.device() == tensor_lists[0][0].device(),
              "expected noop flag to be on the same device as tensors");

  // We have 3 possibilities to handle here, in terms of
  // grad_type, param_type, momentum_type, requires_fp16_copy
  // 1. fp16, fp16, fp16, No
  // 2. fp32, fp32, fp32, No
  // 3. fp16, fp32, fp32, Yes
  // 4. fp32, fp32, fp32, Yes // this is the materialize_master_grads=True case
  // It's easier to hardcode these possibilities than to use
  // switches etc. to handle the cross-product of cases where
  // we don't want the majority of them.

  // Case 1. fp16, fp16, fp16, No
  if (grad_type == at::ScalarType::Half && weight_type == at::ScalarType::Half && num_tensors == 3) {
    multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, SGDFunctor<3, at::Half, at::Half>(), wd,
                          momentum, dampening, lr, nesterov, first_run, wd_after_momentum, scale);
  }
  // Case 2. fp16, fp32, fp32, No
  // else if (grad_type == at::ScalarType::Half &&
  //          weight_type == at::ScalarType::Float &&
  //          num_tensors == 3) {
  //   multi_tensor_apply<3>(
  //       BLOCK_SIZE,
  //       chunk_size,
  //       noop_flag,
  //       tensor_lists,
  //       SGDFunctor<3, at::Half, float>(),
  //       wd,
  //       momentum,
  //       dampening,
  //       lr,
  //       nesterov,
  //       first_run,
  //       wd_after_momentum);
  // }
  // Case 2. fp32, fp32, fp32, No
  else if (grad_type == at::ScalarType::Float && weight_type == at::ScalarType::Float && num_tensors == 3) {
    multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, SGDFunctor<3, float, float>(), wd, momentum,
                          dampening, lr, nesterov, first_run, wd_after_momentum, scale);
  }
  // Case 3. fp16, fp32, fp32, Yes
  else if (grad_type == at::ScalarType::Half && weight_type == at::ScalarType::Float && num_tensors == 4) {
    multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, SGDFunctor<4, at::Half, float>(), wd,
                          momentum, dampening, lr, nesterov, first_run, wd_after_momentum, scale);
  }
  // Case 4. fp32, fp32, fp32, Yes
  else if (grad_type == at::ScalarType::Float && weight_type == at::ScalarType::Float && num_tensors == 4) {
    multi_tensor_apply<4>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists, SGDFunctor<4, float, float>(), wd, momentum,
                          dampening, lr, nesterov, first_run, wd_after_momentum, scale);
  } else {
    AT_ERROR("multi_tensor_sgd only supports some combinations of gradient & weight types. Given: ", "gradient: ",
             grad_type, ", weight: ", weight_type, ", num_lists: ", num_tensors);
  }

  AT_CUDA_CHECK(cudaGetLastError());
}


================================================
FILE: csrc/static_switch.h
================================================
// From
// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h

#pragma once

/// @param COND       - a boolean expression to switch by
/// @param CONST_NAME - a name given for the constexpr bool variable.
/// @param ...       - code to execute for true and false
///
/// Usage:
/// ```
/// BOOL_SWITCH(flag, BoolConst, [&] {
///     some_function<BoolConst>(...);
/// });
/// ```
#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
  [&] {                                         \
    if (COND) {                                 \
      constexpr static bool CONST_NAME = true;  \
      return __VA_ARGS__();                     \
    } else {                                    \
      constexpr static bool CONST_NAME = false; \
      return __VA_ARGS__();                     \
    }                                           \
  }()


================================================
FILE: csrc/syncbn.cpp
================================================
#include <ATen/ATen.h>
#include <torch/extension.h>

#include <vector>

// returns {mean,biased_var}
// implemented using welford
std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input);

// reduces array of mean/var across processes
// returns global {mean,inv_std,biased_var}
// implemented using welford
std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_nodes,
                                              const at::Tensor var_biased_feature_nodes, const at::Tensor numel,
                                              const float eps);

// elementwise BN operation, returns output
// input/weight/shift should have identical data type;
// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
at::Tensor batchnorm_forward_CUDA(const at::Tensor input, const at::Tensor mean, const at::Tensor inv_std,
                                  const at::optional<at::Tensor> weight, const at::optional<at::Tensor> shift);

// backward BN operation, returns {sum_dy, sum_dy_xmu, grad_weight, grad_bias}
// grad_output/input should have identical data type;
// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
// implemented using kahan summation
std::vector<at::Tensor> reduce_bn_CUDA(const at::Tensor grad_output, const at::Tensor input, const at::Tensor mean,
                                       const at::Tensor inv_std, const at::optional<at::Tensor> weight);

// elementwise backward BN operation, returns grad_input
// grad_output/input/weight precision could be fp16/fp32;
// mean/inv_std/sum_dy/sum_dy_xmu precision is fp32
at::Tensor batchnorm_backward_CUDA(const at::Tensor grad_output, const at::Tensor input, const at::Tensor mean,
                                   const at::Tensor inv_std, const at::optional<at::Tensor> weight,
                                   const at::Tensor sum_dy, const at::Tensor sum_dy_xmu, const at::Tensor count);

// returns {mean, biased_var}
// implemented using welford
// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input);

// elementwise BN operation, returns output
// input/weight/shift should have identical data type;
// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
at::Tensor batchnorm_forward_c_last_CUDA(const at::Tensor input, const at::optional<at::Tensor> z,
                                         const at::Tensor mean, const at::Tensor inv_std,
                                         const at::optional<at::Tensor> weight, const at::optional<at::Tensor> shift,
                                         const bool fuse_relu);

// backward BN operation, returns {sum_dy, sum_dy_xmu, grad_weight, grad_bias}
// grad_output/input should have identical data type;
// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
std::vector<at::Tensor> reduce_bn_c_last_CUDA(const at::Tensor grad_output, const at::Tensor input,
                                              const at::Tensor mean, const at::Tensor inv_std,
                                              const at::optional<at::Tensor> weight);

// elementwise backward BN operation, returns grad_input
// grad_output/input/weight precision could be fp16/fp32;
// mean/inv_std/sum_dy/sum_dy_xmu precision is fp32
// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
at::Tensor batchnorm_backward_c_last_CUDA(const at::Tensor grad_output, const at::Tensor input, const at::Tensor mean,
                                          const at::Tensor inv_std, const at::optional<at::Tensor> weight,
                                          const at::Tensor sum_dy, const at::Tensor sum_dy_xmu, const at::Tensor count);

at::Tensor relu_backward_c_last_CUDA(const at::Tensor grad_output, const at::Tensor input,
                                     const at::optional<at::Tensor> z, const at::Tensor mean, const at::Tensor inv_std,
                                     const at::optional<at::Tensor> weight, const at::optional<at::Tensor> shift);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("welford_mean_var", &welford_mean_var_CUDA, "welford mean variance", py::call_guard<py::gil_scoped_release>());
  m.def("welford_parallel", &welford_parallel_CUDA, "welford parallel reduce mean variance",
        py::call_guard<py::gil_scoped_release>());
  m.def("batchnorm_forward", &batchnorm_forward_CUDA, "batchnorm forward", py::call_guard<py::gil_scoped_release>());
  m.def("reduce_bn", &reduce_bn_CUDA, "batchnorm backward reduce grad sum and bias/weight grad",
        py::call_guard<py::gil_scoped_release>());
  m.def("batchnorm_backward", &batchnorm_backward_CUDA, "batchnorm backward dgrad",
        py::call_guard<py::gil_scoped_release>());
  m.def("welford_mean_var_c_last", &welford_mean_var_c_last_CUDA, "welford mean variance nhwc",
        py::call_guard<py::gil_scoped_release>());
  m.def("batchnorm_forward_c_last", &batchnorm_forward_c_last_CUDA, "batchnorm forward nhwc",
        py::call_guard<py::gil_scoped_release>());
  m.def("reduce_bn_c_last", &reduce_bn_c_last_CUDA, "batchnorm backwards reduce grad sum and bias/weight grad nhwc",
        py::call_guard<py::gil_scoped_release>());
  m.def("batchnorm_backward_c_last", &batchnorm_backward_c_last_CUDA, "batchnorm backward dgrad nhwc",
        py::call_guard<py::gil_scoped_release>());
  m.def("relu_bw_c_last", &relu_backward_c_last_CUDA, "relu_bw_c_last", py::call_guard<py::gil_scoped_release>());
}


================================================
FILE: csrc/type_shim.h
================================================
#include <ATen/ATen.h>

// Forward/backward compatiblity hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
// struct TypeShim
// {
//   const at::Type& payload;
//   TypeShim(const at::Type& type) : payload(type) {}
//   // Enable trivial conversion to a const at::Type& for pre-3aeb78
//   operator const at::Type&(){ return payload; };
//   // Enable dispatch switch statements to take *this directly for  post-3aeb78
//   //operator at::ScalarType(){ return payload.; };
// };

#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)               \
  switch (TYPE) {                                                     \
    case at::ScalarType::Float: {                                     \
      using scalar_t_##LEVEL = float;                                 \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Half: {                                      \
      using scalar_t_##LEVEL = at::Half;                              \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    default:                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }

#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...)        \
  switch (TYPE) {                                                     \
    case at::ScalarType::Float: {                                     \
      using scalar_t_##LEVEL = float;                                 \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Half: {                                      \
      using scalar_t_##LEVEL = at::Half;                              \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::BFloat16: {                                  \
      using scalar_t_##LEVEL = at::BFloat16;                          \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    default:                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }

#define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...)          \
  switch (TYPE) {                                                     \
    case at::ScalarType::Float: {                                     \
      using scalar_t_##LEVEL = float;                                 \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Half: {                                      \
      using scalar_t_##LEVEL = at::Half;                              \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Byte: {                                      \
      using scalar_t_##LEVEL = uint8_t;                               \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    default:                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }

#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)        \
  switch (TYPE) {                                                     \
    case at::ScalarType::Double: {                                    \
      using scalar_t_##LEVEL = double;                                \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Float: {                                     \
      using scalar_t_##LEVEL = float;                                 \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Half: {                                      \
      using scalar_t_##LEVEL = at::Half;                              \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    default:                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }

#define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...) \
  switch (TYPE) {                                                     \
    case at::ScalarType::Double: {                                    \
      using scalar_t_##LEVEL = double;                                \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Float: {                                     \
      using scalar_t_##LEVEL = float;                                 \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Half: {                                      \
      using scalar_t_##LEVEL = at::Half;                              \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::BFloat16: {                                  \
      using scalar_t_##LEVEL = at::BFloat16;                          \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    default:                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }

#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...)             \
  switch (TYPE) {                                                     \
    case at::ScalarType::Double: {                                    \
      using scalar_t_##LEVEL = double;                                \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Float: {                                     \
      using scalar_t_##LEVEL = float;                                 \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    default:                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }

#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)                     \
  switch (TYPE) {                                                     \
    case at::ScalarType::Half: {                                      \
      using scalar_t = at::Half;                                      \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::BFloat16: {                                  \
      using scalar_t = at::BFloat16;                                  \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    default:                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }

#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
  switch (TYPEIN) {                                                            \
    case at::ScalarType::Float: {                                              \
      using scalar_t_in = float;                                               \
      switch (TYPEOUT) {                                                       \
        case at::ScalarType::Float: {                                          \
          using scalar_t_out = float;                                          \
          __VA_ARGS__;                                                         \
          break;                                                               \
        }                                                                      \
        case at::ScalarType::Half: {                                           \
          using scalar_t_out = at::Half;                                       \
          __VA_ARGS__;                                                         \
          break;                                                               \
        }                                                                      \
        case at::ScalarType::BFloat16: {                                       \
          using scalar_t_out = at::BFloat16;                                   \
          __VA_ARGS__;                                                         \
          break;                                                               \
        }                                                                      \
        default:                                                               \
          AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'");   \
      }                                                                        \
      break;                                                                   \
    }                                                                          \
    case at::ScalarType::Half: {                                               \
      using scalar_t_in = at::Half;                                            \
      using scalar_t_out = at::Half;                                           \
      __VA_ARGS__;                                                             \
      break;                                                                   \
    }                                                                          \
    case at::ScalarType::BFloat16: {                                           \
      using scalar_t_in = at::BFloat16;                                        \
      using scalar_t_out = at::BFloat16;                                       \
      __VA_ARGS__;                                                             \
      break;                                                                   \
    }                                                                          \
    default:                                                                   \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");        \
  }

#define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
  switch (TYPEIN) {                                                                   \
    case at::ScalarType::Double: {                                                    \
      using scalar_t_in = double;                                                     \
      switch (TYPEOUT) {                                                              \
        case at::ScalarType::Double: {                                                \
          using scalar_t_out = double;                                                \
          __VA_ARGS__;                                                                \
          break;                                                                      \
        }                                                                             \
        case at::ScalarType::Float: {                                                 \
          using scalar_t_out = float;                                                 \
          __VA_ARGS__;                                                                \
          break;                                                                      \
        }                                                                             \
        case at::ScalarType::Half: {                                                  \
          using scalar_t_out = at::Half;                                              \
          __VA_ARGS__;                                                                \
          break;                                                                      \
        }                                                                             \
        case at::ScalarType::BFloat16: {                                              \
          using scalar_t_out = at::BFloat16;                                          \
          __VA_ARGS__;                                                                \
          break;                                                                      \
        }                                                                             \
        default:                                                                      \
          AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'");          \
      }                                                                               \
      break;                                                                          \
    }                                                                                 \
    case at::ScalarType::Float: {                                                     \
      using scalar_t_in = float;                                                      \
      switch (TYPEOUT) {                                                              \
        case at::ScalarType::Float: {                                                 \
          using scalar_t_out = float;                                                 \
          __VA_ARGS__;                                                                \
          break;                                                                      \
        }                                                                             \
        case at::ScalarType::Half: {                                                  \
          using scalar_t_out = at::Half;                                              \
          __VA_ARGS__;                                                                \
          break;                                                                      \
        }                                                                             \
        case at::ScalarType::BFloat16: {                                              \
          using scalar_t_out = at::BFloat16;                                          \
          __VA_ARGS__;                                                                \
          break;                                                                      \
        }                                                                             \
        default:                                                                      \
          AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'");          \
      }                                                                               \
      break;                                                                          \
    }                                                                                 \
    case at::ScalarType::Half: {                                                      \
      using scalar_t_in = at::Half;                                                   \
      using scalar_t_out = at::Half;                                                  \
      __VA_ARGS__;                                                                    \
      break;                                                                          \
    }                                                                                 \
    case at::ScalarType::BFloat16: {                                                  \
      using scalar_t_in = at::BFloat16;                                               \
      using scalar_t_out = at::BFloat16;                                              \
      __VA_ARGS__;                                                                    \
      break;                                                                          \
    }                                                                                 \
    default:                                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");               \
  }

template <typename T>
__device__ __forceinline__ T reduce_block_into_lanes(T* x, T val, int lanes = 1,
                                                     bool share_result = false)  // lanes is intended to be <= 32.
{
  int tid = threadIdx.x + threadIdx.y * blockDim.x;
  int blockSize = blockDim.x * blockDim.y;  // blockSize is intended to be a multiple of 32.

  if (blockSize >= 64) {
    x[tid] = val;
    __syncthreads();
  }

#pragma unroll
  for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
    if (tid < i) x[tid] = x[tid] + x[tid + i];
    __syncthreads();
  }

  T final;

  if (tid < 32) {
    if (blockSize >= 64)
      final = x[tid] + x[tid + 32];
    else
      final = val;
    // __SYNCWARP();

#pragma unroll
    for (int i = 16; i >= lanes; i >>= 1) final = final + __shfl_down_sync(0xffffffff, final, i);
  }

  if (share_result) {
    if (tid < lanes) x[tid] = final;  // EpilogueOp
    // Make sure the smem result is visible to all warps.
  }
  __syncthreads();
  // Avoid potential write before read race when reduce_block_into_lanes is called back to back

  return final;
}

template <typename T>
__device__ __forceinline__ T
reduce_block_into_lanes_max_op(T* x, T val, int lanes = 1,
                               bool share_result = false)  // lanes is intended to be <= 32.
{
  int tid = threadIdx.x + threadIdx.y * blockDim.x;
  int blockSize = blockDim.x * blockDim.y;  // blockSize is intended to be a multiple of 32.

  if (blockSize >= 64) {
    x[tid] = val;
    __syncthreads();
  }

#pragma unroll
  for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
    if (tid < i) x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid + i]));
    __syncthreads();
  }

  T final;

  if (tid < 32) {
    if (blockSize >= 64)
      final = fmaxf(fabsf(x[tid]), fabsf(x[tid + 32]));
    else
      final = val;
    // __SYNCWARP();

#pragma unroll
    for (int i = 16; i >= lanes; i >>= 1) final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
  }

  if (share_result) {
    if (tid < lanes) x[tid] = final;  // EpilogueOp
    // Make sure the smem result is visible to all warps.
    __syncthreads();
  }

  return final;
}


================================================
FILE: csrc/update_scale_hysteresis.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>

__global__ void update_scale_hysteresis_cuda_kernel(float* current_scale, int* growth_tracker, int* hysteresis_tracker,
                                                    const float* found_inf, double growth_factor, double backoff_factor,
                                                    int growth_interval, int hysteresis) {
  if (*found_inf > 0) {
    *hysteresis_tracker -= 1;

    // Only reset the growth tracker when hysteresis is larger than zero
    if (*hysteresis_tracker > 0) {
      *growth_tracker = 0;
      return;
    }
  }

  if (*found_inf) {
    *current_scale = (*current_scale) * backoff_factor;
    *growth_tracker = 0;
  } else {
    // Entering this branch means we just carried out a successful step,
    // so growth_tracker is incremented before comparing to growth_interval.
    auto successful = (*growth_tracker) + 1;
    if (successful == growth_interval) {
      auto new_scale = static_cast<float>((*current_scale) * growth_factor);
      // Do not grow the scale past fp32 bounds to inf.
      if (isfinite(new_scale)) {
        *current_scale = new_scale;
      }
      *growth_tracker = 0;
    } else {
      *growth_tracker = successful;
    }
  }

  // Reset the hysteresis tracker if no infs are found
  if (*found_inf <= 0) {
    *hysteresis_tracker = hysteresis;
  }
}

at::Tensor update_scale_hysteresis_cuda(at::Tensor current_scale, at::Tensor growth_tracker,
                                        at::Tensor hysteresis_tracker, at::Tensor found_inf, const double growth_factor,
                                        const double backoff_factor, const int64_t growth_interval,
                                        const int hysteresis) {
  update_scale_hysteresis_cuda_kernel<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
      current_scale.mutable_data_ptr<float>(), growth_tracker.mutable_data_ptr<int>(),
      hysteresis_tracker.mutable_data_ptr<int>(), found_inf.const_data_ptr<float>(), growth_factor, backoff_factor,
      growth_interval, hysteresis);

  AT_CUDA_CHECK(cudaGetLastError());

  return current_scale;
}


================================================
FILE: csrc/welford.cu
================================================
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <iostream>
#include <vector>

#include "type_shim.h"

__device__ __forceinline__ int lastpow2(int n) {
  int out = 1 << (31 - __clz(n));
  if (n == out) out >>= 1;
  return out;
}

__host__ __forceinline__ int h_next_pow2(unsigned int n) {
  n--;
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return ++n;
}

__host__ __forceinline__ int h_last_pow2(unsigned int n) {
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return n - (n >> 1);
}

#define WARP_SIZE 32

template <typename T>
__device__ __forceinline__ T warp_reduce_sum(T val) {
#pragma unroll
  for (int i = WARP_SIZE / 2; i > 0; i >>= 1) val = val + __shfl_down_sync(0xffffffff, val, i);
  return val;
}

template <typename T>
__device__ __forceinline__ T reduce_block(T* x, T val) {
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  int blockSize = blockDim.x * blockDim.y;

  if (blockSize > 32) {
    val = warp_reduce_sum(val);
    if (tid % WARP_SIZE == 0) x[tid / WARP_SIZE] = val;

    __syncthreads();

    val = (tid < blockSize / WARP_SIZE ? x[tid % WARP_SIZE] : T(0));
  }

  if (tid / WARP_SIZE == 0) val = warp_reduce_sum(val);

  return val;
}

#define ELEMENTS_PER_ITER 4  // enables concurrency within each thread to hide latency
#define ELEMENTS_PER_THREAD 16
#define OPTIMAL_TILE_W 32
#define MAX_H_BLOCK 128
#define MAX_BLOCK_SIZE 512

__host__ int div_ru(int x, int y) { return h_last_pow2(1 + (x - 1) / y); }

__host__ void flexible_launch_configs(const int reduction, const int stride, dim3& block, dim3& grid,
                                      const bool coop_flag = false) {
  int block_x = std::min(h_last_pow2(stride), OPTIMAL_TILE_W);
  int block_y = std::min(h_last_pow2(div_ru(reduction, ELEMENTS_PER_THREAD)), MAX_BLOCK_SIZE / block_x);
  if (block_x * block_y != MAX_BLOCK_SIZE) {
    block_x = std::min(h_last_pow2(stride), MAX_BLOCK_SIZE / block_y);
  }

  int grid_x = div_ru(stride, block_x);
  int grid_y = std::min(div_ru(reduction, block_y * ELEMENTS_PER_THREAD), MAX_H_BLOCK);
  if (coop_flag) {
    // it's not worth having a grid reduction if the reduction dimension is not big enough
    grid_y = grid_y < 8 ? 1 : grid_y;
  }

  block.x = block_x;
  block.y = block_y;
  block.z = 1;
  grid.x = grid_x;
  grid.y = grid_y;
  grid.z = 1;
}

template <typename T, typename C>
__device__ __forceinline__ void welford_merge_element(C& count, T& mean, T& m2n, const C& num_new, const T& mean_new,
                                                      const T& m2n_new) {
  T factor = T(1.0) / max(1, (count + num_new));
  T delta0 = mean - mean_new;
  mean = (mean_new * num_new + mean * count) * factor;
  m2n += m2n_new + delta0 * delta0 * num_new * count * factor;
  count += num_new;
}

template <typename T>
__device__ __forceinline__ void warp_reduce_mean_m2n(T& mean, T& m2n, int& num) {
#pragma unroll
  for (int i = WARP_SIZE / 2; i > 0; i >>= 1) {
    auto num_new = __shfl_down_sync(0xffffffff, num, i);
    auto mean_new = __shfl_down_sync(0xffffffff, mean, i);
    auto m2n_new = __shfl_down_sync(0xffffffff, m2n, i);
    welford_merge_element(num, mean, m2n, num_new, mean_new, m2n_new);
  }
}

template <typename T>
__device__ void welford_reduce_mean_m2n(T* __restrict__ x, int* __restrict__ count, T& mean, T& m2n, int& num,
                                        int block_size, int thread_id) {
  int lane = thread_id % WARP_SIZE;
  int wid = thread_id / WARP_SIZE;

  if (block_size > 32) {
    warp_reduce_mean_m2n(mean, m2n, num);
    if (lane == 0) {
      x[wid * 2] = mean;
      x[wid * 2 + 1] = m2n;
      count[wid] = num;
    }
    __syncthreads();

    if (wid == 0) {
      mean = (thread_id < block_size / WARP_SIZE) ? x[lane * 2] : T(0);
      m2n = (thread_id < block_size / WARP_SIZE) ? x[lane * 2 + 1] : T(0);
      num = (thread_id < block_size / WARP_SIZE) ? count[lane] : int(0);
    }
  }

  if (wid == 0) warp_reduce_mean_m2n(mean, m2n, num);

  return;
}

// return spatial size for NC+ Tensors
__host__ int get_tensor_spatial_size(const at::Tensor& input) {
  auto space_size = input.size(2);
  for (int i = 3; i < input.ndimension(); i++) {
    space_size *= input.size(i);
  }
  return space_size;
}

// promote accumulation scalar type. promote half to float.
__host__ at::ScalarType promote_scalartype(const at::Tensor& input) {
  return input.scalar_type() == at::ScalarType::Half ? at::ScalarType::Float : input.scalar_type();
}

// return single element size, optional accumulation type promotion.
__host__ size_t get_element_data_size(const at::Tensor& input, bool accumulation = false) {
  auto scalar_type = accumulation ? promote_scalartype(input) : input.scalar_type();
  return at::elementSize(scalar_type);
}

template <typename T, typename C>
__device__ __forceinline__ void welford_merge_block_vertical(C& count, T& mean, T& m2n, C* shmem_count, T* shmem_mean,
                                                             T* shmem_m2n) {
  // write to shared memory
  auto address_base = threadIdx.x + threadIdx.y * blockDim.x;
  shmem_mean[address_base] = mean;
  shmem_m2n[address_base] = m2n;
  shmem_count[address_base] = count;

#pragma unroll
  for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
    __syncthreads();
    if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
      auto address = address_base + offset * blockDim.x;
      // read shared memory back to register for reduction
      auto num_new = shmem_count[address];
      auto mean_new = shmem_mean[address];
      auto m2n_new = shmem_m2n[address];

      welford_merge_element(count, mean, m2n, num_new, mean_new, m2n_new);

      // last write is not necessary
      shmem_mean[address_base] = mean;
      shmem_m2n[address_base] = m2n;
      shmem_count[address_base] = count;
    }
  }
}

template <typename T>
__device__ __forceinline__ void merge_block_vertical(T& sum_dy, T& sum_dy_xmu, T* shmem_sum_dy, T* shmem_sum_dy_xmu) {
  // write to shared memory
  auto address_base = threadIdx.x + threadIdx.y * blockDim.x;
  shmem_sum_dy[address_base] = sum_dy;
  shmem_sum_dy_xmu[address_base] = sum_dy_xmu;

#pragma unroll
  for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
    __syncthreads();
    if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
      auto address = address_base + offset * blockDim.x;

      sum_dy += shmem_sum_dy[address];
      sum_dy_xmu += shmem_sum_dy_xmu[address];

      // last write is not necessary
      shmem_sum_dy[address_base] = sum_dy;
      shmem_sum_dy_xmu[address_base] = sum_dy_xmu;
    }
  }
}

// welford kernel calculating mean/biased_variance/unbiased_variance
template <typename scalar_t, typename accscalar_t, typename outscalar_t>
__global__ void welford_kernel(const scalar_t* __restrict__ input, outscalar_t* __restrict__ out_mean,
                               outscalar_t* __restrict__ out_var_biased, const int bs, const int fs, const int ss) {
  int block_size = blockDim.x * blockDim.y;
  int count = 0;
  accscalar_t x_mean = accscalar_t(0);
  accscalar_t m_2_n = accscalar_t(0);

  int thread_id = threadIdx.y * blockDim.x + threadIdx.x;

  for (int batch_id = threadIdx.y; batch_id < bs; batch_id += blockDim.y) {
    int input_base = blockIdx.x * ss + batch_id * ss * fs;
    // sequential welford
    for (int offset = threadIdx.x; offset < ss; offset += blockDim.x) {
      count++;
      auto x_n = static_cast<accscalar_t>(input[offset + input_base]);
      auto d = x_n - x_mean;
      x_mean += d / count;
      m_2_n += d * (x_n - x_mean);
    }
  }

  static __shared__ int s_mem[160];
  accscalar_t* s_mem_ac = (accscalar_t*)&s_mem[32];

  welford_reduce_mean_m2n<accscalar_t>(s_mem_ac, s_mem, x_mean, m_2_n, count, block_size, thread_id);

  if (thread_id == 0) {
    out_mean[blockIdx.x] = static_cast<outscalar_t>(x_mean);
    out_var_biased[blockIdx.x] = static_cast<outscalar_t>(m_2_n / count);
  }
}

// elementwise BN kernel
template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
__global__ void batchnorm_forward_kernel(const scalar_t* __restrict__ input, const accscalar_t* __restrict__ mean,
                                         const accscalar_t* __restrict__ inv_std,
                                         const layerscalar_t* __restrict__ weight,
                                         const layerscalar_t* __restrict__ shift, scalar_t* __restrict__ out,
                                         const int ss, const int bs) {
  auto m_c = mean[blockIdx.x];
  auto inv_std_c = inv_std[blockIdx.x];
  auto w_c = weight == NULL ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[blockIdx.x]);
  auto s_c = shift == NULL ? accscalar_t(0.0) : static_cast<accscalar_t>(shift[blockIdx.x]);

  for (int batch_offset = blockIdx.y * blockDim.y + threadIdx.y; batch_offset < bs;
       batch_offset += gridDim.y * blockDim.y) {
    int address_base = blockIdx.x * ss + batch_offset * gridDim.x * ss;
    for (int offset = threadIdx.x + blockIdx.z * blockDim.x; offset < ss; offset += gridDim.z * blockDim.x) {
      out[address_base + offset] =
          static_cast<scalar_t>(w_c * (static_cast<accscalar_t>(input[address_base + offset]) - m_c) * inv_std_c + s_c);
    }
  }
}

// Backward BN kernel, calculates grad_bias, grad_weight as well as intermediate
// results to calculating grad_input.
// Breaking the grad_input to two step to support sync BN, which requires all
// reduce of the intermediate results across processes.
template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
__global__ void reduce_bn_kernel(const scalar_t* __restrict__ input, const scalar_t* __restrict__ grad_output,
                                 const accscalar_t* __restrict__ mean, const accscalar_t* __restrict__ inv_std,
                                 accscalar_t* __restrict__ sum_dy_o, accscalar_t* __restrict__ sum_dy_xmu_o,
                                 layerscalar_t* __restrict__ grad_weight, layerscalar_t* __restrict__ grad_bias,
                                 const int bs, const int fs, const int ss) {
  static __shared__ int s_mem[64];
  // int total_item_num = bs * ss;

  int thread_id = threadIdx.y * blockDim.x + threadIdx.x;

  auto r_mean = mean[blockIdx.x];
  auto factor = inv_std[blockIdx.x];

  // Kahan sum
  accscalar_t sum_dy = 0.0;
  accscalar_t sum_dy_xmu = 0.0;
  accscalar_t sum_dy_c = 0.0;
  accscalar_t sum_dy_xmu_c = 0.0;
  for (int batch_id = threadIdx.y; batch_id < bs; batch_id += blockDim.y) {
    int input_base = blockIdx.x * ss + batch_id * ss * fs;
    for (int offset = threadIdx.x; offset < ss; offset += blockDim.x) {
      auto e_grad = static_cast<accscalar_t>(grad_output[offset + input_base]);
      auto e_input = static_cast<accscalar_t>(input[offset + input_base]);
      // calculating sum_dy
      auto sum_dy_y = e_grad - sum_dy_c;
      auto sum_dy_t = sum_dy + sum_dy_y;
      sum_dy_c = (sum_dy_t - sum_dy) - sum_dy_y;
      sum_dy = sum_dy_t;

      // calculating sum_dy_xmu
      auto sum_dy_xmu_y = e_grad * (e_input - r_mean) - sum_dy_xmu_c;
      auto sum_dy_xmu_t = sum_dy_xmu + sum_dy_xmu_y;
      sum_dy_xmu_c = (sum_dy_xmu_t - sum_dy_xmu) - sum_dy_xmu_y;
      sum_dy_xmu = sum_dy_xmu_t;
    }
  }

  sum_dy = reduce_block((accscalar_t*)s_mem, sum_dy);
  __syncthreads();
  sum_dy_xmu = reduce_block((accscalar_t*)s_mem, sum_dy_xmu);

  if (thread_id == 0) {
    if (grad_bias != NULL) {
      grad_bias[blockIdx.x] = static_cast<layerscalar_t>(sum_dy);
    }
    if (grad_weight != NULL) {
      grad_weight[blockIdx.x] = static_cast<layerscalar_t>(sum_dy_xmu * factor);
    }
    // mean_dy[blockIdx.x] = sum_dy / total_item_num;
    // mean_dy_xmu[blockIdx.x] = sum_dy_xmu / total_item_num;
    sum_dy_o[blockIdx.x] = sum_dy;
    sum_dy_xmu_o[blockIdx.x] = sum_dy_xmu;
  }
}

// elementwise backward BN kernel
template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
__global__ void batchnorm_backward_kernel(const scalar_t* __restrict__ grad_output, const scalar_t* __restrict__ input,
                                          const accscalar_t* __restrict__ mean, const accscalar_t* __restrict__ inv_std,
                                          const layerscalar_t* __restrict__ weight,
                                          const accscalar_t* __restrict__ sum_dy,
                                          const accscalar_t* __restrict__ sum_dy_xmu, const int* __restrict__ numel,
                                          scalar_t* __restrict__ grad_input, const int64_t world_size, const int ss,
                                          const int bs) {
  int64_t div = 0;
  for (int i = 0; i < world_size; i++) {
    div += numel[i];
  }
  auto m_c = static_cast<accscalar_t>(mean[blockIdx.x]);
  // auto m_dy_c = static_cast<accscalar_t>(mean_dy[blockIdx.x]);
  auto m_dy_c = static_cast<accscalar_t>(sum_dy[blockIdx.x]) / div;
  auto factor_1_c = inv_std[blockIdx.x];
  auto factor_2_c = (weight == NULL ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[blockIdx.x])) * factor_1_c;
  // factor_1_c = factor_1_c * factor_1_c * mean_dy_xmu[blockIdx.x];
  factor_1_c = factor_1_c * factor_1_c * sum_dy_xmu[blockIdx.x] / div;

  for (int batch_offset = blockIdx.y * blockDim.y + threadIdx.y; batch_offset < bs;
       batch_offset += gridDim.y * blockDim.y) {
    int address_base = blockIdx.x * ss + batch_offset * gridDim.x * ss;
    for (int offset = threadIdx.x + blockIdx.z * blockDim.x; offset < ss; offset += gridDim.z * blockDim.x) {
      grad_input[address_base + offset] =
          (static_cast<accscalar_t>(grad_output[address_base + offset]) - m_dy_c -
           (static_cast<accscalar_t>(input[address_base + offset]) - m_c) * factor_1_c) *
          factor_2_c;
    }
  }
}

// welford kernel for c last tensor calculating mean/biased_variance/unbiased_variance
template <typename scalar_t, typename accscalar_t, typename outscalar_t, int PARALLEL_LOADS>
__global__ void welford_kernel_c_last(const scalar_t* __restrict__ input, outscalar_t* __restrict__ out_mean,
                                      outscalar_t* __restrict__ out_var_biased, volatile accscalar_t* staging_data,
                                      int* semaphores, const int reduction_size, const int stride) {
  // hide latency with concurrency
  accscalar_t x_mean[PARALLEL_LOADS];
  accscalar_t m_2_n[PARALLEL_LOADS];
  int count[PARALLEL_LOADS];

#pragma unroll
  for (int i = 0; i < PARALLEL_LOADS; i++) {
    x_mean[i] = accscalar_t(0);
    m_2_n[i] = accscalar_t(0);
    count[i] = accscalar_t(0);
  }
  // tensor dimension (m,c)

  // loop along m dimension
  int inner_loop_stride = blockDim.y * gridDim.y;

  // offset along m dimension
  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;

  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
  int address_base = m_offset * stride + c_offset;
  int address_increment = inner_loop_stride * stride;

  for (int i = 0; i < loop_count; i++) {
    accscalar_t x_math[PARALLEL_LOADS];
    accscalar_t x_count_inv[PARALLEL_LOADS];
    accscalar_t is_valid[PARALLEL_LOADS];

    // load multiple data in
#pragma unroll
    for (int j = 0; j < PARALLEL_LOADS; j++) {
      if (c_offset < stride && m_offset < reduction_size) {
        x_math[j] = input[address_base];
        count[j]++;
        x_count_inv[j] = accscalar_t(1) / count[j];
        is_valid[j] = accscalar_t(1);
      } else {
        x_math[j] = accscalar_t(0);
        x_count_inv[j] = accscalar_t(0);
        is_valid[j] = accscalar_t(0);
      }
      m_offset += inner_loop_stride;
      address_base += address_increment;
    }

    // calculate mean/m2n with welford
#pragma unroll
    for (int j = 0; j < PARALLEL_LOADS; j++) {
      accscalar_t delta0 = x_math[j] - x_mean[j];
      x_mean[j] += delta0 * x_count_inv[j];
      accscalar_t delta1 = x_math[j] - x_mean[j];
      m_2_n[j] += delta0 * delta1 * is_valid[j];
    }
  }

  // thread reduction to accumulate mean/m_2_n/count between PARALLEL_LOADS
#pragma unroll
  for (int j = 1; j < PARALLEL_LOADS; j++) {
    welford_merge_element(count[0], x_mean[0], m_2_n[0], count[j], x_mean[j], m_2_n[j]);
  }

  // release x_mean / m_2_n
  auto mean_th = x_mean[0];
  auto m2_th = m_2_n[0];
  auto count_th = count[0];

  // block-wise reduction with shared memory (since reduction cannot be done within a warp)
  static __shared__ accscalar_t shmem_mean[MAX_BLOCK_SIZE];
  static __shared__ accscalar_t shmem_m2n[MAX_BLOCK_SIZE];
  static __shared__ int shmem_count[MAX_BLOCK_SIZE];

  welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n);

  // grid reduction if needed (coop launch used at the first place)
  if (gridDim.y > 1) {
    volatile accscalar_t* staging_mean = staging_data;
    volatile accscalar_t* staging_m2n = &staging_data[stride * gridDim.y];
    volatile int* staging_count = reinterpret_cast<volatile int*>(&staging_m2n[stride * gridDim.y]);

    address_base = c_offset + blockIdx.y * stride;
    // write data to staging_data;
    if (threadIdx.y == 0 && c_offset < stride) {
      staging_mean[address_base] = mean_th;
      staging_m2n[address_base] = m2_th;
      staging_count[address_base] = count_th;
    }

    __threadfence();
    __syncthreads();  // ensuring writes to staging_ is visible to all blocks

    __shared__ bool is_last_block_done;
    // mark block done
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      int old = atomicAdd(&semaphores[blockIdx.x], 1);
      is_last_block_done = (old == (gridDim.y - 1));
    }

    __syncthreads();

    // check that all data is now available in global memory
    if (is_last_block_done) {
      count_th = 0;
      mean_th = accscalar_t(0.0);
      m2_th = accscalar_t(0.0);

      for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) {
        address_base = c_offset + y * stride;
        int num_new = c_offset < stride ? staging_count[address_base] : 0;
        accscalar_t mean_new = c_offset < stride ? staging_mean[address_base] : accscalar_t(0.0);
        accscalar_t m2n_new = c_offset < stride ? staging_m2n[address_base] : accscalar_t(0.0);

        welford_merge_element(count_th, mean_th, m2_th, num_new, mean_new, m2n_new);
      }

      welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n);
      if (threadIdx.y == 0 && c_offset < stride) {
        out_mean[c_offset] = static_cast<outscalar_t>(mean_th);
        out_var_biased[c_offset] = static_cast<outscalar_t>(m2_th / count_th);
      }
    }
  } else {
    if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) {
      out_mean[c_offset] = static_cast<outscalar_t>(mean_th);
      out_var_biased[c_offset] = static_cast<outscalar_t>(m2_th / count_th);
    }
  }
}

// parallel welford kernel to further reduce mean / biased_var
// into mean / unbiased_var / inv_std across multiple processes.
template <typename scalar_t>
__global__ void welford_kernel_parallel(const scalar_t* __restrict__ mean, const scalar_t* __restrict__ var_biased,
                                        const int* __restrict__ numel, scalar_t* __restrict__ out_mean,
                                        scalar_t* __restrict__ out_var, scalar_t* __restrict__ inv_std,
                                        const int world_size, const int feature_size, const float eps) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < feature_size; i += gridDim.x * blockDim.x) {
    // load data;
    int address = i;
    scalar_t x_mean = 0;
    scalar_t m_2_n = 0;
    int count = 0;
    for (int j = 0; j < world_size; j++) {
      welford_merge_element(count, x_mean, m_2_n, numel[j], mean[address], var_biased[address] * numel[j]);
      address += feature_size;
    }
    out_mean[i] = x_mean;
    out_var[i] = m_2_n / (count - 1);
    inv_std[i] = scalar_t(1) / sqrt(m_2_n / count + eps);
  }
}

// elementwise BN kernel
template <typename scalar_t, typename accscalar_t, typename layerscalar_t, int PARALLEL_LOADS>
__global__ void batchnorm_forward_c_last_kernel(const scalar_t* __restrict__ input, const scalar_t* __restrict__ z,
                                                const accscalar_t* __restrict__ mean,
                                                const accscalar_t* __restrict__ inv_std,
                                                const layerscalar_t* __restrict__ weight,
                                                const layerscalar_t* __restrict__ shift, scalar_t* __restrict__ out,
                                                const int reduction_size, const int stride, const bool fuse_relu) {
  // tensor dimension (m,c)
  // loop along m dimension
  int inner_loop_stride = blockDim.y * gridDim.y;

  // offset along m dimension
  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;

  auto m_c = mean[c_offset];
  auto inv_std_c = static_cast<accscalar_t>(inv_std[c_offset]);
  auto w_c = weight == NULL ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[c_offset]);
  auto s_c = shift == NULL ? accscalar_t(0.0) : static_cast<accscalar_t>(shift[c_offset]);

  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
  int address_base = m_offset * stride + c_offset;
  int address_increment = inner_loop_stride * stride;

  for (int i = 0; i < loop_count; i++) {
#pragma unroll
    for (int j = 0; j < PARALLEL_LOADS; j++) {
      if (c_offset < stride && m_offset < reduction_size) {
        auto tmp = w_c * (static_cast<accscalar_t>(input[address_base]) - m_c) * inv_std_c + s_c;
        if (z != NULL) {
          tmp += z[address_base];
        }
        out[address_base] = (fuse_relu && tmp <= accscalar_t(0.0) ? scalar_t(0.0) : static_cast<scalar_t>(tmp));
      }
      m_offset += inner_loop_stride;
      address_base += address_increment;
    }
  }
}

// elementwise BN kernel
template <typename scalar_t, typename accscalar_t, typename layerscalar_t, int PARALLEL_LOADS>
__global__ void relu_backward_c_last_kernel(const scalar_t* __restrict__ grad_output,
                                            const scalar_t* __restrict__ input, const scalar_t* __restrict__ z,
                                            const accscalar_t* __restrict__ mean,
                                            const accscalar_t* __restrict__ inv_std,
                                            const layerscalar_t* __restrict__ weight,
                                            const layerscalar_t* __restrict__ shift, scalar_t* __restrict__ out,
                                            const int reduction_size, const int stride) {
  // tensor dimension (m,c)
  // loop along m dimension
  int inner_loop_stride = blockDim.y * gridDim.y;

  // offset along m dimension
  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;

  auto m_c = mean[c_offset];
  auto inv_std_c = static_cast<accscalar_t>(inv_std[c_offset]);
  auto w_c = weight == NULL ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[c_offset]);
  auto s_c = shift == NULL ? accscalar_t(0.0) : static_cast<accscalar_t>(shift[c_offset]);

  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
  int address_base = m_offset * stride + c_offset;
  int address_increment = inner_loop_stride * stride;

  for (int i = 0; i < loop_count; i++) {
#pragma unroll
    for (int j = 0; j < PARALLEL_LOADS; j++) {
      if (c_offset < stride && m_offset < reduction_size) {
        auto tmp = w_c * (static_cast<accscalar_t>(input[address_base]) - m_c) * inv_std_c + s_c;
        if (z != NULL) {
          tmp += z[address_base];
        }
        out[address_base] = (tmp <= accscalar_t(0.0) ? scalar_t(0.0) : grad_output[address_base]);
      }
      m_offset += inner_loop_stride;
      address_base += address_increment;
    }
  }
}

// batchnorm backward kernel for c last tensor
template <typename scalar_t, typename accscalar_t, typename layerscalar_t, int PARALLEL_LOADS>
__global__ void reduce_bn_c_last_kernel(const scalar_t* __restrict__ input, const scalar_t* __restrict__ grad_output,
                                        const accscalar_t* __restrict__ mean, const accscalar_t* __restrict__ inv_std,
                                        accscalar_t* __restrict__ sum_dy_o, accscalar_t* __restrict__ sum_dy_xmu_o,
                                        layerscalar_t* __restrict__ grad_weight, layerscalar_t* __restrict__ grad_bias,
                                        volatile accscalar_t* staging_data, int* semaphores, const int reduction_size,
                                        const int stride) {
  // hide latency with concurrency
  accscalar_t sum_dy[PARALLEL_LOADS];
  accscalar_t sum_dy_xmu[PARALLEL_LOADS];

#pragma unroll
  for (int i = 0; i < PARALLEL_LOADS; i++) {
    sum_dy[i] = accscalar_t(0);
    sum_dy_xmu[i] = accscalar_t(0);
  }
  // tensor dimension (m,c)

  // loop along m dimension
  int inner_loop_stride = blockDim.y * gridDim.y;

  // offset along m dimension
  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;

  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
  int address_base = m_offset * stride + c_offset;
  int address_increment = inner_loop_stride * stride;

  auto r_mean = mean[c_offset];
  auto factor = inv_std[c_offset];

  for (int i = 0; i < loop_count; i++) {
    accscalar_t x_input[PARALLEL_LOADS];
    accscalar_t x_grad_output[PARALLEL_LOADS];

    // load multiple data in
#pragma unroll
    for (int j = 0; j < PARALLEL_LOADS; j++) {
      if (c_offset < stride && m_offset < reduction_size) {
        x_input[j] = input[address_base];
        x_grad_output[j] = grad_output[address_base];
      } else {
        x_input[j] = accscalar_t(0);
        x_grad_output[j] = accscalar_t(0);
      }
      m_offset += inner_loop_stride;
      address_base += address_increment;
    }

    // calculate sum_dy / sum_dy_xmu
#pragma unroll
    for (int j = 0; j < PARALLEL_LOADS; j++) {
      sum_dy[j] += x_grad_output[j];
      sum_dy_xmu[j] += x_grad_output[j] * (x_input[j] - r_mean);
    }
  }

  // thread reduction to accumulate sum_dy / sum_dy_xmu between PARALLEL_LOADS
#pragma unroll
  for (int j = 1; j < PARALLEL_LOADS; j++) {
    sum_dy[0] += sum_dy[j];
    sum_dy_xmu[0] += sum_dy_xmu[j];
  }

  // release array of registers
  auto sum_dy_th = sum_dy[0];
  auto sum_dy_xmu_th = sum_dy_xmu[0];

  // block-wise reduction with shared memory (since reduction cannot be done within a warp)
  static __shared__ accscalar_t shmem_sum_dy[MAX_BLOCK_SIZE];
  static __shared__ accscalar_t shmem_sum_dy_xmu[MAX_BLOCK_SIZE];

  merge_block_vertical(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu);

  // grid reduction if needed (coop launch used at the first place)
  if (gridDim.y > 1) {
    volatile accscalar_t* staging_sum_dy = staging_data;
    volatile accscalar_t* staging_sum_dy_xmu = &staging_data[stride * gridDim.y];

    address_base = c_offset + blockIdx.y * stride;
    // write data to staging_data;
    if (threadIdx.y == 0 && c_offset < stride) {
      staging_sum_dy[address_base] = sum_dy_th;
      staging_sum_dy_xmu[address_base] = sum_dy_xmu_th;
    }

    __threadfence();
    __syncthreads();  // ensuring writes to staging_ is visible to all blocks

    __shared__ bool is_last_block_done;
    // mark block done
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      int old = atomicAdd(&semaphores[blockIdx.x], 1);
      is_last_block_done = (old == (gridDim.y - 1));
    }

    __syncthreads();

    // check that all data is now available in global memory
    if (is_last_block_done) {
      sum_dy_th = accscalar_t(0.0);
      sum_dy_xmu_th = accscalar_t(0.0);

      for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) {
        address_base = c_offset + y * stride;
        sum_dy_th += (c_offset < stride ? staging_sum_dy[address_base] : accscalar_t(0.0));
        sum_dy_xmu_th += (c_offset < stride ? staging_sum_dy_xmu[address_base] : accscalar_t(0.0));
      }

      merge_block_vertical(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu);
      if (threadIdx.y == 0 && c_offset < stride) {
        if (grad_bias != NULL) {
          grad_bias[c_offset] = static_cast<layerscalar_t>(sum_dy_th);
        }
        if (grad_weight != NULL) {
          grad_weight[c_offset] = static_cast<layerscalar_t>(sum_dy_xmu_th * factor);
        }
        // mean_dy[c_offset] = sum_dy_th / reduction_size;
        // mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size;
        sum_dy_o[c_offset] = sum_dy_th;
        sum_dy_xmu_o[c_offset] = sum_dy_xmu_th;
      }
    }
  } else {
    if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) {
      if (grad_bias != NULL) {
        grad_bias[c_offset] = static_cast<layerscalar_t>(sum_dy_th);
      }
      if (grad_weight != NULL) {
        grad_weight[c_offset] = static_cast<layerscalar_t>(sum_dy_xmu_th * factor);
      }
      // mean_dy[c_offset] = sum_dy_th / reduction_size;
      // mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size;
      sum_dy_o[c_offset] = sum_dy_th;
      sum_dy_xmu_o[c_offset] = sum_dy_xmu_th;
    }
  }
}

// elementwise BN kernel
template <typename scalar_t, typename accscalar_t, typename layerscalar_t, int PARALLEL_LOADS>
__global__ void batchnorm_backward_c_last_kernel(
    const scalar_t* __restrict__ grad_output, const scalar_t* __restrict__ input, const accscalar_t* __restrict__ mean,
    const accscalar_t* __restrict__ inv_std, const layerscalar_t* __restrict__ weight,
    const accscalar_t* __restrict__ sum_dy, const accscalar_t* __restrict__ sum_dy_xmu, const int* __restrict__ numel,
    scalar_t* __restrict__ grad_input, const int64_t world_size, const int reduction_size, const int stride) {
  int64_t div = 0;
  for (int i = 0; i < world_size; i++) {
    div += numel[i];
  }
  // tensor dimension (m,c)
  // loop along m dimension
  int inner_loop_stride = blockDim.y * gridDim.y;

  // offset along m dimension
  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;

  auto m_c = mean[c_offset];
  auto m_dy_c = sum_dy[c_offset] / div;
  auto factor_1_c = inv_std[c_offset];
  auto factor_2_c = (weight == NULL ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[c_offset])) * factor_1_c;
  factor_1_c = factor_1_c * factor_1_c * sum_dy_xmu[c_offset] / div;

  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
  int address_base = m_offset * stride + c_offset;
  int address_increment = inner_loop_stride * stride;

  for (int i = 0; i < loop_count; i++) {
#pragma unroll
    for (int j = 0; j < PARALLEL_LOADS; j++) {
      if (c_offset < stride && m_offset < reduction_size) {
        grad_input[address_base] =
            static_cast<scalar_t>((static_cast<accscalar_t>(grad_output[address_base]) - m_dy_c -
                                   (static_cast<accscalar_t>(input[address_base]) - m_c) * factor_1_c) *
                                  factor_2_c);
      }
      m_offset += inner_loop_stride;
      address_base += address_increment;
    }
  }
}

std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input) {
  const auto batch_size = input.size(0);
  const auto feature_size = input.size(1);

  auto space_size = get_tensor_spatial_size(input);
  auto scalar_type = promote_scalartype(input);

  at::Tensor out_var_biased = at::empty({feature_size}, input.options().dtype(scalar_type));
  at::Tensor out_mean = at::empty({feature_size}, input.options().dtype(scalar_type));

  int block_y = min(h_last_pow2(batch_size), int(MAX_BLOCK_SIZE / 32));
  int block_x = max(1, min(MAX_BLOCK_SIZE / block_y, h_last_pow2(space_size)));
  const dim3 block(block_x, block_y);
  const dim3 grid(feature_size);

  auto stream = at::cuda::getCurrentCUDAStream();

  {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "welford_mean_var_kernel", using accscalar_t = at::acc_type<scalar_t_0, true>;
        welford_kernel<scalar_t_0, accscalar_t, accscalar_t>
        <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t_0>(), out_mean.data_ptr<accscalar_t>(),
                                     out_var_biased.data_ptr<accscalar_t>(), batch_size, feature_size, space_size););
  }

  return {out_mean, out_var_biased};
}

at::Tensor batchnorm_forward_CUDA(const at::Tensor input, const at::Tensor mean, const at::Tensor inv_std,
                                  const at::optional<at::Tensor> weight, const at::optional<at::Tensor> shift) {
  const auto batch_size = input.size(0);
  const auto feature_size = input.size(1);
  at::Tensor out = at::empty_like(input);

  auto space_size = get_tensor_spatial_size(input);

  int block_x = max(32, min(MAX_BLOCK_SIZE, h_last_pow2(space_size) / 4));
  int block_y = max(1, min(MAX_BLOCK_SIZE / block_x, h_last_pow2(batch_size) / 4));
  const dim3 block(block_x, block_y);
  int grid_z = max(1, min(65535, h_last_pow2(space_size) / 4 / block_x));
  int batch_group_size = max(1, min(65535, h_last_pow2(batch_size) / block_y));
  const dim3 grid(feature_size, batch_group_size, grid_z);
  auto stream = at::cuda::getCurrentCUDAStream();

  if (input.scalar_type() == at::ScalarType::Half && weight.has_value() &&
      weight.value().scalar_type() == at::ScalarType::Float) {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_forward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        batchnorm_forward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
            input.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(), inv_std.data_ptr<accscalar_t>(),
            weight.has_value() ? weight.value().data_ptr<accscalar_t>() : NULL,
            shift.has_value() ? shift.value().data_ptr<accscalar_t>() : NULL, out.data_ptr<scalar_t_0>(), space_size,
            batch_size););
  } else {
    if (weight.has_value()) {
      TORCH_CHECK(input.scalar_type() == weight.value().scalar_type(),
                  "input.scalar_type() is not supported with weight.scalar_type()");
    }
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_forward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        batchnorm_forward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
            input.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(), inv_std.data_ptr<accscalar_t>(),
            weight.has_value() ? weight.value().data_ptr<scalar_t_0>() : NULL,
            shift.has_value() ? shift.value().data_ptr<scalar_t_0>() : NULL, out.data_ptr<scalar_t_0>(), space_size,
            batch_size););
  }
  return out;
}

std::vector<at::Tensor> reduce_bn_CUDA(const at::Tensor grad_output, const at::Tensor input, const at::Tensor mean,
                                       const at::Tensor inv_std, const at::optional<at::Tensor> weight) {
  const auto batch_size = input.size(0);
  const auto feature_size = input.size(1);

  auto scalar_type = promote_scalartype(input);

  at::Tensor sum_dy = at::empty({feature_size}, mean.options());
  at::Tensor sum_dy_xmu = at::empty({feature_size}, mean.options());

  at::Tensor grad_weight;
  at::Tensor grad_bias;
  if (weight.has_value()) {
    grad_weight = at::empty({feature_size}, weight.value().options());
    grad_bias = at::empty({feature_size}, weight.value().options());
  } else {
    grad_weight = at::empty({0}, mean.options());
    grad_bias = at::empty({0}, mean.options());
  }

  auto space_size = get_tensor_spatial_size(input);

  int block_y = min(h_last_pow2(batch_size), int(MAX_BLOCK_SIZE / 32));
  int block_x = max(1, min(MAX_BLOCK_SIZE / block_y, h_last_pow2(space_size)));
  const dim3 block(block_x, block_y);
  const dim3 grid(feature_size);
  auto stream = at::cuda::getCurrentCUDAStream();

  if (input.scalar_type() == at::ScalarType::Half && weight.has_value() &&
      weight.value().scalar_type() == at::ScalarType::Float) {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_backward_reduce", using accscalar_t = at::acc_type<scalar_t_0, true>;
        reduce_bn_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
            input.data_ptr<scalar_t_0>(), grad_output.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), sum_dy.data_ptr<accscalar_t>(), sum_dy_xmu.data_ptr<accscalar_t>(),
            weight.has_value() ? grad_weight.data_ptr<accscalar_t>() : NULL,
            weight.has_value() ? grad_bias.data_ptr<accscalar_t>() : NULL, batch_size, feature_size, space_size););
  } else {
    if (weight.has_value()) {
      TORCH_CHECK(input.scalar_type() == weight.value().scalar_type(),
                  "input.scalar_type() is not supported with weight.scalar_type()");
    }
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_backward_reduce", using accscalar_t = at::acc_type<scalar_t_0, true>;
        reduce_bn_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
            input.data_ptr<scalar_t_0>(), grad_output.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), sum_dy.data_ptr<accscalar_t>(), sum_dy_xmu.data_ptr<accscalar_t>(),
            weight.has_value() ? grad_weight.data_ptr<scalar_t_0>() : NULL,
            weight.has_value() ? grad_bias.data_ptr<scalar_t_0>() : NULL, batch_size, feature_size, space_size););
  }

  return {sum_dy, sum_dy_xmu, grad_weight, grad_bias};
}

at::Tensor batchnorm_backward_CUDA(const at::Tensor grad_output, const at::Tensor input, const at::Tensor mean,
                                   const at::Tensor inv_std, const at::optional<at::Tensor> weight,
                                   const at::Tensor sum_dy, const at::Tensor sum_dy_xmu, const at::Tensor count) {
  const auto batch_size = input.size(0);
  const auto feature_size = input.size(1);

  at::Tensor grad_input = at::empty_like(input);

  auto space_size = get_tensor_spatial_size(input);

  int block_x = max(32, min(MAX_BLOCK_SIZE, h_last_pow2(space_size) / 4));
  int block_y = max(1, min(MAX_BLOCK_SIZE / block_x, h_last_pow2(batch_size) / 4));
  const dim3 block(block_x, block_y);
  int grid_z = max(1, min(65535, h_last_pow2(space_size) / 4 / block_x));
  int batch_group_size = max(1, min(65535, h_last_pow2(batch_size) / block_y));
  const dim3 grid(feature_size, batch_group_size, grid_z);

  auto stream = at::cuda::getCurrentCUDAStream();

  if (input.scalar_type() == at::ScalarType::Half && weight.has_value() &&
      weight.value().scalar_type() == at::ScalarType::Float) {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_backward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        batchnorm_backward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
            grad_output.data_ptr<scalar_t_0>(), input.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), weight.has_value() ? weight.value().data_ptr<accscalar_t>() : NULL,
            sum_dy.data_ptr<accscalar_t>(), sum_dy_xmu.data_ptr<accscalar_t>(), count.data_ptr<int>(),
            grad_input.data_ptr<scalar_t_0>(), count.numel(), space_size, batch_size););
  } else {
    if (weight.has_value()) {
      TORCH_CHECK(input.scalar_type() == weight.value().scalar_type(),
                  "input.scalar_type() is not supported with weight.scalar_type()");
    }
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_backward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        batchnorm_backward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
            grad_output.data_ptr<scalar_t_0>(), input.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), weight.has_value() ? weight.value().data_ptr<scalar_t_0>() : NULL,
            sum_dy.data_ptr<accscalar_t>(), sum_dy_xmu.data_ptr<accscalar_t>(), count.data_ptr<int>(),
            grad_input.data_ptr<scalar_t_0>(), count.numel(), space_size, batch_size););
  }

  return grad_input;
}

std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_nodes, const at::Tensor var_biased,
                                              const at::Tensor numel, const float eps) {
  const auto world_size = mean_feature_nodes.size(0);
  const auto feature_size = mean_feature_nodes.size(1);

  at::Tensor out_var = at::empty({feature_size}, var_biased.options());
  at::Tensor inv_std = at::empty_like(out_var);
  at::Tensor out_mean = at::empty_like(out_var);

  at::Tensor mean_feature_nodes_ = mean_feature_nodes.contiguous();
  at::Tensor var_biased_ = var_biased.contiguous();
  at::Tensor numel_ = numel.contiguous();

  // TODO(jie): tile this for memory coalescing!
  const int block = std::min(h_last_pow2(feature_size), MAX_BLOCK_SIZE);
  const int grid = std::max<int>(1, feature_size / block);

  auto stream = at::cuda::getCurrentCUDAStream();

  {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
                            welford_kernel_parallel<scalar_t_0><<<grid, block, 0, stream>>>(
                                mean_feature_nodes_.data_ptr<scalar_t_0>(), var_biased_.data_ptr<scalar_t_0>(),
                                numel_.data_ptr<int>(), out_mean.data_ptr<scalar_t_0>(), out_var.data_ptr<scalar_t_0>(),
                                inv_std.data_ptr<scalar_t_0>(), world_size, feature_size, eps););
  }

  return {out_mean, out_var, inv_std};
}

std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input) {
  const auto stride = input.size(input.ndimension() - 1);
  const auto reduction_size = input.numel() / stride;

  auto scalar_type = promote_scalartype(input);
  auto option = input.options().dtype(scalar_type);

  at::Tensor out_var_biased = at::empty({stride}, option);
  at::Tensor out_mean = at::empty({stride}, option);

  dim3 block;
  dim3 grid;
  flexible_launch_configs(reduction_size, stride, block, grid, true);

  at::Tensor staging_data;
  at::Tensor semaphores;
  if (grid.y > 1) {
    staging_data = at::empty({4 * stride * grid.y}, option);
    semaphores = at::zeros({grid.x}, input.options().dtype(at::kInt));
  }

  auto stream = at::cuda::getCurrentCUDAStream();

  {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "welford_mean_var_c_last", using accscalar_t = at::acc_type<scalar_t_0, true>;
        accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data_ptr<accscalar_t>() : nullptr;
        int* semaphores_ptr = grid.y > 1 ? semaphores.data_ptr<int>() : nullptr;
        welford_kernel_c_last<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER><<<grid, block, 0, stream>>>(
            input.data_ptr<scalar_t_0>(), out_mean.data_ptr<accscalar_t>(), out_var_biased.data_ptr<accscalar_t>(),
            staging_data_ptr, semaphores_ptr, reduction_size, stride););
  }

  return {out_mean, out_var_biased};
}

at::Tensor batchnorm_forward_c_last_CUDA(const at::Tensor input, const at::optional<at::Tensor> z,
                                         const at::Tensor mean, const at::Tensor inv_std,
                                         const at::optional<at::Tensor> weight, const at::optional<at::Tensor> shift,
                                         const bool fuse_relu) {
  const auto stride = input.size(input.ndimension() - 1);
  const auto reduction_size = input.numel() / stride;

  at::Tensor out = at::empty_like(input);

  dim3 block;
  dim3 grid;
  flexible_launch_configs(reduction_size, stride, block, grid);

  auto stream = at::cuda::getCurrentCUDAStream();

  if (input.scalar_type() == at::ScalarType::Half && weight.has_value() &&
      weight.value().scalar_type() == at::ScalarType::Float) {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_forward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
        <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t_0>(),
                                     z.has_value() ? z.value().data_ptr<scalar_t_0>() : NULL,
                                     mean.data_ptr<accscalar_t>(), inv_std.data_ptr<accscalar_t>(),
                                     weight.has_value() ? weight.value().data_ptr<accscalar_t>() : NULL,
                                     shift.has_value() ? shift.value().data_ptr<accscalar_t>() : NULL,
                                     out.data_ptr<scalar_t_0>(), reduction_size, stride, fuse_relu););
  } else {
    if (weight.has_value()) {
      TORCH_CHECK(input.scalar_type() == weight.value().scalar_type(),
                  "input.scalar_type() is not supported with weight.scalar_type()");
    }
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_forward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
        <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t_0>(),
                                     z.has_value() ? z.value().data_ptr<scalar_t_0>() : NULL,
                                     mean.data_ptr<accscalar_t>(), inv_std.data_ptr<accscalar_t>(),
                                     weight.has_value() ? weight.value().data_ptr<scalar_t_0>() : NULL,
                                     shift.has_value() ? shift.value().data_ptr<scalar_t_0>() : NULL,
                                     out.data_ptr<scalar_t_0>(), reduction_size, stride, fuse_relu););
  }
  return out;
}

std::vector<at::Tensor> reduce_bn_c_last_CUDA(const at::Tensor grad_output, const at::Tensor input,
                                              const at::Tensor mean, const at::Tensor inv_std,
                                              const at::optional<at::Tensor> weight) {
  const auto stride = input.size(input.ndimension() - 1);
  const auto reduction_size = input.numel() / stride;

  at::Tensor sumn_dy = at::empty({stride}, mean.options());
  at::Tensor sum_dy_xmu = at::empty({stride}, mean.options());

  at::Tensor grad_weight;
  at::Tensor grad_bias;
  if (weight.has_value()) {
    grad_weight = at::empty({stride}, weight.value().options());
    grad_bias = at::empty({stride}, weight.value().options());
  } else {
    // because I cannot return an uninitialized at::Tensor
    grad_weight = at::empty({0}, mean.options());
    grad_bias = at::empty({0}, mean.options());
  }

  dim3 block;
  dim3 grid;
  flexible_launch_configs(reduction_size, stride, block, grid, true);

  at::Tensor staging_data;
  at::Tensor semaphores;
  if (grid.y > 1) {
    staging_data = at::empty({2 * stride * grid.y}, mean.options());
    semaphores = at::zeros({grid.x}, input.options().dtype(at::kInt));
  }
  auto stream = at::cuda::getCurrentCUDAStream();

  if (input.scalar_type() == at::ScalarType::Half && weight.has_value() &&
      weight.value().scalar_type() == at::ScalarType::Float) {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_backward_reduce", using accscalar_t = at::acc_type<scalar_t_0, true>;
        accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data_ptr<accscalar_t>() : nullptr;
        int* semaphores_ptr = grid.y > 1 ? semaphores.data_ptr<int>() : nullptr;
        reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER><<<grid, block, 0, stream>>>(
            input.data_ptr<scalar_t_0>(), grad_output.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), sumn_dy.data_ptr<accscalar_t>(), sum_dy_xmu.data_ptr<accscalar_t>(),
            weight.has_value() ? grad_weight.data_ptr<accscalar_t>() : NULL,
            weight.has_value() ? grad_bias.data_ptr<accscalar_t>() : NULL, staging_data_ptr, semaphores_ptr,
            reduction_size, stride););
  } else {
    if (weight.has_value()) {
      TORCH_CHECK(input.scalar_type() == weight.value().scalar_type(),
                  "input.scalar_type() is not supported with weight.scalar_type()");
    }
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_backward_reduce", using accscalar_t = at::acc_type<scalar_t_0, true>;
        accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data_ptr<accscalar_t>() : nullptr;
        int* semaphores_ptr = grid.y > 1 ? semaphores.data_ptr<int>() : nullptr;
        reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER><<<grid, block, 0, stream>>>(
            input.data_ptr<scalar_t_0>(), grad_output.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), sumn_dy.data_ptr<accscalar_t>(), sum_dy_xmu.data_ptr<accscalar_t>(),
            weight.has_value() ? grad_weight.data_ptr<scalar_t_0>() : NULL,
            weight.has_value() ? grad_bias.data_ptr<scalar_t_0>() : NULL, staging_data_ptr, semaphores_ptr,
            reduction_size, stride););
  }

  return {sumn_dy, sum_dy_xmu, grad_weight, grad_bias};
}

at::Tensor batchnorm_backward_c_last_CUDA(const at::Tensor grad_output, const at::Tensor input, const at::Tensor mean,
                                          const at::Tensor inv_std, const at::optional<at::Tensor> weight,
                                          const at::Tensor sum_dy, const at::Tensor sum_dy_xmu,
                                          const at::Tensor count) {
  const auto stride = input.size(input.ndimension() - 1);
  const auto reduction_size = input.numel() / stride;

  at::Tensor grad_input = at::empty_like(input);

  dim3 block;
  dim3 grid;
  flexible_launch_configs(reduction_size, stride, block, grid);

  auto stream = at::cuda::getCurrentCUDAStream();

  if (input.scalar_type() == at::ScalarType::Half && weight.has_value() &&
      weight.value().scalar_type() == at::ScalarType::Float) {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_forward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
        <<<grid, block, 0, stream>>>(
            grad_output.data_ptr<scalar_t_0>(), input.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), weight.has_value() ? weight.value().data_ptr<accscalar_t>() : NULL,
            sum_dy.data_ptr<accscalar_t>(), sum_dy_xmu.data_ptr<accscalar_t>(), count.data_ptr<int>(),
            grad_input.data_ptr<scalar_t_0>(), count.numel(), reduction_size, stride););
  } else {
    if (weight.has_value()) {
      TORCH_CHECK(input.scalar_type() == weight.value().scalar_type(),
                  "input.scalar_type() is not supported with weight.scalar_type()");
    }
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_forward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
        <<<grid, block, 0, stream>>>(
            grad_output.data_ptr<scalar_t_0>(), input.data_ptr<scalar_t_0>(), mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), weight.has_value() ? weight.value().data_ptr<scalar_t_0>() : NULL,
            sum_dy.data_ptr<accscalar_t>(), sum_dy_xmu.data_ptr<accscalar_t>(), count.data_ptr<int>(),
            grad_input.data_ptr<scalar_t_0>(), count.numel(), reduction_size, stride););
  }

  return grad_input;
}

at::Tensor relu_backward_c_last_CUDA(const at::Tensor grad_output, const at::Tensor input,
                                     const at::optional<at::Tensor> z, const at::Tensor mean, const at::Tensor inv_std,
                                     const at::optional<at::Tensor> weight, const at::optional<at::Tensor> shift) {
  const auto stride = input.size(input.ndimension() - 1);
  const auto reduction_size = input.numel() / stride;

  at::Tensor out = at::empty_like(input);

  dim3 block;
  dim3 grid;
  flexible_launch_configs(reduction_size, stride, block, grid);

  auto stream = at::cuda::getCurrentCUDAStream();

  if (input.scalar_type() == at::ScalarType::Half && weight.has_value() &&
      weight.value().scalar_type() == at::ScalarType::Float) {
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_forward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        relu_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
        <<<grid, block, 0, stream>>>(grad_output.data_ptr<scalar_t_0>(), input.data_ptr<scalar_t_0>(),
                                     z.has_value() ? z.value().data_ptr<scalar_t_0>() : NULL,
                                     mean.data_ptr<accscalar_t>(), inv_std.data_ptr<accscalar_t>(),
                                     weight.has_value() ? weight.value().data_ptr<accscalar_t>() : NULL,
                                     shift.has_value() ? shift.value().data_ptr<accscalar_t>() : NULL,
                                     out.data_ptr<scalar_t_0>(), reduction_size, stride););
  } else {
    if (weight.has_value()) {
      TORCH_CHECK(input.scalar_type() == weight.value().scalar_type(),
                  "input.scalar_type() is not supported with weight.scalar_type()");
    }
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(
        input.scalar_type(), 0, "batchnorm_forward", using accscalar_t = at::acc_type<scalar_t_0, true>;
        relu_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER><<<grid, block, 0, stream>>>(
            grad_output.data_ptr<scalar_t_0>(), input.data_ptr<scalar_t_0>(),
            z.has_value() ? z.value().data_ptr<scalar_t_0>() : NULL, mean.data_ptr<accscalar_t>(),
            inv_std.data_ptr<accscalar_t>(), weight.has_value() ? weight.value().data_ptr<scalar_t_0>() : NULL,
            shift.has_value() ? shift.value().data_ptr<scalar_t_0>() : NULL, out.data_ptr<scalar_t_0>(), reduction_size,
            stride););
  }
  return out;
}


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = NVIDIAAPEX
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

gh-pages:
	git checkout gh-pages
	rm -rf build
	rm -rf source
	git checkout master -- .
	make html
	rm -rf ../_modules ../_sources ../_static
	mv -fv build/html/* ../
	rm -rf build
	git add -A
	git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/source/_static/css/pytorch_theme.css
================================================
body {
    font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}

/* Default header fonts are ugly */
h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
    font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}

/* Use white for docs background */
.wy-side-nav-search {
    background-color: #fff;
}

.wy-nav-content-wrap, .wy-menu li.current > a  {
    background-color: #fff;
}

@media screen and (min-width: 1400px) {
    .wy-nav-content-wrap {
        background-color: rgba(0, 0, 0, 0.0470588);
    }

    .wy-nav-content {
        background-color: #fff;
    }
}

/* Fixes for mobile */
.wy-nav-top {
    background-color: #fff;
    background-image: url('../img/apex.jpg');
    background-repeat: no-repeat;
    background-position: center;
    padding: 0;
    margin: 0.4045em 0.809em;
    color: #333;
}

.wy-nav-top > a {
    display: none;
}

@media screen and (max-width: 768px) {
    .wy-side-nav-search>a img.logo {
        height: 60px;
    }
}

/* This is needed to ensure that logo above search scales properly */
.wy-side-nav-search a {
    display: block;
}

/* This ensures that multiple constructors will remain in separate lines. */
.rst-content dl:not(.docutils) dt {
    display: table;
}

/* Use our red for literals (it's very similar to the original color) */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
    color: #F05732;
}

.rst-content tt.xref, a .rst-content tt, .rst-content tt.xref,
.rst-content code.xref, a .rst-content tt, a .rst-content code {
    color: #404040;
}

/* Change link colors (except for the menu) */

a {
    color: #F05732;
}

a:hover {
    color: #F05732;
}


a:visited {
    color: #D44D2C;
}

.wy-menu a {
    color: #b3b3b3;
}

.wy-menu a:hover {
    color: #b3b3b3;
}

/* Default footer text is quite big */
footer {
    font-size: 80%;
}

footer .rst-footer-buttons {
    font-size: 125%; /* revert footer settings - 1/80% = 125% */
}

footer p {
    font-size: 100%;
}

/* For hidden headers that appear in TOC tree */
/* see http://stackoverflow.com/a/32363545/3343043 */
.rst-content .hidden-section {
    display: none;
}

nav .hidden-section {
    display: inherit;
}

.wy-side-nav-search>div.version {
    color: #000;
}


================================================
FILE: docs/source/_templates/layout.html
================================================
{% extends "!layout.html" %}
  {% block sidebartitle %} {{ super() }}

  <style>
    /* Sidebar header (and topbar for mobile) */
    .wy-side-nav-search, .wy-nav-top {
      background: #76b900;
    }

    .wy-side-nav-search a:link, .wy-nav-top a:link {
      color: #fff;
    }
    .wy-side-nav-search a:visited, .wy-nav-top a:visited {
      color: #fff;
    }
    .wy-side-nav-search a:hover, .wy-nav-top a:hover {
      color: #fff;
    }

    .wy-menu-vertical a:link, .wy-menu-vertical a:visited {
      color: #d9d9d9
    }

    .wy-menu-vertical a:active {
      background-color: #76b900
    }

    .wy-side-nav-search>div.version {
      color: rgba(0, 0, 0, 0.3)
    }
  </style>
  {% endblock %}

  {% block footer %} {{ super() }}

  <style>
  a:link, a:visited {
    color: #76b900;
  }

  a:hover {
    color: #8c0;
  }

  .rst-content dl:not(.docutils) dt {
    background: rgba(118, 185, 0, 0.1);
    color: rgba(59,93,0,1);
    border-top: solid 3px rgba(59,93,0,1);
  }
  </style>
  {% endblock %}


================================================
FILE: docs/source/conf.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# PyTorch documentation build configuration file, created by
# sphinx-quickstart on Fri Dec 23 13:31:47 2016.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

sys.path.insert(0, os.path.abspath("."))
# sys.path.insert(0, os.path.abspath('../../apex/parallel/'))
# import multiproc
import sphinx_rtd_theme


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.doctest",
    "sphinx.ext.intersphinx",
    "sphinx.ext.todo",
    "sphinx.ext.coverage",
    "sphinx.ext.mathjax",
    "sphinx.ext.napoleon",
    "sphinx.ext.viewcode",
    "sphinx.ext.extlinks",
]

napoleon_use_ivar = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# General information about the project.
project = "Apex"
copyright = "2018"
author = "Christian Sarofeen, Natalia Gimelshein, Michael Carilli, Raul Puri"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# TODO: change to [:2] at v1.0
# version = 'master (' + torch.__version__ + ' )'
version = "0.1"
# The full version, including alpha/beta/rc tags.
# TODO: verify this works as expected
release = "0.1.0"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {
    "collapse_navigation": False,
    "display_version": True,
    "logo_only": True,
}

# html_logo = '_static/img/nv-pytorch2.png'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# html_style_path = 'css/pytorch_theme.css'
html_context = {
    "css_files": [
        "https://fonts.googleapis.com/css?family=Lato",
        "_static/css/pytorch_theme.css",
    ],
}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = "PyTorchdoc"


# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, "apex.tex", "Apex Documentation", "Torch Contributors", "manual"),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "Apex", "Apex Documentation", [author], 1)]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        master_doc,
        "Apex",
        "Apex Documentation",
        author,
        "Apex",
        "One line description of project.",
        "Miscellaneous",
    ),
]


# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
    "python": ("https://docs.python.org/", None),
    "numpy": ("http://docs.scipy.org/doc/numpy/", None),
}

# -- A patch that prevents Sphinx from cross-referencing ivar tags -------
# See http://stackoverflow.com/a/41184353/3343043

from docutils import nodes
from sphinx.util.docfields import TypedField
from sphinx import addnodes


def patched_make_field(self, types, domain, items, **kw):
    # `kw` catches `env=None` needed for newer sphinx while maintaining
    #  backwards compatibility when passed along further down!

    # type: (List, unicode, Tuple) -> nodes.field
    def handle_item(fieldarg, content):
        par = nodes.paragraph()
        par += addnodes.literal_strong("", fieldarg)  # Patch: this line added
        # par.extend(self.make_xrefs(self.rolename, domain, fieldarg,
        #                           addnodes.literal_strong))
        if fieldarg in types:
            par += nodes.Text(" (")
            # NOTE: using .pop() here to prevent a single type node to be
            # inserted twice into the doctree, which leads to
            # inconsistencies later when references are resolved
            fieldtype = types.pop(fieldarg)
            if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text):
                typename = "".join(n.astext() for n in fieldtype)
                typename = typename.replace("int", "python:int")
                typename = typename.replace("long", "python:long")
                typename = typename.replace("float", "python:float")
                typename = typename.replace("type", "python:type")
                par.extend(
                    self.make_xrefs(
                        self.typerolename,
                        domain,
                        typename,
                        addnodes.literal_emphasis,
                        **kw,
                    )
                )
            else:
                par += fieldtype
            par += nodes.Text(")")
        par += nodes.Text(" -- ")
        par += content
        return par

    fieldname = nodes.field_name("", self.label)
    if len(items) == 1 and self.can_collapse:
        fieldarg, content = items[0]
        bodynode = handle_item(fieldarg, content)
    else:
        bodynode = self.list_type()
        for fieldarg, content in items:
            bodynode += nodes.list_item("", handle_item(fieldarg, content))
    fieldbody = nodes.field_body("", bodynode)
    return nodes.field("", fieldname, fieldbody)


TypedField.make_field = patched_make_field


================================================
FILE: docs/source/index.rst
================================================
.. PyTorch documentation master file, created by
   sphinx-quickstart on Fri Dec 23 13:31:47 2016.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

:github_url: https://github.com/nvidia/apex

Apex (A PyTorch Extension)
===================================

This site contains the API documentation for Apex (https://github.com/nvidia/apex),
a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training.  Some of the code here will be included in upstream Pytorch eventually. The intention of Apex is to make up-to-date utilities available to users as quickly as possible.

Installation instructions can be found here:  https://github.com/NVIDIA/apex#quick-start.

Some other useful material, including GTC 2019 and Pytorch DevCon 2019 Slides, can be found here:  https://github.com/mcarilli/mixed_precision_references.

.. toctree::
   :maxdepth: 1
   :caption: Fused Optimizers

   optimizers

.. toctree::
   :maxdepth: 1
   :caption: Fused Layer Norm

   layernorm

..   .. toctree::
     :maxdepth: 1
     :caption: Deprecated mixed precision API
     fp16_util

..   RNN
   
Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`


================================================
FILE: docs/source/layernorm.rst
================================================
.. role:: hidden
    :class: hidden-section

apex.normalization.fused_layer_norm
===================================

.. automodule:: apex.normalization
.. currentmodule:: apex.normalization

.. FusedAdam
   ----------

.. autoclass:: FusedLayerNorm
    :members:

.. autoclass:: FusedRMSNorm
    :members:


================================================
FILE: docs/source/optimizers.rst
================================================
.. role:: hidden
    :class: hidden-section

apex.optimizers
===================================

.. automodule:: apex.optimizers
.. currentmodule:: apex.optimizers

.. FusedAdam
   ----------

.. autoclass:: FusedAdam
    :members:

.. autoclass:: FusedLAMB
    :members:

.. autoclass:: FusedNovoGrad
    :members:

.. autoclass:: FusedSGD
    :members:


================================================
FILE: examples/README.md
================================================
This directory contains examples illustrating Apex mixed precision and distributed tools.

**Note for users of the pre-unification API**:
`deprecated_api` contains examples illustrating the old (pre-unified) APIs.  These APIs will be removed soon, and users are strongly encouraged to switch.  The separate mixed precision tools called `Amp` and `FP16_Optimizer` in the old API are exposed via different flags/optimization levels in the new API.


================================================
FILE: examples/dcgan/README.md
================================================
# Mixed Precision DCGAN Training in PyTorch

`main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/dcgan](https://github.com/pytorch/examples/tree/master/dcgan).
It implements Automatic Mixed Precision (Amp) training of the DCGAN example for different datasets. Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s.  For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html).

We introduce these changes to the PyTorch DCGAN example as described in the [Multiple models/optimizers/losses](https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses) section of the documentation::
```
# Added after models and optimizers construction
[netD, netG], [optimizerD, optimizerG] = amp.initialize(
    [netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3)
...
# loss.backward() changed to:
with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled:
    errD_real_scaled.backward()
...
with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled:
    errD_fake_scaled.backward()
...
with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled:
    errG_scaled.backward()
```

Note that we use different `loss_scalers` for each computed loss.
Using a separate loss scaler per loss is [optional, not required](https://nvidia.github.io/apex/advanced.html#optionally-have-amp-use-a-different-loss-scaler-per-loss).

To improve the numerical stability, we swapped `nn.Sigmoid() + nn.BCELoss()` to `nn.BCEWithLogitsLoss()`.

With the new Amp API **you never need to explicitly convert your model, or the input data, to half().**

"Pure FP32" training:
```
$ python main_amp.py --opt_level O0
```
Recommended mixed precision training:
```
$ python main_amp.py --opt_level O1
```

Have a look at the original [DCGAN example](https://github.com/pytorch/examples/tree/master/dcgan) for more information about the used arguments.

To enable mixed precision training, we introduce the `--opt_level` argument.


================================================
FILE: examples/dcgan/main_amp.py
================================================
from __future__ import print_function
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils

try:    
    from apex import amp
except ImportError:
    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")


parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default='cifar10', help='cifar10 | lsun | mnist |imagenet | folder | lfw | fake')
parser.add_argument('--dataroot', default='./', help='path to dataset')
parser.add_argument('--workers', type=int, help='number of data loading workers', default=2)
parser.add_argument('--batchSize', type=int, default=64, help='input batch size')
parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network')
parser.add_argument('--nz', type=int, default=100, help='size of the latent z vector')
parser.add_argument('--ngf', type=int, default=64)
parser.add_argument('--ndf', type=int, default=64)
parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for')
parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use')
parser.add_argument('--netG', default='', help="path to netG (to continue training)")
parser.add_argument('--netD', default='', help="path to netD (to continue training)")
parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints')
parser.add_argument('--manualSeed', type=int, help='manual seed')
parser.add_argument('--classes', default='bedroom', help='comma separated list of classes for the lsun data set')
parser.add_argument('--opt_level', default='O1', help='amp opt_level, default="O1"')

opt = parser.parse_args()
print(opt)


try:
    os.makedirs(opt.outf)
except OSError:
    pass

if opt.manualSeed is None:
    opt.manualSeed = 2809
print("Random Seed: ", opt.manualSeed)
random.seed(opt.manualSeed)
torch.manual_seed(opt.manualSeed)

cudnn.benchmark = True


if opt.dataset in ['imagenet', 'folder', 'lfw']:
    # folder dataset
    dataset = dset.ImageFolder(root=opt.dataroot,
                               transform=transforms.Compose([
                                   transforms.Resize(opt.imageSize),
                                   transforms.CenterCrop(opt.imageSize),
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
                               ]))
    nc=3
elif opt.dataset == 'lsun':
    classes = [ c + '_train' for c in opt.classes.split(',')]
    dataset = dset.LSUN(root=opt.dataroot, classes=classes,
                        transform=transforms.Compose([
                            transforms.Resize(opt.imageSize),
                            transforms.CenterCrop(opt.imageSize),
                            transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
                        ]))
    nc=3
elif opt.dataset == 'cifar10':
    dataset = dset.CIFAR10(root=opt.dataroot, download=True,
                           transform=transforms.Compose([
                               transforms.Resize(opt.imageSize),
                               transforms.ToTensor(),
                               transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
                           ]))
    nc=3

elif opt.dataset == 'mnist':
        dataset = dset.MNIST(root=opt.dataroot, download=True,
                           transform=transforms.Compose([
                               transforms.Resize(opt.imageSize),
                               transforms.ToTensor(),
                               transforms.Normalize((0.5,), (0.5,)),
                           ]))
        nc=1

elif opt.dataset == 'fake':
    dataset = dset.FakeData(image_size=(3, opt.imageSize, opt.imageSize),
                            transform=transforms.ToTensor())
    nc=3

assert dataset
dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize,
                                         shuffle=True, num_workers=int(opt.workers))

device = torch.device("cuda:0")
ngpu = int(opt.ngpu)
nz = int(opt.nz)
ngf = int(opt.ngf)
ndf = int(opt.ndf)


# custom weights initialization called on netG and netD
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


class Generator(nn.Module):
    def __init__(self, ngpu):
        super(Generator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d(     nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d(ngf * 2,     ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d(    ngf,      nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        if input.is_cuda and self.ngpu > 1:
            output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
        else:
            output = self.main(input)
        return output


netG = Generator(ngpu).to(device)
netG.apply(weights_init)
if opt.netG != '':
    netG.load_state_dict(torch.load(opt.netG))
print(netG)


class Discriminator(nn.Module):
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is (nc) x 64 x 64
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
        )

    def forward(self, input):
        if input.is_cuda and self.ngpu > 1:
            output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
        else:
            output = self.main(input)

        return output.view(-1, 1).squeeze(1)


netD = Discriminator(ngpu).to(device)
netD.apply(weights_init)
if opt.netD != '':
    netD.load_state_dict(torch.load(opt.netD))
print(netD)

criterion = nn.BCEWithLogitsLoss()

fixed_noise = torch.randn(opt.batchSize, nz, 1, 1, device=device)
real_label = 1
fake_label = 0

# setup optimizer
optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))

[netD, netG], [optimizerD, optimizerG] = amp.initialize(
    [netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3)

for epoch in range(opt.niter):
    for i, data in enumerate(dataloader, 0):
        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        # train with real
        netD.zero_grad()
        real_cpu = data[0].to(device)
        batch_size = real_cpu.size(0)
        label = torch.full((batch_size,), real_label, device=device)

        output = netD(real_cpu)
        errD_real = criterion(output, label)
        with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled:
            errD_real_scaled.backward()
        D_x = output.mean().item()

        # train with fake
        noise = torch.randn(batch_size, nz, 1, 1, device=device)
        fake = netG(noise)
        label.fill_(fake_label)
        output = netD(fake.detach())
        errD_fake = criterion(output, label)
        with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled:
            errD_fake_scaled.backward()
        D_G_z1 = output.mean().item()
        errD = errD_real + errD_fake
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        output = netD(fake)
        errG = criterion(output, label)
        with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled:
            errG_scaled.backward()
        D_G_z2 = output.mean().item()
        optimizerG.step()

        print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
              % (epoch, opt.niter, i, len(dataloader),
                 errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))
        if i % 100 == 0:
            vutils.save_image(real_cpu,
                    '%s/real_samples.png' % opt.outf,
                    normalize=True)
            fake = netG(fixed_noise)
            vutils.save_image(fake.detach(),
                    '%s/amp_fake_samples_epoch_%03d.png' % (opt.outf, epoch),
                    normalize=True)

    # do checkpointing
    torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch))
    torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch))


================================================
FILE: examples/docker/Dockerfile
================================================
# Base image must at least have pytorch and CUDA installed.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.03-py3
FROM $BASE_IMAGE
ARG BASE_IMAGE
RUN echo "Installing Apex on top of ${BASE_IMAGE}"
# make sure we don't overwrite some existing directory called "apex"
WORKDIR /tmp/unique_for_apex
# uninstall Apex if present, twice to make absolutely sure :)
RUN pip uninstall -y apex || :
RUN pip uninstall -y apex || :
# SHA is something the user can touch to force recreation of this Docker layer,
# and therefore force cloning of the latest version of Apex
RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git
WORKDIR /tmp/unique_for_apex/apex
RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
WORKDIR /workspace


================================================
FILE: examples/docker/README.md
================================================
## Option 1:  Create a new container with Apex

**Dockerfile** installs the latest Apex on top of an existing image.  Run
```
docker build -t new_image_with_apex .
```
By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image,
which requires an NVIDIA GPU Cloud (NGC) account.  If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).

Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg.
`BASE_IMAGE` must have Pytorch and Cuda installed.  For example, any
`-devel` image for Pytorch 1.0 and later from the
[official Pytorch Dockerhub](https://hub.docker.com/r/pytorch/pytorch) may be used:
```
docker build --build-arg BASE_IMAGE=1.3-cuda10.1-cudnn7-devel -t new_image_with_apex .
```

If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**.

**Warning:**
Currently, the non-`-devel` images on Pytorch Dockerhub do not contain the Cuda compiler `nvcc`.  Therefore,
images whose name does not contain `-devel` are not eligible candidates for `BASE_IMAGE`.

### Running your Apex container

Like any Cuda-enabled Pytorch container, a container with Apex should be run via [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), for example:
```
docker run --runtime=nvidia -it --rm --ipc=host new_image_with_apex
```

## Option 2:  Install Apex in a running container

Instead of building a new container, it is also a viable option to `git clone https://github.com/NVIDIA/apex.git` on bare metal, mount the Apex repo into your container at launch by running, for example,
```
docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container <base image>
```
then go to /apex/in/container within the running container and
```
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
```


================================================
FILE: examples/imagenet/README.md
================================================
# Mixed Precision ImageNet Training in PyTorch

`main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet).
It implements Automatic Mixed Precision (Amp) training of popular model architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset.  Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s.  For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html).

Three lines enable Amp:
```
# Added after model and optimizer construction
model, optimizer = amp.initialize(model, optimizer, flags...)
...
# loss.backward() changed to:
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
```

With the new Amp API **you never need to explicitly convert your model, or the input data, to half().**

## Requirements

- Download the ImageNet dataset and move validation images to labeled subfolders
    - The following script may be helpful: https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh

## Training

To train a model, create softlinks to the Imagenet dataset, then run `main.py` with the desired model architecture, as shown in `Example commands` below.

The default learning rate schedule is set for ResNet50.  `main_amp.py` script rescales the learning rate according to the global batch size (number of distributed processes \* per-process minibatch size).

## Example commands

**Note:**  batch size `--b 224` assumes your GPUs have >=16GB of onboard memory.  You may be able to increase this to 256, but that's cutting it close, so it may out-of-memory for different Pytorch versions.

**Note:**  All of the following use 4 dataloader subprocesses (`--workers 4`) to reduce potential
CPU data loading bottlenecks.

**Note:**  `--opt-level` `O1` and `O2` both use dynamic loss scaling by default unless manually overridden.
`--opt-level` `O0` and `O3` (the "pure" training modes) do not use loss scaling by default.
`O0` and `O3` can be told to use loss scaling via manual overrides, but using loss scaling with `O0`
(pure FP32 training) does not really make sense, and will trigger a warning.

Softlink training and validation datasets into the current directory:
```
$ ln -sf /data/imagenet/train-jpeg/ train
$ ln -sf /data/imagenet/val-jpeg/ val
```

### Summary

Amp allows easy experimentation with various pure and mixed precision options.
```
$ python main_amp.py -a resnet50 --b 128 --workers 4 --opt-level O0 ./
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 ./
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 --keep-batchnorm-fp32 True ./
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 --loss-scale 128.0 ./
$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 --loss-scale 128.0 ./
$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
```
Options are explained below.  Again, the [updated API guide](https://nvidia.github.io/apex/amp.html) provides more detail.

#### `--opt-level O0` (FP32 training) and `O3` (FP16 training)

"Pure FP32" training:
```
$ python main_amp.py -a resnet50 --b 128 --workers 4 --opt-level O0 ./
```
"Pure FP16" training:
```
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 ./
```
FP16 training with FP32 batchnorm:
```
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 --keep-batchnorm-fp32 True ./
```
Keeping the batchnorms in FP32 improves stability and allows Pytorch
to use cudnn batchnorms, which significantly increases speed in Resnet50.

The `O3` options might not converge, because they are not true mixed precision.
However, they can be useful to establish "speed of light" performance for
your model, which provides a baseline for comparison with `O1` and `O2`.
For Resnet50 in particular, `--opt-level O3 --keep-batchnorm-fp32 True` establishes
the "speed of light."  (Without `--keep-batchnorm-fp32`, it's slower, because it does
not use cudnn batchnorm.)

#### `--opt-level O1` (Official Mixed Precision recipe, recommended for typical use)

`O1` patches Torch functions to cast inputs according to a whitelist-blacklist model.
FP16-friendly (Tensor Core) ops like gemms and convolutions run in FP16, while ops
that benefit from FP32, like batchnorm and softmax, run in FP32.
Also, dynamic loss scaling is used by default.
```
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
```
`O1` overridden to use static loss scaling:
```
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 --loss-scale 128.0
```
Distributed training with 2 processes (1 GPU per process, see **Distributed training** below
for more detail)
```
$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
```
For best performance, set `--nproc_per_node` equal to the total number of GPUs on the node
to use all available resources.

#### `--opt-level O2` ("Almost FP16" mixed precision.  More dangerous than O1.)

`O2` exists mainly to support some internal use cases.  Please prefer `O1`.

`O2` casts the model to FP16, keeps batchnorms in FP32,
maintains master weights in FP32, and implements
dynamic loss scaling by default. (Unlike --opt-level O1, --opt-level O2
does not patch Torch functions.)
```
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
```
"Fast mixed precision" overridden to use static loss scaling:
```
$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 --loss-scale 128.0 ./
```
Distributed training with 2 processes (1 GPU per process)
```
$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
```

## Distributed training

`main_amp.py` optionally uses `apex.parallel.DistributedDataParallel` (DDP) for multiprocess training with one GPU per process.
```
model = apex.parallel.DistributedDataParallel(model)
```
is a drop-in replacement for
```
model = torch.nn.parallel.DistributedDataParallel(model,
                                                  device_ids=[arg.local_rank],
                                                  output_device=arg.local_rank)
```
(because Torch DDP permits multiple GPUs per process, with Torch DDP you are required to
manually specify the device to run on and the output device.
With Apex DDP, it uses only the current device by default).

The choice of DDP wrapper (Torch or Apex) is orthogonal to the use of Amp and other Apex tools.  It is safe to use `apex.amp` with either `torch.nn.parallel.DistributedDataParallel` or `apex.parallel.DistributedDataParallel`.  In the future, I may add some features that permit optional tighter integration between `Amp` and `apex.parallel.DistributedDataParallel` for marginal performance benefits, but currently, there's no compelling reason to use Apex DDP versus Torch DDP for most models.

To use DDP with `apex.amp`, the only gotcha is that
```
model, optimizer = amp.initialize(model, optimizer, flags...)
```
must precede
```
model = DDP(model)
```
If DDP wrapping occurs before `amp.initialize`, `amp.initialize` will raise an error.

With both Apex DDP and Torch DDP, you must also call `torch.cuda.set_device(args.local_rank)` within
each process prior to initializing your model or any other tensors.
More information can be found in the docs for the
Pytorch multiprocess launcher module [torch.distributed.launch](https://pytorch.org/docs/stable/distributed.html#launch-utility).

`main_amp.py` is written to interact with 
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility),
which spawns multiprocess jobs using the following syntax:
```
python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_amp.py args...
```
`NUM_GPUS` should be less than or equal to the number of visible GPU devices on the node.  The use of `torch.distributed.launch` is unrelated to the choice of DDP wrapper.  It is safe to use either apex DDP or torch DDP with `torch.distributed.launch`.

Optionally, one can run imagenet with synchronized batch normalization across processes by adding
`--sync_bn` to the `args...`

## Deterministic training (for debugging purposes)

Running with the `--deterministic` flag should produce bitwise identical outputs run-to-run,
regardless of what other options are used (see [Pytorch docs on reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)).
Since `--deterministic` disables `torch.backends.cudnn.benchmark`, `--deterministic` may
cause a modest performance decrease.

## Profiling

If you're curious how the network actually looks on the CPU and GPU timelines (for example, how good is the overall utilization?
Is the prefetcher really overlapping data transfers?) try profiling `main_amp.py`.
[Detailed instructions can be found here](https://gist.github.com/mcarilli/213a4e698e4a0ae2234ddee56f4f3f95).


================================================
FILE: examples/imagenet/main_amp.py
================================================
import argparse
import os
import shutil
import time

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

import numpy as np

from torch.nn.parallel import DistributedDataParallel as DDP

def to_python_float(scalar_tensor: torch.Tensor):
    return scalar_tensor.float().item()

def fast_collate(batch, memory_format):

    imgs = [img[0] for img in batch]
    targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
    w = imgs[0].size[0]
    h = imgs[0].size[1]
    tensor = torch.zeros( (len(imgs), 3, h, w), dtype=torch.uint8).contiguous(memory_format=memory_format)
    for i, img in enumerate(imgs):
        nump_array = np.asarray(img, dtype=np.uint8)
        if(nump_array.ndim < 3):
            nump_array = np.expand_dims(nump_array, axis=-1)
        nump_array = np.rollaxis(nump_array, 2)
        tensor[i] += torch.from_numpy(nump_array)
    return tensor, targets


def parse():
    model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR',
                        help='path to dataset')
    parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18',
                        choices=model_names,
                        help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet18)')
    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--epochs', default=90, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument('-b', '--batch-size', default=256, type=int,
                        metavar='N', help='mini-batch size per process (default: 256)')
    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                        metavar='LR', help='Initial learning rate.  Will be scaled by <global batch size>/256: args.lr = args.lr*float(args.batch_size*args.world_size)/256.  A warmup schedule will also be applied over the first 5 epochs.')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)')
    parser.add_argument('--print-freq', '-p', default=10, type=int,
                        metavar='N', help='print frequency (default: 10)')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                        help='use pre-trained model')

    parser.add_argument('--prof', default=-1, type=int,
                        help='Only run 10 iterations for profiling.')
    parser.add_argument('--deterministic', action='store_true')

    parser.add_argument("--local_rank", default=os.getenv('LOCAL_RANK', 0), type=int)
    parser.add_argument('--sync_bn', action='store_true',
                        help='enabling apex sync BN.')

    parser.add_argument('--opt-level', type=str)
    parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
    parser.add_argument('--loss-scale', type=str, default=None)
    parser.add_argument('--channels-last', type=bool, default=False)
    args = parser.parse_args()
    return args

def main():
    global best_prec1, args

    args = parse()
    print("opt_level = {}".format(args.opt_level))
    print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32), type(args.keep_batchnorm_fp32))
    print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))

    print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))

    cudnn.benchmark = True
    best_prec1 = 0
    if args.deterministic:
        cudnn.benchmark = False
        cudnn.deterministic = True
        torch.manual_seed(args.local_rank)
        torch.set_printoptions(precision=10)

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    if args.channels_last:
        memory_format = torch.channels_last
    else:
        memory_format = torch.contiguous_format

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.sync_bn:
        import apex
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda().to(memory_format=memory_format)

    # Scale learning rate based on global batch size
    args.lr = args.lr*float(args.batch_size*args.world_size)/256.
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.distributed:
        model = DDP(model)
    scaler = torch.amp.GradScaler("cuda")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                global best_prec1
                best_prec1 = checkpoint['best_prec1']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})"
                      .format(args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))
        resume()

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if(args.arch == "inception_v3"):
        raise RuntimeError("Currently, inception_v3 is not supported by this example.")
        # crop_size = 299
        # val_size = 320 # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(crop_size),
            transforms.RandomHorizontalFlip(),
            # transforms.ToTensor(), Too slow
            # normalize,
        ]))
    val_dataset = datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(val_size),
            transforms.CenterCrop(crop_size),
        ]))

    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)

    collate_fn = lambda b: fast_collate(b, memory_format)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=collate_fn)

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True,
        sampler=val_sampler,
        collate_fn=collate_fn)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, scaler, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)

class data_prefetcher():
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self.mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1,3,1,1)
        self.std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1,3,1,1)
        # With Amp, it isn't necessary to manually convert data to half.
        # if args.fp16:
        #     self.mean = self.mean.half()
        #     self.std = self.std.half()
        self.preload()

    def preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        # if record_stream() doesn't work, another option is to make sure device inputs are created
        # on the main stream.
        # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda')
        # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda')
        # Need to make sure the memory allocated for next_* is not still in use by the main stream
        # at the time we start copying to next_*:
        # self.stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)
            # more code for the alternative if record_stream() doesn't work:
            # copy_ will record the use of the pinned source tensor in this side stream.
            # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
            # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
            # self.next_input = self.next_input_gpu
            # self.next_target = self.next_target_gpu

            # With Amp, it isn't necessary to manually convert data to half.
            # if args.fp16:
            #     self.next_input = self.next_input.half()
            # else:
            self.next_input = self.next_input.float()
            self.next_input = self.next_input.sub_(self.mean).div_(self.std)

    def next(self):
        torch.cuda.current_stream().wait_stream(self.stream)
        input = self.next_input
        target = self.next_target
        if input is not None:
            input.record_stream(torch.cuda.current_stream())
        if target is not None:
            target.record_stream(torch.cuda.current_stream())
        self.preload()
        return input, target


def train(train_loader, model, criterion, optimizer, scaler, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    prefetcher = data_prefetcher(train_loader)
    input, target = prefetcher.next()
    i = 0
    while input is not None:
        i += 1
        if args.prof >= 0 and i == args.prof:
            print("Profiling begun at iteration {}".format(i))
            torch.cuda.cudart().cudaProfilerStart()

        if args.prof >= 0: torch.cuda.nvtx.range_push("Body of iteration {}".format(i))

        adjust_learning_rate(optimizer, epoch, i, len(train_loader))

        # compute output
        with torch.autocast(device_type="cuda"):
            if args.prof >= 0: torch.cuda.nvtx.range_push("forward")
            output = model(input)
            if args.prof >= 0: torch.cuda.nvtx.range_pop()
            loss = criterion(output, target)

        # compute gradient and do SGD step
        optimizer.zero_grad()

        if args.prof >= 0: torch.cuda.nvtx.range_push("backward")
        scaler.scale(loss).backward()
        if args.prof >= 0: torch.cuda.nvtx.range_pop()

        # for param in model.parameters():
        #     print(param.data.double().sum().item(), param.grad.data.double().sum().item())

        if args.prof >= 0: torch.cuda.nvtx.range_push("optimizer.step()")
        scaler.step(optimizer)
        scaler.update()
        if args.prof >= 0: torch.cuda.nvtx.range_pop()

        if i%args.print_freq == 0:
            # Every print_freq iterations, check the loss, accuracy, and speed.
            # For best performance, it doesn't make sense to print these metrics every
            # iteration, since they incur an allreduce and some host<->device syncs.

            # Measure accuracy
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

            # Average loss and accuracy across processes for logging
            if args.distributed:
                reduced_loss = reduce_tensor(loss.data)
                prec1 = reduce_tensor(prec1)
                prec5 = reduce_tensor(prec5)
            else:
                reduced_loss = loss.data

            # to_python_float incurs a host<->device sync
            losses.update(to_python_float(reduced_loss), input.size(0))
            top1.update(to_python_float(prec1), input.size(0))
            top5.update(to_python_float(prec5), input.size(0))

            torch.cuda.synchronize()
            batch_time.update((time.time() - end)/args.print_freq)
            end = time.time()

            if args.local_rank == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Speed {3:.3f} ({4:.3f})\t'
                      'Loss {loss.val:.10f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       epoch, i, len(train_loader),
                       args.world_size*args.batch_size/batch_time.val,
                       args.world_size*args.batch_size/batch_time.avg,
                       batch_time=batch_time,
                       loss=losses, top1=top1, top5=top5))
        if args.prof >= 0: torch.cuda.nvtx.range_push("prefetcher.next()")
        input, target = prefetcher.next()
        if args.prof >= 0: torch.cuda.nvtx.range_pop()

        # Pop range "Body of iteration {}".format(i)
        if args.prof >= 0: torch.cuda.nvtx.range_pop()

        if args.prof >= 0 and i == args.prof + 10:
            print("Profiling ended at iteration {}".format(i))
            torch.cuda.cudart().cudaProfilerStop()
            quit()


def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()

    prefetcher = data_prefetcher(val_loader)
    input, target = prefetcher.next()
    i = 0
    while input is not None:
        i += 1

        # compute output
        with torch.no_grad():
            output = model(input)
            loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

        if args.distributed:
            reduced_loss = reduce_tensor(loss.data)
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
        else:
            reduced_loss = loss.data

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # TODO:  Change timings to mirror train().
        if args.local_rank == 0 and i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Speed {2:.3f} ({3:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   i, len(val_loader),
                   args.world_size * args.batch_size / batch_time.val,
                   args.world_size * args.batch_size / batch_time.avg,
                   batch_time=batch_time, loss=losses,
                   top1=top1, top5=top5))

        input, target = prefetcher.next()

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, step, len_epoch):
    """LR schedule that should yield 76% converged accuracy with batch size 256"""
    factor = epoch // 30

    if epoch >= 80:
        factor = factor + 1

    lr = args.lr*(0.1**factor)

    """Warmup"""
    if epoch < 5:
        lr = lr*float(1 + step + epoch*len_epoch)/(5.*len_epoch)

    # if(args.local_rank == 0):
    #     print("epoch = {}, step = {}, lr = {}".format(epoch, step, lr))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= args.world_size
    return rt

if __name__ == '__main__':
    main()


================================================
FILE: examples/simple/distributed/README.md
================================================
**distributed_data_parallel.py** and **run.sh** show an example using Amp with
[apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or
[torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel)
and the Pytorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The use of `Amp` with DistributedDataParallel does not need to change from ordinary 
single-process use.  The only gotcha is that wrapping your model with `DistributedDataParallel` must
come after the call to `amp.initialize`.  Test via
```bash
bash run.sh
```

**This is intended purely as an instructional example, not a performance showcase.**


================================================
FILE: examples/simple/distributed/distributed_data_parallel.py
================================================
import torch
import argparse
import os
from apex import amp
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if 'WORLD_SIZE' in os.environ:
    args.distributed = int(os.environ['WORLD_SIZE']) > 1

if args.distributed:
    # FOR DISTRIBUTED:  Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')

torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16

# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

if args.distributed:
    # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)


================================================
FILE: examples/simple/distributed/run.sh
================================================
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = [
    "setuptools",
    "wheel",
]
build-backend = "setuptools.build_meta"

[tool.ruff]
line-length = 100
ignore = [
    # Sorted by occurrence count (ascending) - easier to fix first
    "E731",  # lambda assignment (6 occurrences)
    "E721",  # type comparison should use isinstance (8 occurrences)
    "E741",  # ambiguous variable name (8 occurrences)
    "E712",  # comparison to True/False (9 occurrences)
    "F403",  # star imports used (9 occurrences)
    "E701",  # multiple statements on one line (10 occurrences)
    "E711",  # comparison to None should be `cond is None` (11 occurrences)
    "F821",  # undefined name (14 occurrences)
    "E722",  # bare except (15 occurrences)
    "E402",  # module level import not at top of file (41 occurrences)
    "F401",  # imported but unused (45 occurrences)
    "F841",  # local variable assigned but never used (52 occurrences)
    "F405",  # star imports (80 occurrences)
]


================================================
FILE: requirements.txt
================================================
cxxfilt>=0.2.0
tqdm>=4.28.1
numpy>=1.15.3
PyYAML>=5.1
pytest>=3.5.1
packaging>=14.0
torch>=2.6.0


================================================
FILE: requirements_dev.txt
================================================
-r requirements.txt
flake8>=3.7.9
Sphinx>=3.0.3

================================================
FILE: setup.py
================================================
import sys
import warnings
import os
import threading
import glob
from packaging.version import parse, Version

from setuptools import setup, find_packages
import subprocess

import torch
from torch.utils.cpp_extension import (
    BuildExtension,
    CppExtension,
    CUDAExtension,
    CUDA_HOME,
    load,
)

# ninja build does not work unless include_dirs are abs path
this_dir = os.path.dirname(os.path.abspath(__file__))

# Allow environment variables to specify build flags for PEP 517 compatibility
ENV_TO_FLAG = {
    "APEX_CPP_EXT": "--cpp_ext",
    "APEX_CUDA_EXT": "--cuda_ext",
    "APEX_XENTROPY": "--xentropy",
    "APEX_FAST_LAYER_NORM": "--fast_layer_norm",
    "APEX_DISTRIBUTED_ADAM": "--distributed_adam",
    "APEX_DISTRIBUTED_LAMB": "--distributed_lamb",
    "APEX_BNP": "--bnp",
    "APEX_GROUP_NORM": "--group_norm",
    "APEX_INDEX_MUL_2D": "--index_mul_2d",
    "APEX_DEPRECATED_FUSED_ADAM": "--deprecated_fused_adam",
    "APEX_DEPRECATED_FUSED_LAMB": "--deprecated_fused_lamb",
    "APEX_FAST_MULTIHEAD_ATTN": "--fast_multihead_attn",
    "APEX_FMHA": "--fmha",
    "APEX_PERMUTATION_SEARCH": "--permutation_search",
    "APEX_FOCAL_LOSS": "--focal_loss",
    "APEX_TRANSDUCER": "--transducer",
    "APEX_CUDNN_GBN": "--cudnn_gbn",
    "APEX_PEER_MEMORY": "--peer_memory",
    "APEX_NCCL_P2P": "--nccl_p2p",
    "APEX_FAST_BOTTLENECK": "--fast_bottleneck",
    "APEX_FUSED_CONV_BIAS_RELU": "--fused_conv_bias_relu",
    "APEX_NCCL_ALLOCATOR": "--nccl_allocator",
    "APEX_GPU_DIRECT_STORAGE": "--gpu_direct_storage",
}
for env_var, flag in ENV_TO_FLAG.items():
    if os.environ.get(env_var, "0") == "1" and flag not in sys.argv:
        print(f"[apex] Detected {env_var}=1, adding {flag} to build flags.")
        sys.argv.append(flag)


FLAG_TO_ENV = {v: k for k, v in ENV_TO_FLAG.items()}
CORE_FLAGS = {"--cpp_ext", "--cuda_ext"}
CONTRIB_FLAGS = set(FLAG_TO_ENV.keys()) - CORE_FLAGS


def has_flag(flag, env_var):
    if flag in sys.argv or os.environ.get(env_var, "0") == "1":
        return True
    if flag in CONTRIB_FLAGS and os.environ.get("APEX_ALL_CONTRIB_EXT", "0") == "1":
        return True
    return False


def get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    bare_metal_version = parse(output[release_idx].split(",")[0])

    return raw_output, bare_metal_version


def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
    raw_output, bare_metal_version = get_cuda_bare_metal_version(cuda_dir)
    torch_binary_version = parse(torch.version.cuda)

    print("\nCompiling cuda extensions with")
    print(raw_output + "from " + cuda_dir + "/bin\n")

    if bare_metal_version != torch_binary_version:
        raise RuntimeError(
            "Cuda extensions are being compiled with a version of Cuda that does "
            "not match the version used to compile Pytorch binaries.  "
            "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda)
            + "In some cases, a minor-version mismatch will not cause later errors:  "
            "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798.  "
            "You can try commenting out this check (at your own risk)."
        )


def raise_if_cuda_home_none(global_option: str) -> None:
    if CUDA_HOME is not None:
        return
    raise RuntimeError(
        f"{global_option} was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  "
        "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, "
        "only images whose names contain 'devel' will provide nvcc."
    )


def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int) -> bool:
    cudnn_available = torch.backends.cudnn.is_available()
    cudnn_version = torch.backends.cudnn.version() if cudnn_available else None
    if not (cudnn_available and (cudnn_version >= required_cudnn_version)):
        warnings.warn(
            f"Skip `{global_option}` as it requires cuDNN {required_cudnn_version} or later, "
            f"but {'cuDNN is not available' if not cudnn_available else cudnn_version}"
        )
        return False
    return True


if not torch.cuda.is_available():
    # https://github.com/NVIDIA/apex/issues/486
    # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(),
    # which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command).
    print(
        "\nWarning: Torch did not find available GPUs on this system.\n",
        "If your intention is to cross-compile, this is not an error.\n"
        "By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2) (until CUDA 12.8),\n"
        "Volta (compute capability 7.0), Turing (compute capability 7.5),\n"
        "and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0, 8.6), and,\n"
        "if the CUDA version is >= 12.8, Blackwell (compute capability 10.0, 12.0).\n"
        "If you wish to cross-compile for a single specific architecture,\n"
        'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n',
    )
    if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None and CUDA_HOME is not None:
        _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
        if bare_metal_version >= Version("13.0"):
            os.environ["TORCH_CUDA_ARCH_LIST"] = "7.5;8.0;8.6;9.0;10.0;11.0;12.0"
        elif bare_metal_version >= Version("12.8"):
            os.environ["TORCH_CUDA_ARCH_LIST"] = "7.0;7.5;8.0;8.6;9.0;10.0;12.0"
        elif bare_metal_version >= Version("11.8"):
            os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6;9.0"
        elif bare_metal_version >= Version("11.1"):
            os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6"
        elif bare_metal_version == Version("11.0"):
            os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
        else:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"

print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split(".")[0])
TORCH_MINOR = int(torch.__version__.split(".")[1])

if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
    raise RuntimeError(
        "Apex requires Pytorch 0.4 or newer.\nThe latest stable release can be obtained from https://pytorch.org/"
    )

cmdclass = {}
ext_modules = []

extras = {}

if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv:
    if TORCH_MAJOR == 0:
        raise RuntimeError(
            "--cpp_ext requires Pytorch 1.0 or later, found torch.__version__ = {}".format(
                torch.__version__
            )
        )

if has_flag("--cpp_ext", "APEX_CPP_EXT"):
    if "--cpp_ext" in sys.argv:
        sys.argv.remove("--cpp_ext")
    ext_modules.append(CppExtension("apex_C", ["csrc/flatten_unflatten.cpp"]))


_, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)

if has_flag("--distributed_adam", "APEX_DISTRIBUTED_ADAM"):
    if "--distributed_adam" in sys.argv:
        sys.argv.remove("--distributed_adam")
    raise_if_cuda_home_none("--distributed_adam")
    ext_modules.append(
        CUDAExtension(
            name="distributed_adam_cuda",
            sources=[
                "apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp",
                "apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3", "--use_fast_math"],
            },
        )
    )

if has_flag("--distributed_lamb", "APEX_DISTRIBUTED_LAMB"):
    if "--distributed_lamb" in sys.argv:
        sys.argv.remove("--distributed_lamb")
    raise_if_cuda_home_none("--distributed_lamb")
    ext_modules.append(
        CUDAExtension(
            name="distributed_lamb_cuda",
            sources=[
                "apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp",
                "apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3", "--use_fast_math"],
            },
        )
    )

if has_flag("--cuda_ext", "APEX_CUDA_EXT"):
    if "--cuda_ext" in sys.argv:
        sys.argv.remove("--cuda_ext")
    raise_if_cuda_home_none("--cuda_ext")
    check_cuda_torch_binary_vs_bare_metal(CUDA_HOME)

    ext_modules.append(
        CUDAExtension(
            name="amp_C",
            sources=[
                "csrc/amp_C_frontend.cpp",
                "csrc/multi_tensor_sgd_kernel.cu",
                "csrc/multi_tensor_scale_kernel.cu",
                "csrc/multi_tensor_axpby_kernel.cu",
                "csrc/multi_tensor_l2norm_kernel.cu",
                "csrc/multi_tensor_l2norm_kernel_mp.cu",
                "csrc/multi_tensor_l2norm_scale_kernel.cu",
                "csrc/multi_tensor_lamb_stage_1.cu",
                "csrc/multi_tensor_lamb_stage_2.cu",
                "csrc/multi_tensor_adam.cu",
                "csrc/multi_tensor_adagrad.cu",
                "csrc/multi_tensor_novograd.cu",
                "csrc/multi_tensor_lamb.cu",
                "csrc/multi_tensor_lamb_mp.cu",
                "csrc/update_scale_hysteresis.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-lineinfo",
                    "-O3",
                    # '--resource-usage',
                    "--use_fast_math",
                ],
            },
        )
    )
    ext_modules.append(
        CUDAExtension(
            name="syncbn",
            sources=["csrc/syncbn.cpp", "csrc/welford.cu"],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3"],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="fused_layer_norm_cuda",
            sources=["csrc/layer_norm_cuda.cpp", "csrc/layer_norm_cuda_kernel.cu"],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-maxrregcount=50", "-O3", "--use_fast_math"],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="mlp_cuda",
            sources=["csrc/mlp.cpp", "csrc/mlp_cuda.cu"],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3"],
            },
        )
    )
    ext_modules.append(
        CUDAExtension(
            name="fused_dense_cuda",
            sources=["csrc/fused_dense.cpp", "csrc/fused_dense_cuda.cu"],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3"],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="scaled_upper_triang_masked_softmax_cuda",
            sources=[
                "csrc/megatron/scaled_upper_triang_masked_softmax.cpp",
                "csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="generic_scaled_masked_softmax_cuda",
            sources=[
                "csrc/megatron/generic_scaled_masked_softmax.cpp",
                "csrc/megatron/generic_scaled_masked_softmax_cuda.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="scaled_masked_softmax_cuda",
            sources=[
                "csrc/megatron/scaled_masked_softmax.cpp",
                "csrc/megatron/scaled_masked_softmax_cuda.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="scaled_softmax_cuda",
            sources=[
                "csrc/megatron/scaled_softmax.cpp",
                "csrc/megatron/scaled_softmax_cuda.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="fused_rotary_positional_embedding",
            sources=[
                "csrc/megatron/fused_rotary_positional_embedding.cpp",
                "csrc/megatron/fused_rotary_positional_embedding_cuda.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                ],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name="fused_weight_gradient_mlp_cuda",
            include_dirs=[os.path.join(this_dir, "csrc")],
            sources=[
                "csrc/megatron/fused_weight_gradient_dense.cpp",
                "csrc/megatron/fused_weight_gradient_dense_cuda.cu",
                "csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                    "--use_fast_math",
                ],
            },
        )
    )

if has_flag("--permutation_search", "APEX_PERMUTATION_SEARCH"):
    if "--permutation_search" in sys.argv:
        sys.argv.remove("--permutation_search")

    if CUDA_HOME is None:
        raise RuntimeError(
            "--permutation_search was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc."
        )
    else:
        cc_flag = ["-Xcompiler", "-fPIC", "-shared"]
        ext_modules.append(
            CUDAExtension(
                name="permutation_search_cuda",
                sources=[
                    "apex/contrib/sparsity/permutation_search_kernels/CUDA_kernels/permutation_search_kernels.cu"
                ],
                include_dirs=[
                    os.path.join(
                        this_dir,
                        "apex",
                        "contrib",
                        "sparsity",
                        "permutation_search_kernels",
                        "CUDA_kernels",
                    )
                ],
                extra_compile_args={"cxx": ["-O3"], "nvcc": ["-O3"] + cc_flag},
            )
        )

if has_flag("--bnp", "APEX_BNP"):
    if "--bnp" in sys.argv:
        sys.argv.remove("--bnp")
    raise_if_cuda_home_none("--bnp")
    ext_modules.append(
        CUDAExtension(
            name="bnp",
            sources=[
                "apex/contrib/csrc/groupbn/batch_norm.cu",
                "apex/contrib/csrc/groupbn/ipc.cu",
                "apex/contrib/csrc/groupbn/interface.cpp",
                "apex/contrib/csrc/groupbn/batch_norm_add_relu.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": [],
                "nvcc": [
                    "-DCUDA_HAS_FP16=1",
                    "-D__CUDA_NO_HALF_OPERATORS__",
                    "-D__CUDA_NO_HALF_CONVERSIONS__",
                    "-D__CUDA_NO_HALF2_OPERATORS__",
                ],
            },
        )
    )

if has_flag("--xentropy", "APEX_XENTROPY"):
    from datetime import datetime

    if "--xentropy" in sys.argv:
        sys.argv.remove("--xentropy")
    raise_if_cuda_home_none("--xentropy")
    xentropy_ver = datetime.today().strftime("%y.%m.%d")
    print(f"`--xentropy` setting version of {xentropy_ver}")
    ext_modules.append(
        CUDAExtension(
            name="xentropy_cuda",
            sources=[
                "apex/contrib/csrc/xentropy/interface.cpp",
                "apex/contrib/csrc/xentropy/xentropy_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"] + [f'-DXENTROPY_VER="{xentropy_ver}"'],
                "nvcc": ["-O3"],
            },
        )
    )

if has_flag("--focal_loss", "APEX_FOCAL_LOSS"):
    if "--focal_loss" in sys.argv:
        sys.argv.remove("--focal_loss")
    raise_if_cuda_home_none("--focal_loss")
    ext_modules.append(
        CUDAExtension(
            name="focal_loss_cuda",
            sources=[
                "apex/contrib/csrc/focal_loss/focal_loss_cuda.cpp",
                "apex/contrib/csrc/focal_loss/focal_loss_cuda_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3", "--use_fast_math", "--ftz=false"],
            },
        )
    )

if has_flag("--group_norm", "APEX_GROUP_NORM"):
    if "--group_norm" in sys.argv:
        sys.argv.remove("--group_norm")
    raise_if_cuda_home_none("--group_norm")

    ext_modules.append(
        CUDAExtension(
            name="group_norm_cuda",
            sources=[
                "apex/contrib/csrc/group_norm/group_norm_nhwc_op.cpp",
            ]
            + glob.glob("apex/contrib/csrc/group_norm/*.cu"),
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3", "-std=c++17"],
                "nvcc": [
                    "-O3",
                    "-std=c++17",
                    "--use_fast_math",
                    "--ftz=false",
                ],
            },
        )
    )

    # CUDA group norm V2 is tested on SM100
    if bare_metal_version >= Version("12.4"):
        if bare_metal_version >= Version("12.8"):
            arch_flags = [
                "-gencode=arch=compute_90,code=sm_90",
                "-gencode=arch=compute_100,code=sm_100",
                "-gencode=arch=compute_120,code=compute_120",
            ]
        else:
            arch_flags = ["-gencode=arch=compute_90,code=compute_90"]

        ext_modules.append(
            CUDAExtension(
                name="group_norm_v2_cuda",
                sources=[
                    "apex/contrib/csrc/group_norm_v2/gn.cpp",
                    "apex/contrib/csrc/group_norm_v2/gn_cuda.cu",
                    "apex/contrib/csrc/group_norm_v2/gn_utils.cpp",
                ]
                + glob.glob("apex/contrib/csrc/group_norm_v2/gn_cuda_inst_*.cu"),
                extra_compile_args={
                    "cxx": ["-O2"],
                    "nvcc": [
                        "-O2",
                        "--use_fast_math",
                        "--ftz=false",
                        "-U__CUDA_NO_HALF_CONVERSIONS__",
                        "-U__CUDA_NO_HALF_OPERATORS__",
                        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                        "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    ]
                    + arch_flags,
                },
            )
        )

if has_flag("--index_mul_2d", "APEX_INDEX_MUL_2D"):
    if "--index_mul_2d" in sys.argv:
        sys.argv.remove("--index_mul_2d")
    raise_if_cuda_home_none("--index_mul_2d")
    ext_modules.append(
        CUDAExtension(
            name="fused_index_mul_2d",
            sources=[
                "apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda.cpp",
                "apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3", "--use_fast_math", "--ftz=false"],
            },
        )
    )

if has_flag("--deprecated_fused_adam", "APEX_DEPRECATED_FUSED_ADAM"):
    if "--deprecated_fused_adam" in sys.argv:
        sys.argv.remove("--deprecated_fused_adam")
    raise_if_cuda_home_none("--deprecated_fused_adam")
    ext_modules.append(
        CUDAExtension(
            name="fused_adam_cuda",
            sources=[
                "apex/contrib/csrc/optimizers/fused_adam_cuda.cpp",
                "apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3", "--use_fast_math"],
            },
        )
    )

if has_flag("--deprecated_fused_lamb", "APEX_DEPRECATED_FUSED_LAMB"):
    if "--deprecated_fused_lamb" in sys.argv:
        sys.argv.remove("--deprecated_fused_lamb")
    raise_if_cuda_home_none("--deprecated_fused_lamb")
    ext_modules.append(
        CUDAExtension(
            name="fused_lamb_cuda",
            sources=[
                "apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp",
                "apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu",
                "csrc/multi_tensor_l2norm_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3", "--use_fast_math"],
            },
        )
    )

# Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h
# See https://github.com/pytorch/pytorch/pull/70650
generator_flag = []
torch_dir = torch.__path__[0]
if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
    generator_flag = ["-DOLD_GENERATOR_PATH"]

if has_flag("--fast_layer_norm", "APEX_FAST_LAYER_NORM"):
    if "--fast_layer_norm" in sys.argv:
        sys.argv.remove("--fast_layer_norm")
    raise_if_cuda_home_none("--fast_layer_norm")

    ext_modules.append(
        CUDAExtension(
            name="fast_layer_norm",
            sources=[
                "apex/contrib/csrc/layer_norm/ln_api.cpp",
                "apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu",
                "apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3"] + generator_flag,
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                    "-U__CUDA_NO_BFLOAT162_OPERATORS__",
                    "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
                    "-I./apex/contrib/csrc/layer_norm/",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                    "--use_fast_math",
                ]
                + generator_flag,
            },
            include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/layer_norm")],
        )
    )

if has_flag("--fmha", "APEX_FMHA"):
    if "--fmha" in sys.argv:
        sys.argv.remove("--fmha")
    raise_if_cuda_home_none("--fmha")

    if bare_metal_version < Version("11.0"):
        raise RuntimeError("--fmha only supported on sm_80 and sm_90 GPUs")

    cc_flag = []
    cc_flag.append("-gencode")
    cc_flag.append("arch=compute_80,code=sm_80")
    if bare_metal_version >= Version("11.8"):
        cc_flag.append("-gencode")
        cc_flag.append("arch=compute_90,code=sm_90")
    if bare_metal_version >= Version("12.8"):
        cc_flag.append("-gencode")
        cc_flag.append("arch=compute_100,code=sm_100")
        cc_flag.append("-gencode")
        cc_flag.append("arch=compute_120,code=sm_120")
    if bare_metal_version >= Version("13.0"):
        cc_flag.append("-gencode")
        cc_flag.append("arch=compute_110,code=sm_110")

    ext_modules.append(
        CUDAExtension(
            name="fmhalib",
            sources=[
                "apex/contrib/csrc/fmha/fmha_api.cpp",
                "apex/contrib/csrc/fmha/src/fmha_fill.cu",
                "apex/contrib/csrc/fmha/src/fmha_noloop_reduce.cu",
                "apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu",
                "apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu",
                "apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu",
                "apex/contrib/csrc/fmha/src/fmha_fprop_fp16_512_64_kernel.sm80.cu",
                "apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu",
                "apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu",
                "apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu",
                "apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3"] + generator_flag,
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                    "--use_fast_math",
                ]
                + generator_flag
                + cc_flag,
            },
            include_dirs=[
                os.path.join(this_dir, "apex/contrib/csrc"),
                os.path.join(this_dir, "apex/contrib/csrc/fmha/src"),
            ],
        )
    )


if has_flag("--fast_multihead_attn", "APEX_FAST_MULTIHEAD_ATTN"):
    if "--fast_multihead_attn" in sys.argv:
        sys.argv.remove("--fast_multihead_attn")
    raise_if_cuda_home_none("--fast_multihead_attn")

    subprocess.run(
        [
            "git",
            "submodule",
            "update",
            "--init",
            "apex/contrib/csrc/multihead_attn/cutlass",
        ]
    )
    ext_modules.append(
        CUDAExtension(
            name="fast_multihead_attn",
            sources=[
                "apex/contrib/csrc/multihead_attn/multihead_attn_frontend.cpp",
                "apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu",
                "apex/contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu",
                "apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu",
                "apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu",
                "apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu",
                "apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu",
                "apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu",
                "apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3"] + generator_flag,
                "nvcc": [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                    "--use_fast_math",
                ]
                + generator_flag,
            },
            include_dirs=[
                os.path.join(this_dir, "apex/contrib/csrc/multihead_attn/cutlass/include/"),
                os.path.join(
                    this_dir,
                    "apex/contrib/csrc/multihead_attn/cutlass/tools/util/include",
                ),
            ],
        )
    )

if has_flag("--transducer", "APEX_TRANSDUCER"):
    if "--transducer" in sys.argv:
        sys.argv.remove("--transducer")
    raise_if_cuda_home_none("--transducer")
    ext_modules.append(
        CUDAExtension(
            name="transducer_joint_cuda",
            sources=[
                "apex/contrib/csrc/transducer/transducer_joint.cpp",
                "apex/contrib/csrc/transducer/transducer_joint_kernel.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3"] + generator_flag,
                "nvcc": ["-O3"] + generator_flag,
            },
            include_dirs=[
                os.path.join(this_dir, "csrc"),
                os.path.join(this_dir, "apex/contrib/csrc/multihead_attn"),
            ],
        )
    )
    ext_modules.append(
        CUDAExtension(
            name="transducer_loss_cuda",
            sources=[
                "apex/contrib/csrc/transducer/transducer_loss.cpp",
                "apex/contrib/csrc/transducer/transducer_loss_kernel.cu",
            ],
            include_dirs=[os.path.join(this_dir, "csrc")],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3"],
            },
        )
    )

if has_flag("--cudnn_gbn", "APEX_CUDNN_GBN"):
    if "--cudnn_gbn" in sys.argv:
        sys.argv.remove("--cudnn_gbn")
    raise_if_cuda_home_none("--cudnn_gbn")
    if check_cudnn_version_and_warn("--cudnn_gbn", 8500):
        subprocess.run(
            [
                "git",
                "submodule",
                "update",
                "--init",
                "apex/contrib/csrc/cudnn-frontend/",
            ]
        )
        ext_modules.append(
            CUDAExtension(
                name="cudnn_gbn_lib",
                sources=[
                    "apex/contrib/csrc/cudnn_gbn/norm_sample.cpp",
                    "apex/contrib/csrc/cudnn_gbn/cudnn_gbn.cpp",
                ],
                include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")],
                extra_compile_args={"cxx": ["-O3", "-g"] + generator_flag},
            )
        )

if has_flag("--peer_memory", "APEX_PEER_MEMORY"):
    if "--peer_memory" in sys.argv:
        sys.argv.remove("--peer_memory")
    raise_if_cuda_home_none("--peer_memory")
    ext_modules.append(
        CUDAExtension(
            name="peer_memory_cuda",
            sources=[
                "apex/contrib/csrc/peer_memory/peer_memory_cuda.cu",
                "apex/contrib/csrc/peer_memory/peer_memory.cpp",
            ],
            extra_compile_args={"cxx": ["-O3"] + generator_flag},
        )
    )

# NOTE: Requires NCCL >= 2.10.3
if has_flag("--nccl_p2p", "APEX_NCCL_P2P"):
    if "--nccl_p2p" in sys.argv:
        sys.argv.remove("--nccl_p2p")
    raise_if_cuda_home_none("--nccl_p2p")
    # Check NCCL version.
    _nccl_version_getter = load(
        name="_nccl_version_getter",
        sources=[
            "apex/contrib/csrc/nccl_p2p/nccl_version.cpp",
            "apex/contrib/csrc/nccl_p2p/nccl_version_check.cu",
        ],
    )
    _available_nccl_version = _nccl_version_getter.get_nccl_version()
    if _available_nccl_version >= (2, 10):
        ext_modules.append(
            CUDAExtension(
                name="nccl_p2p_cuda",
                sources=[
                    "apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu",
                    "apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp",
                ],
                extra_compile_args={"cxx": ["-O3"] + generator_flag},
            )
        )
    else:
        warnings.warn(
            f"Skip `--nccl_p2p` as it requires NCCL 2.10.3 or later, but {_available_nccl_version[0]}.{_available_nccl_version[1]}"
        )

# note (mkozuki): Now `--fast_bottleneck` option (i.e. apex/contrib/bottleneck) depends on `--peer_memory` and `--nccl_p2p`.
if has_flag("--fast_bottleneck", "APEX_FAST_BOTTLENECK"):
    if "--fast_bottleneck" in sys.argv:
        sys.argv.remove("--fast_bottleneck")
    raise_if_cuda_home_none("--fast_bottleneck")
    if check_cudnn_version_and_warn("--fast_bottleneck", 8400):
        subprocess.run(
            [
                "git",
                "submodule",
                "update",
                "--init",
                "apex/contrib/csrc/cudnn-frontend/",
            ]
        )
        ext_modules.append(
            CUDAExtension(
                name="fast_bottleneck",
                sources=["apex/contrib/csrc/bottleneck/bottleneck.cpp"],
                include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")],
                extra_compile_args={"cxx": ["-O3"] + generator_flag},
            )
        )


if has_flag("--fused_conv_bias_relu", "APEX_FUSED_CONV_BIAS_RELU"):
    if "--fused_conv_bias_relu" in sys.argv:
        sys.argv.remove("--fused_conv_bias_relu")
    raise_if_cuda_home_none("--fused_conv_bias_relu")
    if check_cudnn_version_and_warn("--fused_conv_bias_relu", 8400):
        subprocess.run(
            [
                "git",
                "submodule",
                "update",
                "--init",
                "apex/contrib/csrc/cudnn-frontend/",
            ]
        )
        ext_modules.append(
            CUDAExtension(
                name="fused_conv_bias_relu",
                sources=["apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp"],
                include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")],
                extra_compile_args={"cxx": ["-O3"] + generator_flag},
            )
        )


if has_flag("--nccl_allocator", "APEX_NCCL_ALLOCATOR"):
    if "--nccl_allocator" in sys.argv:
        sys.argv.remove("--nccl_allocator")
    raise_if_cuda_home_none("--nccl_allocator")
    _nccl_version_getter = load(
        name="_nccl_version_getter",
        sources=[
            "apex/contrib/csrc/nccl_p2p/nccl_version.cpp",
            "apex/contrib/csrc/nccl_p2p/nccl_version_check.cu",
        ],
    )
    _available_nccl_version = _nccl_version_getter.get_nccl_version()
    if _available_nccl_version >= (2, 19):
        ext_modules.append(
            CUDAExtension(
                name="_apex_nccl_allocator",
                sources=[
                    "apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp",
                ],
                include_dirs=[os.path.join(this_dir, "apex/apex/contrib/csrc/nccl_allocator")],
                libraries=["nccl"],
                extra_compile_args={"cxx": ["-O3"] + generator_flag},
            )
        )
    else:
        warnings.warn(
            f"Skip `--nccl_allocator` as it requires NCCL 2.19 or later, but {_available_nccl_version[0]}.{_available_nccl_version[1]}"
        )


if has_flag("--gpu_direct_storage", "APEX_GPU_DIRECT_STORAGE"):
    if "--gpu_direct_storage" in sys.argv:
        sys.argv.remove("--gpu_direct_storage")
    raise_if_cuda_home_none("--gpu_direct_storage")
    ext_modules.append(
        CUDAExtension(
            name="_apex_gpu_direct_storage",
            sources=[
                "apex/contrib/csrc/gpu_direct_storage/gds.cpp",
                "apex/contrib/csrc/gpu_direct_storage/gds_pybind.cpp",
            ],
            include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/gpu_direct_storage")],
            libraries=["cufile"],
            extra_compile_args={"cxx": ["-O3"] + generator_flag},
        )
    )


# Patch because `setup.py bdist_wheel` and `setup.py develop` do not support the `parallel` option
parallel: int | None = None
if "--parallel" in sys.argv:
    idx = sys.argv.index("--parallel")
    parallel = int(sys.argv[idx + 1])
    sys.argv.pop(idx + 1)
    sys.argv.pop(idx)
else:
    # Check if APEX_PARALLEL_BUILD environment variable is set
    apex_parallel_build = os.environ.get("APEX_PARALLEL_BUILD", None)
    if apex_parallel_build is not None:
        try:
            parallel = int(apex_parallel_build)
            print(
                f"[apex] Using parallel build with {parallel} jobs from APEX_PARALLEL_BUILD environment variable"
            )
        except ValueError:
            print(
                f"[apex] Warning: APEX_PARALLEL_BUILD environment variable '{apex_parallel_build}' is not a valid integer, ignoring"
            )


# Prevent file conflicts when multiple extensions are compiled simultaneously
class BuildExtensionSeparateDir(BuildExtension):
    build_extension_patch_lock = threading.Lock()
    thread_ext_name_map = {}

    def finalize_options(self):
        if parallel is not None:
            self.parallel = parallel
        super().finalize_options()

    def build_extension(self, ext):
        with self.build_extension_patch_lock:
            if not getattr(self.compiler, "_compile_separate_output_dir", False):
                compile_orig = self.compiler.compile

                def compile_new(*args, **kwargs):
                    return compile_orig(
                        *args,
                        **{
                            **kwargs,
                            "output_dir": os.path.join(
                                kwargs["output_dir"],
                                self.thread_ext_name_map[threading.current_thread().ident],
                            ),
                        },
                    )

                self.compiler.compile = compile_new
                self.compiler._compile_separate_output_dir = True
        self.thread_ext_name_map[threading.current_thread().ident] = ext.name
        objects = super().build_extension(ext)
        return objects


setup(
    name="apex",
    version="0.1",
    packages=find_packages(
        exclude=(
            "build",
            "csrc",
            "include",
            "tests",
            "dist",
            "docs",
            "tests",
            "examples",
            "apex.egg-info",
        )
    ),
    install_requires=["packaging>20.6"],
    description="PyTorch Extensions written by NVIDIA",
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtensionSeparateDir} if ext_modules else {},
    extras_require=extras,
)


================================================
FILE: tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
================================================
import importlib.util

import torch
from apex.normalization import FusedLayerNorm
from apex.normalization import FusedRMSNorm
from apex.normalization import MixedFusedLayerNorm
from apex.normalization import MixedFusedRMSNorm

from torch.testing._internal import common_utils
from torch.testing._internal.common_device_type import instantiate_device_type_tests

from itertools import product


def _prep_inputs(batch_size, normalized_shape, dtype):
    shape = (batch_size, *normalized_shape)
    fused = torch.randn(shape).cuda().requires_grad_(True)
    with torch.no_grad():
        native = fused.clone().to(dtype).requires_grad_(True)
    return native, fused


autocast_dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)


class TestFusedLayerNorm(common_utils.TestCase):
    def _test_fused_layer_norm(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
        fwd_thresholds=dict(rtol=None, atol=None),
        bwd_thresholds=dict(rtol=None, atol=None),
    ):
        normalized_shape = [32, 16]

        if not mixed_fused:
            module_cpu_ = FusedLayerNorm(
                normalized_shape=normalized_shape,
                elementwise_affine=elementwise_affine,
                memory_efficient=memory_efficient,
            ).cpu()
            module_cuda_ = FusedLayerNorm(
                normalized_shape=normalized_shape,
                elementwise_affine=elementwise_affine,
                memory_efficient=memory_efficient,
            ).to(device="cuda", dtype=dtype)
        else:
            assert elementwise_affine
            module_cpu_ = MixedFusedLayerNorm(
                normalized_shape=normalized_shape, memory_efficient=memory_efficient
            ).cpu()
            module_cuda_ = MixedFusedLayerNorm(
                normalized_shape=normalized_shape, memory_efficient=memory_efficient
            ).to(device="cuda", dtype=dtype)

        torch.cuda.manual_seed(42)
        if contiguous:
            input_shape = [batch_size] + normalized_shape
            input_ = torch.randn(input_shape, device="cpu").requires_grad_(True)
            input_cuda_ = input_.to(device="cuda", dtype=dtype).detach().requires_grad_(True)
            self.assertTrue(input_.is_contiguous())
            self.assertTrue(input_cuda_.is_contiguous())
        else:
            input_shape = [batch_size] + normalized_shape
            input_shape = [batch_size * 3] + [
                normalized_shape[0] * 5,
                normalized_shape[1] * 3,
            ]
            input_src_ = torch.randn(input_shape, device="cpu")
            input_ = input_src_[::3, ::5, ::3].detach().requires_grad_(True)
            input_cuda_ = (
                input_src_.to(device="cuda", dtype=dtype)[::3, ::5, ::3]
                .detach()
                .requires_grad_(True)
            )
            # make sure that tensors are NOT contiguous.
            self.assertFalse(input_.is_contiguous())
            self.assertFalse(input_cuda_.is_contiguous())
        out_cpu_ = module_cpu_(input_)
        gO = torch.rand_like(out_cpu_)
        out_cpu_.backward(gO)
        out_cuda_ = module_cuda_(input_cuda_)

        gO = gO.to(device="cuda", dtype=dtype)
        out_cuda_.backward(gO)
        self.assertFalse(out_cpu_.is_cuda)
        self.assertTrue(out_cuda_.is_cuda)
        torch.testing.assert_close(
            out_cpu_.to(device="cuda", dtype=dtype), out_cuda_, **fwd_thresholds
        )
        torch.testing.assert_close(
            input_.grad.to(device="cuda", dtype=dtype),
            input_cuda_.grad,
            **bwd_thresholds,
        )

    def _test_fused_rms_norm(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
        fwd_thresholds=dict(rtol=None, atol=None),
        bwd_thresholds=dict(rtol=None, atol=None),
    ):
        normalized_shape = [32, 16]

        if not mixed_fused:
            module_cpu_ = FusedRMSNorm(
                normalized_shape=normalized_shape,
                elementwise_affine=elementwise_affine,
                memory_efficient=memory_efficient,
            ).cpu()
            module_cuda_ = FusedRMSNorm(
                normalized_shape=normalized_shape,
                elementwise_affine=elementwise_affine,
                memory_efficient=memory_efficient,
            ).to(device="cuda", dtype=dtype)
        else:
            assert elementwise_affine
            module_cpu_ = MixedFusedRMSNorm(normalized_shape=normalized_shape).cpu()
            module_cuda_ = MixedFusedRMSNorm(normalized_shape=normalized_shape).to(
                device="cuda", dtype=dtype
            )

        torch.cuda.manual_seed(42)
        if contiguous:
            input_shape = [batch_size] + normalized_shape
            input_ = torch.randn(input_shape, device="cpu").requires_grad_(True)
            input_cuda_ = input_.to(device="cuda", dtype=dtype).detach().requires_grad_(True)
            self.assertTrue(input_.is_contiguous())
            self.assertTrue(input_cuda_.is_contiguous())
        else:
            input_shape = [batch_size] + normalized_shape
            input_shape = [batch_size * 3] + [
                normalized_shape[0] * 5,
                normalized_shape[1] * 3,
            ]
            input_src_ = torch.randn(input_shape, device="cpu")
            input_ = input_src_[::3, ::5, ::3].detach().requires_grad_(True)
            input_cuda_ = (
                input_src_.to(device="cuda", dtype=dtype)[::3, ::5, ::3]
                .detach()
                .requires_grad_(True)
            )
            # make sure that tensors are NOT contiguous.
            self.assertFalse(input_.is_contiguous())
            self.assertFalse(input_cuda_.is_contiguous())
        out_cpu_ = module_cpu_(input_)
        gO = torch.rand_like(out_cpu_)
        out_cpu_.backward(gO)
        out_cuda_ = module_cuda_(input_cuda_)

        torch.testing.assert_close(
            out_cpu_.to(device="cuda", dtype=dtype),
            out_cuda_.clone().detach(),
            **fwd_thresholds,
        )
        gO = gO.to(device="cuda", dtype=dtype)
        out_cuda_.backward(gO)
        self.assertFalse(out_cpu_.is_cuda)
        self.assertTrue(out_cuda_.is_cuda)
        torch.testing.assert_close(
            input_.grad.to(device="cuda", dtype=dtype),
            input_cuda_.grad,
            **bwd_thresholds,
        )
        if elementwise_affine:
            torch.testing.assert_close(
                module_cpu_.weight.grad.to(device="cuda", dtype=dtype),
                module_cuda_.weight.grad,
                **bwd_thresholds,
            )

    # layer norm tests
    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(
            product(
                (16, 65536),
                (True, False),
                (False,),
                (False,),
                (torch.float,),
                (True, False),
            )
        ),
    )
    def test_layer_norm_regular(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_layer_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
        )

    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(
            product(
                (16, 65536),
                (True, False),
                (True,),
                (False,),
                (torch.float,),
                (True, False),
            )
        ),
    )
    def test_layer_norm_elemwise(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_layer_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
        )

    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(
            product(
                (16, 65536),
                (True, False),
                (True,),
                (True,),
                (torch.float,),
                (True, False),
            )
        ),
    )
    def test_layer_norm_mixed(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_layer_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
        )

    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(product((16,), (True, False), (True,), (False,), (torch.half,), (True, False))),
    )
    def test_layer_norm_half(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_layer_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
            fwd_thresholds=dict(rtol=1e-3, atol=1e-3),
            bwd_thresholds=dict(rtol=1e-3, atol=1e-3),
        )

    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(
            product(
                (16,),
                (True, False),
                (True,),
                (False,),
                (torch.bfloat16,),
                (True, False),
            )
        ),
    )
    def test_layer_norm_bfloat16(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_layer_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
            fwd_thresholds=dict(rtol=1.6e-2, atol=3e-4),
            bwd_thresholds=dict(rtol=1.6e-2, atol=3e-3),
        )

    # rms norm tests
    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(
            product(
                (16, 65536),
                (True, False),
                (False,),
                (False,),
                (torch.float,),
                (True, False),
            )
        ),
    )
    def test_rms_norm_regular(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_rms_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
        )

    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(
            product(
                (16, 65536),
                (True, False),
                (True,),
                (False,),
                (torch.float,),
                (True, False),
            )
        ),
    )
    def test_rms_norm_elemwise(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_rms_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
            bwd_thresholds=dict(rtol=2e-3, atol=2e-4),
        )

    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(
            product(
                (16, 65536),
                (True, False),
                (True,),
                (True,),
                (torch.float,),
                (True, False),
            )
        ),
    )
    def test_rms_norm_mixed(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_rms_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
            bwd_thresholds=dict(rtol=2e-3, atol=2e-4),
        )

    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(product((16,), (True, False), (True,), (False,), (torch.half,), (True, False))),
    )
    def test_rms_norm_half(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_rms_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
            bwd_thresholds=dict(rtol=1.6e-2, atol=3e-3),
        )

    @common_utils.parametrize(
        "batch_size, contiguous, elementwise_affine, mixed_fused, dtype, memory_efficient",
        list(
            product(
                (16,),
                (True, False),
                (True,),
                (False,),
                (torch.bfloat16,),
                (True, False),
            )
        ),
    )
    def test_rms_norm_bfloat16(
        self,
        batch_size,
        contiguous,
        elementwise_affine,
        mixed_fused,
        dtype,
        memory_efficient,
    ):
        self._test_fused_rms_norm(
            batch_size,
            contiguous,
            elementwise_affine,
            mixed_fused,
            dtype,
            memory_efficient,
            fwd_thresholds=dict(rtol=1.6e-2, atol=3e-4),
            bwd_thresholds=dict(rtol=1.6e-2, atol=3e-2),
        )

    @common_utils.parametrize(
        "dtype, elementwise_affine, memory_efficient",
        list(product(autocast_dtypes, (True, False), (True, False))),
    )
    def test_autocast_fused_layer_norm(self, dtype, elementwise_affine, memory_efficient):
        bf16_fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
        bf16_bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
        batch_size = 16
        normalized_shape = [32, 16]
        native = torch.nn.LayerNorm(
            normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
        ).to(device="cuda", dtype=dtype)
        fused = FusedLayerNorm(
            normalized_shape=normalized_shape,
            elementwise_affine=elementwise_affine,
            memory_efficient=memory_efficient,
        ).cuda()
        native_x, fused_x = _prep_inputs(batch_size, normalized_shape, dtype)

        expected = native(native_x)
        with torch.amp.autocast("cuda", dtype=dtype):
            actual = fused(fused_x)
        tols = {"rtol": None, "atol": None} if dtype == torch.half else bf16_fwd_thresholds
        # original tests used torch.testing.assert_allclose, which disables dtype checking by default.
        # link to issue here: https://github.com/pytorch/pytorch/issues/61844
        torch.testing.assert_close(actual, expected, **tols, check_dtype=False)

        g_native = torch.rand_like(expected)
        with torch.no_grad():
            g_fused = g_native.clone()
        expected.backward(g_native)
        actual.backward(g_fused)

        if dtype != torch.half:
            tols = bf16_bwd_thresholds
        elif memory_efficient:
            tols = {"rtol": 1e-3, "atol": 1e-4}
        else:
            tols = {"rtol": None, "atol": None}
        torch.testing.assert_close(native_x.grad, fused_x.grad, **tols, check_dtype=False)

    @common_utils.parametrize(
        "dtype, elementwise_affine, memory_efficient",
        list(product(autocast_dtypes, (True, False), (True, False))),
    )
    def test_autocast_fused_rms_norm(self, dtype, elementwise_affine, memory_efficient):
        bf16_fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
        bf16_bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
        batch_size = 16
        normalized_shape = [32, 16]
        native = FusedRMSNorm(
            normalized_shape=normalized_shape,
            elementwise_affine=elementwise_affine,
            memory_efficient=memory_efficient,
        ).to(dtype=dtype)
        fused = FusedRMSNorm(
            normalized_shape=normalized_shape,
            elementwise_affine=elementwise_affine,
            memory_efficient=memory_efficient,
        ).cuda()
        native_x, fused_x = _prep_inputs(batch_size, normalized_shape, dtype)

        expected = native(native_x.cpu())
        with torch.amp.autocast("cuda", dtype=dtype):
            actual = fused(fused_x)
        tols = {"rtol": None, "atol": None} if dtype == torch.half else bf16_fwd_thresholds
        torch.testing.assert_close(
            actual, expected.detach().clone().cuda(), **tols, check_dtype=False
        )

        g_native = torch.rand_like(expected)
        with torch.no_grad():
            g_fused = g_native.detach().clone().cuda()
        expected.backward(g_native)
        actual.backward(g_fused)

        tols = {"rtol": 1e-3, "atol": 1e-3} if dtype == torch.half else bf16_bwd_thresholds
        torch.testing.assert_close(native_x.grad.cuda(), fused_x.grad, **tols, check_dtype=False)

    def _verify_export(self, fused, fused_x):
        if importlib.util.find_spec("onnxscript") is None:
            self.skipTest("`onnxscript` is not found")
        # check that export() is working
        import io

        f = io.BytesIO()
        torch.onnx.export(
            fused,
            (fused_x,),
            f,
            input_names=["x_in"],
            opset_version=18,
        )
        # Load the ONNX model
        import onnx

        model_onnx = onnx.load_from_string(f.getvalue())
        # Get string representation
        onnx_str = onnx.helper.printable_graph(model_onnx.graph)

        assert "x_in" in onnx_str
        assert "ReduceMean" in onnx_str or "LayerNormalization" in onnx_str

    def test_rms_export(self):
        batch_size = 16
        normalized_shape = [32, 16]
        fused = FusedRMSNorm(normalized_shape=normalized_shape, elementwise_affine=True).cuda()
        fused_m = MixedFusedRMSNorm(normalized_shape=normalized_shape).cuda()
        native_x, fused_x = _prep_inputs(batch_size, normalized_shape, torch.float32)
        self._verify_export(fused, fused_x)
        self._verify_export(fused_m, fused_x)

    def test_layer_norm_export(self):
        batch_size = 16
        normalized_shape = [32, 16]
        fused = FusedLayerNorm(normalized_shape=normalized_shape, elementwise_affine=True).cuda()
        fused_m = MixedFusedLayerNorm(normalized_shape=normalized_shape).cuda()
        native_x, fused_x = _prep_inputs(batch_size, normalized_shape, torch.float32)
        self._verify_export(fused, fused_x)
        self._verify_export(fused_m, fused_x)

    @common_utils.parametrize("elementwise_affine", (True, False))
    def test_compile_fused_layer_norm(self, elementwise_affine):
        batch_size = 16
        normalized_shape = [32, 16]
        eager_mod = FusedLayerNorm(
            normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
        ).cuda()
        compiled_mod = torch.compile(fullgraph=True)(eager_mod)
        input_shape = [batch_size] + normalized_shape
        eager_x = torch.randn(input_shape, device="cuda").requires_grad_(True)
        compiled_x = eager_x.detach().clone().requires_grad_(True)

        expected = eager_mod(eager_x)
        actual = compiled_mod(compiled_x)
        torch.testing.assert_close(actual, expected.detach())

        g_eager = torch.rand_like(expected)
        with torch.no_grad():
            g_compiled = g_eager.detach().clone()
        expected.backward(g_eager)
        actual.backward(g_compiled)

        torch.testing.assert_close(eager_x.grad, compiled_x.grad)

    @common_utils.parametrize("elementwise_affine", (True, False))
    def test_compile_fused_rms_norm(self, elementwise_affine):
        batch_size = 16
        normalized_shape = [32, 16]
        eager_mod = FusedRMSNorm(
            normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
        ).cuda()
        compiled_mod = torch.compile(fullgraph=True)(eager_mod)
        input_shape = [batch_size] + normalized_shape
        eager_x = torch.randn(input_shape, device="cuda").requires_grad_(True)
        compiled_x = eager_x.detach().clone().requires_grad_(True)

        expected = eager_mod(eager_x)
        actual = compiled_mod(compiled_x)
        torch.testing.assert_close(actual, expected.detach())

        g_eager = torch.rand_like(expected)
        with torch.no_grad():
            g_compiled = g_eager.detach().clone()
        expected.backward(g_eager)
        actual.backward(g_compiled)

        torch.testing.assert_close(eager_x.grad, compiled_x.grad)


instantiate_device_type_tests(TestFusedLayerNorm, globals(), only_for=("cuda",))
if __name__ == "__main__":
    common_utils.run_tests()


================================================
FILE: tests/L0/run_mlp/test_mlp.py
================================================
"""Tests for c++ MLP"""

from itertools import product
from time import time

import torch
from torch import nn
from torch.testing._internal import common_utils
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_cuda import tf32_off

from apex.mlp import MLP


batch_size = 1024
mlp_sizes = [480, 1024, 1024, 512, 256, 1]
num_iters = 10


# note(crcrpar): On Ampere, this test should be run without TF32 enabled.
class TestMLP(common_utils.TestCase):
    def test_creation(self):
        MLP(mlp_sizes)

    def test_numeric(self):
        mlp = MLP(mlp_sizes).cuda()

        mlp_layers = []
        for i in range(mlp.num_layers):
            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
            with torch.no_grad():
                mlp.weights[i].copy_(linear.weight)
                mlp.biases[i].copy_(linear.bias)
            mlp_layers.append(linear)
            mlp_layers.append(nn.ReLU())

        ref_mlp = nn.Sequential(*mlp_layers).cuda()

        test_input = (
            torch.empty(batch_size, mlp_sizes[0], device="cuda")
            .uniform_(-1.0, 1.0)
            .requires_grad_()
        )
        ref_input = test_input.clone().detach().requires_grad_()
        mlp_out = mlp(test_input)
        ref_out = ref_mlp(ref_input)
        self.assertEqual(mlp_out, ref_out)

        # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
        mlp_out.mean().mul(10.0).backward()
        ref_out.mean().mul(10.0).backward()
        self.assertEqual(test_input.grad, ref_input.grad)
        self.assertEqual(mlp.biases[0].grad, ref_mlp[0].bias.grad)

    def _test_mlp_impl(self, use_activation: str, bias: bool, enable_autocast: bool):
        mlp = MLP(mlp_sizes, bias=bias, activation=use_activation).cuda()

        mlp_layers = []
        for i in range(mlp.num_layers):
            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=bias)
            with torch.no_grad():
                mlp.weights[i].copy_(linear.weight)
                if bias:
                    mlp.biases[i].copy_(linear.bias)
            mlp_layers.append(linear)
            if use_activation == "relu":
                mlp_layers.append(nn.ReLU())
            if use_activation == "sigmoid":
                mlp_layers.append(nn.Sigmoid())

        ref_mlp = nn.Sequential(*mlp_layers).cuda()

        test_input = (
            torch.empty(batch_size, mlp_sizes[0], device="cuda")
            .uniform_(-1.0, 1.0)
            .requires_grad_()
        )
        ref_input = test_input.clone().detach().requires_grad_()

        with torch.cuda.amp.autocast_mode.autocast(enabled=enable_autocast):
            mlp_out = mlp(test_input)
            mlp_loss = mlp_out.mean().mul(10.0)
            # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
            ref_out = ref_mlp(ref_input)
            ref_loss = ref_out.mean().mul(10.0)

        mlp_loss.backward()
        ref_loss.backward()
        if enable_autocast:
            self.assertEqual(mlp_out.dtype, torch.float16)
            self.assertEqual(ref_out.dtype, torch.float16)
        else:
            self.assertEqual(mlp_out, ref_out)
            self.assertEqual(test_input.grad, ref_input.grad)
            self.assertEqual(mlp.weights[0].grad, ref_mlp[0].weight.grad)

    @tf32_off()
    @common_utils.parametrize(
        "use_activation,bias",
        list(product(("none", "relu", "sigmoid"), (True, False))),
    )
    def test_mlp(self, use_activation: str, bias: bool):
        self._test_mlp_impl(use_activation, bias, enable_autocast=False)

    @common_utils.parametrize(
        "use_activation,bias",
        list(product(("none", "relu", "sigmoid"), (True, False))),
    )
    def test_mlp_autocast_fp16(self, use_activation: str, bias: bool):
        self._test_mlp_impl(use_activation, bias, enable_autocast=True)

    def test_no_grad(self):
        mlp = MLP(mlp_sizes).cuda()

        mlp_layers = []
        for i in range(mlp.num_layers):
            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
            with torch.no_grad():
                mlp.weights[i].copy_(linear.weight)
                mlp.biases[i].copy_(linear.bias)
            mlp_layers.append(linear)
            mlp_layers.append(nn.ReLU(inplace=True))

        ref_mlp = nn.Sequential(*mlp_layers).cuda()

        test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1.0, 1.0)
        ref_input = test_input.clone().detach()
        mlp_out = mlp(test_input)
        ref_out = ref_mlp(ref_input)
        self.assertEqual(mlp_out, ref_out)

        # Use mean value as scalar loss. Multiply 10 to make it big enough not zero out
        mlp_out.mean().mul(10.0).backward()
        ref_out.mean().mul(10.0).backward()
        self.assertEqual(mlp.weights[0].grad, ref_mlp[0].weight.grad)

    def test_performance_half(self):
        mlp = MLP(mlp_sizes).cuda().half()

        mlp_layers = []
        for i in range(mlp.num_layers):
            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
            mlp.weights[i].data.copy_(linear.weight)
            mlp.biases[i].data.copy_(linear.bias)
            mlp_layers.append(linear)
            mlp_layers.append(nn.ReLU(inplace=True))

        ref_mlp = nn.Sequential(*mlp_layers).cuda().half()

        test_input = (
            torch.empty(batch_size, mlp_sizes[0], device="cuda", dtype=torch.half)
            .fill_(10.0)
            .requires_grad_()
        )
        ref_input = (
            torch.empty(batch_size, mlp_sizes[0], device="cuda", dtype=torch.half)
            .fill_(10.0)
            .requires_grad_()
        )

        # Warm up GPU
        for _ in range(100):
            ref_out = ref_mlp(ref_input)
            ref_loss = ref_out.mean()
            ref_mlp.zero_grad()
            ref_loss.backward()
            mlp_out = mlp(test_input)
            test_loss = mlp_out.mean()
            mlp.zero_grad()
            test_loss.backward()

        torch.cuda.profiler.start()
        torch.cuda.synchronize()
        start_time = time()
        for _ in range(num_iters):
            ref_out = ref_mlp(ref_input)
            ref_loss = ref_out.mean()
            ref_mlp.zero_grad()
            ref_loss.backward()
        torch.cuda.synchronize()
        stop_time = time()
        ref_time = (stop_time - start_time) * 1000.0 / num_iters
        print(f"\nPytorch MLP time {ref_time:.4f} ms")

        torch.cuda.synchronize()
        start_time = time()
        for _ in range(num_iters):
            mlp_out = mlp(test_input)
            test_loss = mlp_out.mean()
            mlp.zero_grad()
            test_loss.backward()
        torch.cuda.synchronize()
        stop_time = time()
        actual_time = (stop_time - start_time) * 1000.0 / num_iters
        print(f"C++ MLP time {actual_time:.4f} ms")
        torch.cuda.profiler.stop()
        self.assertLessEqual(
            actual_time,
            ref_time,
            msg=f"Custom extension took {actual_time:.4f} while PyTorch took {ref_time:.4f}",
        )


instantiate_device_type_tests(TestMLP, globals(), only_for=("cuda",))


if __name__ == "__main__":
    common_utils.run_tests()


================================================
FILE: tests/L0/run_optimizers/__init__.py
================================================


================================================
FILE: tests/L0/run_optimizers/test_adam.py
================================================
import copy
import unittest

import torch
from torch import nn
from torch.testing._internal.common_device_type import largeTensorTest

try:
    import apex
except ImportError:
    HAS_APEX = False
else:
    HAS_APEX = True


class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(256, 120)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(120, 84)
        self.relu4 = nn.ReLU()
        self.fc3 = nn.Linear(84, 10)
        self.relu5 = nn.ReLU()

    def forward(self, x):
        y = self.conv1(x)
        y = self.relu1(y)
        y = self.pool1(y)
        y = self.conv2(y)
        y = self.relu2(y)
        y = self.pool2(y)
        y = y.reshape(y.shape[0], -1)
        y = self.fc1(y)
        y = self.relu3(y)
        y = self.fc2(y)
        y = self.relu4(y)
        y = self.fc3(y)
        y = self.relu5(y)
        return y


@unittest.skipIf(not HAS_APEX, "`apex` is not found.")
class AdamTest(unittest.TestCase):
    def setUp(self, seed=0):
        super().setUp()
        torch.manual_seed(seed)

        self.model = Model().cuda()
        self.model_ = Model().cuda()
        self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))

        self.lr = 0.00001
        params = [p for p in self.model.parameters() if p.requires_grad]
        self.optimizer = torch.optim.Adam(params, lr=self.lr)

    def testGradScaler(self):
        params_ = [p for p in self.model_.parameters() if p.requires_grad]
        optimizer_ = apex.optimizers.FusedAdam(params_, lr=self.lr, capturable=False)
        scaler = torch.amp.GradScaler("cuda", enabled=True)
        scaler_ = torch.amp.GradScaler("cuda", enabled=True)

        for i in range(100):
            x = torch.rand([32, 1, 28, 28]).cuda().to(memory_format=torch.channels_last)
            x_ = x.clone()
            gt = torch.rand([32, 10]).cuda()
            gt_ = gt.clone()

            # Reference
            with torch.amp.autocast("cuda", enabled=True):
                y = self.model(x)
                loss = ((gt - y) ** 2).mean()

            scaler.scale(loss).backward()
            scaler.step(self.optimizer)
            scaler.update()

            # DUT
            with torch.amp.autocast("cuda", enabled=True):
                y = self.model_(x)
                loss_ = ((gt_ - y) ** 2).mean()

            scaler_.scale(loss_).backward()
            scaler_.step(optimizer_)
            scaler_.update()

            for module in zip(self.model.modules(), self.model_.modules()):
                m = module[0]
                m_ = module[1]
                if isinstance(m, nn.Conv2d) or isinstance(m_, nn.Linear):
                    torch.testing.assert_close(
                        m.weight, m_.weight, atol=1e-3, rtol=1e-3, equal_nan=True
                    )
                    torch.testing.assert_close(
                        m.weight.grad,
                        m_.weight.grad,
                        atol=1e-3,
                        rtol=1e-3,
                        equal_nan=True,
                    )

            # Init for next iteration
            self.optimizer.zero_grad()
            optimizer_.zero_grad()

            self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))

    def testGradScalerCapturable(self):
        params_ = [p for p in self.model_.parameters() if p.requires_grad]
        optimizer_ = apex.optimizers.FusedAdam(params_, lr=self.lr, capturable=True)
        scaler = torch.amp.GradScaler("cuda", enabled=True)
        scaler_ = torch.amp.GradScaler("cuda", enabled=True)

        for i in range(100):
            x = torch.rand([32, 1, 28, 28]).cuda().to(memory_format=torch.channels_last)
            x_ = x.clone()
            gt = torch.rand([32, 10]).cuda()
            gt_ = gt.clone()

            # Reference
            with torch.amp.autocast("cuda", enabled=True):
                y = self.model(x)
                loss = ((gt - y) ** 2).mean()

            scaler.scale(loss).backward()
            scaler.step(self.optimizer)
            scaler.update()

            # DUT
            with torch.amp.autocast("cuda", enabled=True):
                y = self.model_(x)
                loss_ = ((gt_ - y) ** 2).mean()

            scaler_.scale(loss_).backward()
            scaler_.step(optimizer_)
            scaler_.update()

            for module in zip(self.model.modules(), self.model_.modules()):
                m = module[0]
                m_ = module[1]
                if isinstance(m, nn.Conv2d) or isinstance(m_, nn.Linear):
                    torch.testing.assert_close(
                        m.weight, m_.weight, atol=1e-3, rtol=1e-3, equal_nan=True
                    )
                    torch.testing.assert_close(
                        m.weight.grad,
                        m_.weight.grad,
                        atol=1e-3,
                        rtol=1e-3,
                        equal_nan=True,
                    )

            # Init for next iteration
            self.optimizer.zero_grad()
            optimizer_.zero_grad()

            self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))

    def testGradScalerCapturableMaster(self):
        # Cast conv layers to FP16
        for m in self.model_.modules():
            if m.__class__ in [torch.nn.Conv2d]:
                m.half()
        params_ = [p for p in self.model_.parameters() if p.requires_grad]
        optimizer_ = apex.optimizers.FusedAdam(
            params_, lr=self.lr, capturable=True, master_weights=True
        )
        scaler = torch.amp.GradScaler("cuda", enabled=True)
        scaler_ = torch.amp.GradScaler("cuda", enabled=True)

        for i in range(100):
            x = torch.rand([32, 1, 28, 28]).cuda().to(memory_format=torch.channels_last)
            x_ = x.clone()
            gt = torch.rand([32, 10]).cuda()
            gt_ = gt.clone()

            # Reference
            with torch.amp.autocast("cuda", enabled=True):
                y = self.model(x)
                loss = ((gt - y) ** 2).mean()

            scaler.scale(loss).backward()
            scaler.step(self.optimizer)
            scaler.update()

            # DUT
            with torch.amp.autocast("cuda", enabled=True):
                y = self.model_(x)
                loss_ = ((gt_ - y) ** 2).mean()

            scaler_.scale(loss_).backward()
            scaler_.step(optimizer_)
            scaler_.update()

            for module in zip(self.model.modules(), self.model_.modules()):
                m = module[0]
                m_ = module[1]
                if isinstance(m, nn.Conv2d) or isinstance(m_, nn.Linear):
                    torch.testing.assert_close(
                        m.weight,
                        m_.weight.float(),
                        atol=1e-3,
                        rtol=1e-3,
                        equal_nan=True,
                    )
                    torch.testing.assert_close(
                        m.weight.grad,
                        m_.weight.grad.float(),
                        atol=1e-3,
                        rtol=1e-3,
                        equal_nan=True,
                    )

            # Init for next iteration
            self.optimizer.zero_grad()
            optimizer_.zero_grad()

            self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))

    def testNative(self):
        params_ = [p for p in self.model_.parameters() if p.requires_grad]
        optimizer_ = apex.optimizers.FusedAdam(params_, lr=self.lr, capturable=False)

        for i in range(100):
            x = torch.rand([32, 1, 28, 28]).cuda().to(memory_format=torch.channels_last)
            x_ = x.clone()
            gt = torch.rand([32, 10]).cuda()
            gt_ = gt.clone()

            # Reference
            y = self.model(x)
            loss = ((gt - y) ** 2).mean()

            loss.backward()
            self.optimizer.step()

            # DUT
            y = self.model_(x)
            loss_ = ((gt_ - y) ** 2).mean()

            loss_.backward()
            optimizer_.step()

            for module in zip(self.model.modules(), self.model_.modules()):
                m = module[0]
                m_ = module[1]
                if isinstance(m, nn.Conv2d) or isinstance(m_, nn.Linear):
                    torch.testing.assert_close(
                        m.weight, m_.weight, atol=1e-3, rtol=1e-3, equal_nan=True
                    )
                    torch.testing.assert_close(
                        m.weight.grad,
                        m_.weight.grad,
                        atol=1e-3,
                        rtol=1e-3,
                        equal_nan=True,
                    )

            # Init for next iteration
            self.optimizer.zero_grad()
            optimizer_.zero_grad()

            self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))

    @largeTensorTest("60GB", "cuda")
    def testLargeTensor(self):
        t = torch.zeros(2359332864, dtype=torch.half, device="cuda")
        t2 = torch.zeros(2359332864, dtype=torch.half, device="cuda")
        grad = torch.randn_like(t)
        t.grad = grad
        t2.grad = grad
        params = [t]
        params2 = [t2]
        optimizer = apex.optimizers.FusedAdam(params, lr=self.lr)
        optimizer.step()
        optimizer2 = torch.optim.Adam(params2, lr=self.lr)
        torch.testing.assert_close(t, t2)
        torch.cuda.synchronize()


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/L0/run_optimizers/test_fused_novograd.py
================================================
import torch
from torch.optim import Optimizer
import apex
import unittest

from test_fused_optimizer import TestFusedOptimizer
from itertools import product


class Novograd(Optimizer):
    """
    Implements Novograd algorithm.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.95, 0))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        grad_averaging: gradient averaging
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.95, 0),
        eps=1e-8,
        weight_decay=0,
        grad_averaging=False,
        amsgrad=False,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            grad_averaging=grad_averaging,
            amsgrad=amsgrad,
        )

        super(Novograd, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Novograd, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault("amsgrad", False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError("Sparse gradients are not supported.")
                amsgrad = group["amsgrad"]

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state["max_exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device)

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                if amsgrad:
                    max_exp_avg_sq = state["max_exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1

                norm = torch.sum(torch.pow(grad, 2))

                if exp_avg_sq == 0:
                    exp_avg_sq.copy_(norm)
                else:
                    exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2)

                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group["eps"])
                else:
                    denom = exp_avg_sq.sqrt().add_(group["eps"])

                grad.div_(denom)
                if group["weight_decay"] != 0:
                    grad.add_(p.data, alpha=group["weight_decay"])
                if group["grad_averaging"]:
                    grad.mul_(1 - beta1)
                exp_avg.mul_(beta1).add_(grad)

                p.data.add_(exp_avg, alpha=-group["lr"])

        return loss


class TestFusedNovoGrad(TestFusedOptimizer):
    def __init__(self, *args, **kwargs):
        super(TestFusedNovoGrad, self).__init__(*args, **kwargs)

        # The options for NovoGrad and FusedNovoGrad are very specific if they
        # are expected to behave the same.
        self.options = {
            "lr": 1e-3,
            "betas": (0.95, 0),
            "eps": 1e-8,
            "weight_decay": 0,
            "grad_averaging": False,
            "amsgrad": False,
        }

        self.tst_options = {
            "lr": 1e-3,
            "betas": (0.95, 0),
            "eps": 1e-8,
            "weight_decay": 0,
            "grad_averaging": False,
            "amsgrad": False,
            "bias_correction": False,
            "reg_inside_moment": True,
            "norm_type": 2,
            "init_zero": False,
            "set_grad_none": True,
        }

        self.ref_optim = Novograd
        self.fused_optim = apex.optimizers.FusedNovoGrad

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:1", "cuda:0")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                torch.cuda.synchronize()
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            tensors, self.options, self.tst_options
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/L0/run_optimizers/test_fused_optimizer.py
================================================
from itertools import product
import random
import unittest

import torch

import apex


class TestFusedOptimizer(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.manual_seed(9876)

    def tearDown(self):
        pass

    def gen_param_optim(self, tensors, options, tst_options=None):
        # Adding this to make backward compatible with existing tests. Just in
        # case "tst_options" are not provided, it gets a copy of options
        # which contains the parameters for the reference optimizer
        if tst_options == None:
            tst_options = options

        ref_param = []
        tst_param = []
        for tensor in tensors:
            ref_param.append(torch.nn.Parameter(tensor.clone()))
            tst_param.append(torch.nn.Parameter(tensor.clone()))

        ref_optim = self.ref_optim(ref_param, **options)
        tst_optim = self.fused_optim(tst_param, **tst_options)

        return (ref_param, tst_param, ref_optim, tst_optim)

    def gen_grad(self, ref_param, tst_param):
        for p_ref, p_tst in zip(ref_param, tst_param):
            p_ref.grad = torch.rand_like(p_ref)
            p_tst.grad = p_ref.grad

    def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
        half_grads = []
        for p_ref, p_tst in zip(ref_param, tst_param):
            half_grads.append(torch.rand_like(p_ref).half())
            p_ref.grad = half_grads[-1].float() / scale
        return half_grads

    def get_max_diff(self, ref_param, tst_param):
        max_abs_diff = max_rel_diff = 0
        for p_ref, p_tst in zip(ref_param, tst_param):
            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()

            if max_abs_diff_p > max_abs_diff:
                max_abs_diff = max_abs_diff_p
            if max_rel_diff_p > max_rel_diff:
                max_rel_diff = max_rel_diff_p

        return max_abs_diff, max_rel_diff

    def gen_single_type_test(
        self, param_type=torch.float, device="cuda", *, skip_assert: bool = False
    ):
        nelem = 278011

        # Some ref and test optimizers may require different set of options.
        # This is a quick workaround to add that functionality while making
        # minimum changes in existing code.
        # If there is no "tst_options" field provided, safe to initialize
        # the test optimizer with the parameters of reference optimizer.
        if not hasattr(self, "tst_options"):
            self.tst_options = self.options

        tensor = torch.rand(nelem, dtype=param_type, device=device)

        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], self.options, self.tst_options
        )

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            if skip_assert:
                return
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


class TestFusedAdam(TestFusedOptimizer):
    def setUp(self):
        super().setUp()
        self.options = {
            "lr": 5e-4,
            "betas": (0.9, 0.999),
            "eps": 1e-08,
            "weight_decay": 0,
            "amsgrad": False,
        }
        self.ref_optim = torch.optim.Adam
        self.fused_optim = apex.optimizers.FusedAdam

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    # NOTE(mkozuki): Current threshold values look too small for BFloat16.
    # TODO(mkozuki): Refactor `TestFusedOptimizer`
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16, skip_assert=True)

    def test_bfloat16(self):
        self.gen_single_type_test(param_type=torch.bfloat16, skip_assert=True)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    @unittest.skip("Disable until 8/1/2019 adam/adamw upstream picked")
    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(tensors, self.options)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skip("No longer support fuse scaling")
    def test_scale(self):
        nelem = 278011
        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim([tensor], self.options)

        for i in range(self.iters):
            scale = random.random() * 1000
            half_grads = self.gen_mixed_grad(ref_param, tst_param, scale)
            ref_optim.step()
            tst_optim.step(grads=half_grads, scale=scale)
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skip("No longer support output fp16 param")
    def test_fp16_output(self):
        nelem = 278011

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim([tensor], self.options)

        fp16_param = torch.nn.Parameter(tensor.clone().half())

        for i in range(self.iters):
            half_grads = self.gen_mixed_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step(grads=half_grads, output_params=[fp16_param])

            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

            max_abs_diff, max_rel_diff = self.get_max_diff(tst_param, [fp16_param.float()])
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_adam_option(self):
        nelem = 1
        adam_option = {
            "lr": 0.01,
            "betas": (0.6, 0.9),
            "eps": 3e-06,
            "weight_decay": 0,
            "amsgrad": False,
        }

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim([tensor], adam_option)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_frozen_model(self):
        nelem = 1
        adam_option = {
            "lr": 0.01,
            "betas": (0.6, 0.9),
            "eps": 3e-06,
            "weight_decay": 0,
            "amsgrad": False,
        }

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim([tensor], adam_option)

        # Add an empty param group which may occur for pipeline parallel p-tuning
        tst_optim.add_param_group({"params": []})

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


class TestFusedAdagrad(TestFusedOptimizer):
    def __init__(self, *args, **kwargs):
        super(TestFusedAdagrad, self).__init__(*args, **kwargs)
        self.options = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
        self.ref_optim = torch.optim.Adagrad
        self.fused_optim = apex.optimizers.FusedAdagrad

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(tensors, adagrad_option)

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_params_different_devices_throws(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for i, size in enumerate(sizes):
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda:" + str(i % 2)))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(tensors, adagrad_option)
        self.gen_grad(ref_param, tst_param)
        with self.assertRaisesRegex(RuntimeError, "not on the same device"):
            tst_optim.step()

    def test_adagrad_option(self):
        nelem = 1
        adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim([tensor], adagrad_option)

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


class TestFusedSGD(TestFusedOptimizer):
    def __init__(self, *args, **kwargs):
        super(TestFusedSGD, self).__init__(*args, **kwargs)
        self.options = {"lr": 0.25, "momentum": 0.125}
        self.ref_optim = torch.optim.SGD
        self.fused_optim = apex.optimizers.FusedSGD

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/L0/run_optimizers/test_lamb.py
================================================
import unittest
import os

import torch
from torch.optim import Optimizer
import apex
from apex.multi_tensor_apply import multi_tensor_applier
from itertools import product


class RefLAMB(Optimizer):
    r"""Implements Lamb algorithm.

    It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-6)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0.01)

    .. _Large Batch Optimization for Deep Learning: Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super(RefLAMB, self).__init__(params, defaults)
        if multi_tensor_applier.available:
            import amp_C

            self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
            # Skip buffer
            self._dummy_overflow_buf = torch.tensor(
                [0], dtype=torch.int, device=self.param_groups[0]["params"][0].device
            )
            self.multi_tensor_lamb = amp_C.multi_tensor_lamb
        else:
            raise RuntimeError("apex.optimizers.FusedLAMB requires cuda extensions")

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # create separate grad lists for fp32, fp16, and bf16 params
        g_all_32, g_all_16, g_all_bf16 = [], [], []
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                if p.dtype == torch.float32:
                    g_all_32.append(p.grad.data)
                elif p.dtype == torch.float16:
                    g_all_16.append(p.grad.data)
                elif p.dtype == torch.bfloat16:
                    g_all_bf16.append(p.grad.data)
                else:
                    raise RuntimeError("FusedLAMB only support fp16, fp32, and bf16.")

        device = self.param_groups[0]["params"][0].device
        g_norm_32, g_norm_16, g_norm_bf16 = (
            torch.zeros(1, device=device),
            torch.zeros(1, device=device),
            torch.zeros(1, device=device),
        )
        # compute grad norm for two lists
        if len(g_all_32) > 0:
            g_norm_32 = multi_tensor_applier(
                self.multi_tensor_l2norm, self._dummy_overflow_buf, [g_all_32], False
            )[0]
        if len(g_all_16) > 0:
            g_norm_16 = multi_tensor_applier(
                self.multi_tensor_l2norm, self._dummy_overflow_buf, [g_all_16], False
            )[0]
        if len(g_all_bf16) > 0:
            g_norm_bf16 = multi_tensor_applier(
                self.multi_tensor_l2norm, self._dummy_overflow_buf, [g_all_bf16], False
            )[0]

        # blend two grad norms to get global grad norm
        global_grad_norm = multi_tensor_applier(
            self.multi_tensor_l2norm,
            self._dummy_overflow_buf,
            [[g_norm_32, g_norm_16, g_norm_bf16]],
            False,
        )[0]

        max_grad_norm = 1.0
        clipped_ratio = max_grad_norm / max(global_grad_norm, max_grad_norm)

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                p.grad.data *= clipped_ratio
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        "Lamb does not support sparse gradients, consider SparseAdam instad."
                    )

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["m"] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state["v"] = torch.zeros_like(p.data)

                m_t, v_t = state["m"], state["v"]
                beta1, beta2 = group["betas"]

                state["step"] += 1

                # m_t = beta1 * m + (1 - beta1) * g_t
                m_t.mul_(beta1).add_(grad, alpha=1 - beta1)
                # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
                if len(g_all_16) > 0:
                    v_t.mul_(beta2)
                    v_t = v_t.to(torch.float32)
                    grad32 = grad.to(torch.float32)
                    v_t.addcmul_(grad32, grad32, value=1 - beta2)
                else:
                    v_t.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # Debiasing
                m_t_hat = m_t / (1.0 - beta1 ** state["step"])
                v_t_hat = v_t / (1.0 - beta2 ** state["step"])

                update = m_t_hat / v_t_hat.sqrt().add(group["eps"])

                if group["weight_decay"] != 0:
                    update.add_(p.data, alpha=group["weight_decay"])

                trust_ratio = 1.0
                w_norm = p.data.to(torch.float32).pow(2).sum().sqrt()
                g_norm = update.pow(2).sum().sqrt()
                if w_norm > 0 and g_norm > 0:
                    trust_ratio = w_norm / g_norm

                state["w_norm"] = w_norm
                state["g_norm"] = g_norm
                state["trust_ratio"] = trust_ratio

                step_size = group["lr"]

                p.data.add_(update, alpha=-step_size * trust_ratio)

        return loss


class TestLamb(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.cuda.manual_seed(9876)

    def tearDown(self):
        pass

    def gen_param_optim(self, tensors, lamb_option):
        ref_param = []
        tst_param = []
        for tensor in tensors:
            ref_param.append(torch.nn.Parameter(tensor.clone()))
            tst_param.append(torch.nn.Parameter(tensor.clone()))

        ref_optim = self.ref_optim(ref_param, **lamb_option)
        tst_optim = self.tst_optim(tst_param, use_nvlamb=True, **lamb_option)

        return (ref_param, tst_param, ref_optim, tst_optim)

    def gen_grad(self, ref_param, tst_param):
        for p_ref, p_tst in zip(ref_param, tst_param):
            p_ref.grad = torch.rand_like(p_ref)
            p_tst.grad = p_ref.grad

    def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
        half_grads = []
        for p_ref, _ in zip(ref_param, tst_param):
            half_grads.append(torch.rand_like(p_ref).half())
            p_ref.grad = half_grads[-1].float() / scale
        return half_grads

    def gen_single_type_test(self, param_type=torch.float, device="cuda"):
        nelem = 18011
        tensor = torch.rand(nelem, dtype=param_type, device=device)
        weight_decay = [0, 0.01]

        for wd in weight_decay:
            lamb_option = {
                "lr": 5e-4,
                "betas": (0.9, 0.999),
                "eps": 1e-08,
                "weight_decay": wd,
            }
            ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim([tensor], lamb_option)

            if isinstance(tst_optim, apex.optimizers.FusedMixedPrecisionLamb):
                if param_type != torch.float:
                    # joseli: This parameter is usually passed into the constructor,
                    # but I do not want to change the testing interface.
                    # As long as this parameter is set before the first call to step(),
                    # then it should act normally.
                    tst_optim.reduced_precision_dtype = param_type
            for i in range(self.iters):
                self.gen_grad(ref_param, tst_param)
                ref_optim.step()
                torch.cuda.synchronize()
                tst_optim.step()
                torch.cuda.synchronize()
                torch.testing.assert_close(tst_param, ref_param)


class TestFusedLAMB(TestLamb):
    def __init__(self, *args, **kwargs):
        super(TestLamb, self).__init__(*args, **kwargs)
        self.ref_optim = RefLAMB
        self.tst_optim = apex.optimizers.FusedLAMB

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        weight_decay = [0, 0.01]

        for wd in weight_decay:
            lamb_option = {
                "lr": 5e-4,
                "betas": (0.9, 0.999),
                "eps": 1e-08,
                "weight_decay": wd,
            }
            tensors = []
            for size in sizes:
                tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
            ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(tensors, lamb_option)

            for i in range(self.iters):
                self.gen_grad(ref_param, tst_param)
                ref_optim.step()
                tst_optim.step()
                torch.testing.assert_close(tst_param, ref_param)

    def test_lamb_option(self):
        nelem = 1
        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        weight_decay = [0, 0.01]

        for wd in weight_decay:
            lamb_option = {
                "lr": 0.01,
                "betas": (0.6, 0.9),
                "eps": 3e-06,
                "weight_decay": wd,
            }
            ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim([tensor], lamb_option)

            for i in range(self.iters):
                self.gen_grad(ref_param, tst_param)
                ref_optim.step()
                tst_optim.step()
                torch.testing.assert_close(tst_param, ref_param)


class TestFusedMixedPrecisionLamb(TestLamb):
    def __init__(self, *args, **kwargs):
        super(TestLamb, self).__init__(*args, **kwargs)
        self.ref_optim = RefLAMB
        self.tst_optim = apex.optimizers.FusedMixedPrecisionLamb

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    def test_bfloat16(self):
        self.iters = 4
        self.gen_single_type_test(param_type=torch.bfloat16)

    def test_half(self):
        self.iters = 1
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        weight_decay = [0, 0.01]

        for wd in weight_decay:
            lamb_option = {
                "lr": 5e-4,
                "betas": (0.9, 0.999),
                "eps": 1e-08,
                "weight_decay": wd,
            }
            tensors = []
            for size in sizes:
                tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
            ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(tensors, lamb_option)

            for i in range(self.iters):
                self.gen_grad(ref_param, tst_param)
                ref_optim.step()
                tst_optim.step()
                torch.testing.assert_close(tst_param, ref_param)

    def test_lamb_option(self):
        nelem = 1
        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        weight_decay = [0, 0.01]

        for wd in weight_decay:
            lamb_option = {
                "lr": 0.01,
                "betas": (0.6, 0.9),
                "eps": 3e-06,
                "weight_decay": wd,
            }
            ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim([tensor], lamb_option)

            for i in range(self.iters):
                self.gen_grad(ref_param, tst_param)
                ref_optim.step()
                tst_optim.step()
                torch.testing.assert_close(tst_param, ref_param)


if __name__ == "__main__":
    script_path = os.path.dirname(os.path.realpath(__file__))
    unittest.main()


================================================
FILE: tests/L0/run_test.py
================================================
"""L0 Tests Runner.

How to run this script?

1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py` If you want an xml report,
    pass `--xml-report`, i.e. `python /path/to/apex/tests/L0/run_test.py --xml-report` and
    the file is created in `/path/to/apex/tests/L0`.
2. Run one of the tests (e.g. fused layer norm):
    `python /path/to/apex/tests/L0/run_test.py --include run_fused_layer_norm`
3. Run two or more of the tests (e.g. optimizers and fused layer norm):
    `python /path/to/apex/tests/L0/run_test.py --include run_optimizers run_fused_layer_norm`
"""

import argparse
import os
import unittest
import sys


TEST_ROOT = os.path.dirname(os.path.abspath(__file__))
TEST_DIRS = [
    "run_optimizers",
    "run_fused_layer_norm",
    "run_mlp",
]
DEFAULT_TEST_DIRS = [
    "run_optimizers",
    "run_fused_layer_norm",
    "run_mlp",
]


def parse_args():
    parser = argparse.ArgumentParser(
        description="L0 test runner",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--include",
        nargs="+",
        choices=TEST_DIRS,
        default=DEFAULT_TEST_DIRS,
        help="select a set of tests to run (defaults to ALL tests).",
    )
    parser.add_argument(
        "--xml-report",
        default=None,
        action="store_true",
        help="[deprecated] pass this argument to get a junit xml report. Use `--xml-dir`. (requires `xmlrunner`)",
    )
    parser.add_argument(
        "--xml-dir",
        default=None,
        type=str,
        help="Directory to save junit test reports. (requires `xmlrunner`)",
    )
    args, _ = parser.parse_known_args()
    return args


def main(args: argparse.Namespace) -> None:
    test_runner_kwargs = {"verbosity": 2}
    Runner = unittest.TextTestRunner

    xml_dir = None
    if (args.xml_report is not None) or (args.xml_dir is not None):
        if args.xml_report is not None:
            import warnings

            warnings.warn("The option of `--xml-report` is deprecated", FutureWarning)

        import xmlrunner
        from datetime import date  # NOQA

        Runner = xmlrunner.XMLTestRunner
        if args.xml_report:
            xml_dir = os.path.abspath(os.path.dirname(__file__))
        else:
            xml_dir = os.path.abspath(args.xml_dir)
        if not os.path.exists(xml_dir):
            os.makedirs(xml_dir)

    errcode = 0
    for test_dir in args.include:
        if xml_dir is not None:
            xml_output = os.path.join(
                xml_dir,
                f"""TEST_{test_dir}_{date.today().strftime("%y%m%d")}""",
            )
            if not os.path.exists(xml_output):
                os.makedirs(xml_output)
            test_runner_kwargs["output"] = xml_output

        runner = Runner(**test_runner_kwargs)
        test_dir = os.path.join(TEST_ROOT, test_dir)
        suite = unittest.TestLoader().discover(test_dir)

        print("\nExecuting tests from " + test_dir)

        result = runner.run(suite)

        if not result.wasSuccessful():
            errcode = 1

    sys.exit(errcode)


if __name__ == "__main__":
    args = parse_args()
    main(args)


================================================
FILE: tests/L1/common/compare.py
================================================
import argparse
import torch

parser = argparse.ArgumentParser(description="Compare")
parser.add_argument("--opt-level", type=str)
parser.add_argument("--keep-batchnorm-fp32", type=str, default=None)
parser.add_argument("--loss-scale", type=str, default=None)
parser.add_argument("--fused-adam", action="store_true")
parser.add_argument("--use_baseline", action="store_true")
args = parser.parse_args()

base_file = (
    str(args.opt_level)
    + "_"
    + str(args.loss_scale)
    + "_"
    + str(args.keep_batchnorm_fp32)
    + "_"
    + str(args.fused_adam)
)

file_e = "True_" + base_file
file_p = "False_" + base_file
if args.use_baseline:
    file_b = "baselines/True_" + base_file

dict_e = torch.load(file_e)
dict_p = torch.load(file_p)
if args.use_baseline:
    dict_b = torch.load(file_b)

torch.set_printoptions(precision=10)

print(file_e)
print(file_p)
if args.use_baseline:
    print(file_b)

# ugly duplication here...
if not args.use_baseline:
    for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
        assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)

        loss_e = dict_e["Loss"][n]
        loss_p = dict_p["Loss"][n]
        assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(
            i_e, loss_e, loss_p
        )
        print(
            "{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
                i_e, loss_e, loss_p, dict_e["Speed"][n], dict_p["Speed"][n]
            )
        )
else:
    for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
        assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)

        loss_e = dict_e["Loss"][n]
        loss_p = dict_p["Loss"][n]
        loss_b = dict_b["Loss"][n]
        assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(
            i_e, loss_e, loss_p
        )
        assert loss_e == loss_b, "Iteration {}, loss_e = {}, loss_b = {}".format(
            i_e, loss_e, loss_b
        )
        print(
            "{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
                i_e,
                loss_b,
                loss_e,
                loss_p,
                dict_b["Speed"][n],
                dict_e["Speed"][n],
                dict_p["Speed"][n],
            )
        )


================================================
FILE: tests/L1/common/main_amp.py
================================================
import argparse
import os
import shutil
import time

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

import numpy as np

try:
    from apex.parallel import DistributedDataParallel as DDP
    from apex.fp16_utils import *
    from apex import amp, optimizers
    from apex.multi_tensor_apply import multi_tensor_applier
except ImportError:
    raise ImportError(
        "Please install apex from https://www.github.com/nvidia/apex to run this example."
    )

model_names = sorted(
    name
    for name in models.__dict__
    if name.islower() and not name.startswith("__") and callable(models.__dict__[name])
)

parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
parser.add_argument("data", metavar="DIR", help="path to dataset")
parser.add_argument(
    "--arch",
    "-a",
    metavar="ARCH",
    default="resnet18",
    choices=model_names,
    help="model architecture: " + " | ".join(model_names) + " (default: resnet18)",
)
parser.add_argument(
    "-j",
    "--workers",
    default=4,
    type=int,
    metavar="N",
    help="number of data loading workers (default: 4)",
)
parser.add_argument(
    "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run"
)
parser.add_argument(
    "--start-epoch",
    default=0,
    type=int,
    metavar="N",
    help="manual epoch number (useful on restarts)",
)
parser.add_argument(
    "-b",
    "--batch-size",
    default=256,
    type=int,
    metavar="N",
    help="mini-batch size per process (default: 256)",
)
parser.add_argument(
    "--lr",
    "--learning-rate",
    default=0.1,
    type=float,
    metavar="LR",
    help="Initial learning rate.  Will be scaled by <global batch size>/256: args.lr = args.lr*float(args.batch_size*args.world_size)/256.  A warmup schedule will also be applied over the first 5 epochs.",
)
parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
parser.add_argument(
    "--weight-decay",
    "--wd",
    default=1e-4,
    type=float,
    metavar="W",
    help="weight decay (default: 1e-4)",
)
parser.add_argument(
    "--print-freq",
    "-p",
    default=10,
    type=int,
    metavar="N",
    help="print frequency (default: 10)",
)
parser.add_argument(
    "--resume",
    default="",
    type=str,
    metavar="PATH",
    help="path to latest checkpoint (default: none)",
)
parser.add_argument(
    "-e",
    "--evaluate",
    dest="evaluate",
    action="store_true",
    help="evaluate model on validation set",
)
parser.add_argument(
    "--pretrained", dest="pretrained", action="store_true", help="use pre-trained model"
)

parser.add_argument(
    "--prof",
    dest="prof",
    action="store_true",
    help="Only run 10 iterations for profiling.",
)
parser.add_argument("--deterministic", action="store_true")

parser.add_argument("--local_rank", default=0, type=int)
parser.add_argument("--sync_bn", action="store_true", help="enabling apex sync BN.")

parser.add_argument("--has-ext", action="store_true")
parser.add_argument("--opt-level", type=str)
parser.add_argument("--keep-batchnorm-fp32", type=str, default=None)
parser.add_argument("--loss-scale", type=str, default=None)
parser.add_argument("--fused-adam", action="store_true")

parser.add_argument("--prints-to-process", type=int, default=10)

cudnn.benchmark = True


def fast_collate(batch):
    imgs = [img[0] for img in batch]
    targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
    w = imgs[0].size[0]
    h = imgs[0].size[1]
    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
    for i, img in enumerate(imgs):
        nump_array = np.asarray(img, dtype=np.uint8)
        if nump_array.ndim < 3:
            nump_array = np.expand_dims(nump_array, axis=-1)
        nump_array = np.rollaxis(nump_array, 2)

        tensor[i] += torch.from_numpy(nump_array)

    return tensor, targets


best_prec1 = 0
args = parser.parse_args()

# Let multi_tensor_applier be the canary in the coalmine
# that verifies if the backend is what we think it is
assert multi_tensor_applier.available == args.has_ext

print("opt_level = {}".format(args.opt_level))
print(
    "keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32),
    type(args.keep_batchnorm_fp32),
)
print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))


print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))

if args.deterministic:
    cudnn.benchmark = False
    cudnn.deterministic = True
    torch.manual_seed(args.local_rank)
    torch.set_printoptions(precision=10)


def main():
    global best_prec1, args

    args.distributed = False
    if "WORLD_SIZE" in os.environ:
        args.distributed = int(os.environ["WORLD_SIZE"]) > 1

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        args.world_size = torch.distributed.get_world_size()

    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.sync_bn:
        import apex

        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()

    # Scale learning rate based on global batch size
    args.lr = args.lr * float(args.batch_size * args.world_size) / 256.0
    if args.fused_adam:
        optimizer = optimizers.FusedAdam(model.parameters())
    else:
        optimizer = torch.optim.SGD(
            model.parameters(),
            args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )

    model, optimizer = amp.initialize(
        model,
        optimizer,
        # enabled=False,
        opt_level=args.opt_level,
        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        loss_scale=args.loss_scale,
    )

    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # model = DDP(model)
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu),
                )
                args.start_epoch = checkpoint["epoch"]
                best_prec1 = checkpoint["best_prec1"]
                model.load_state_dict(checkpoint["state_dict"])
                optimizer.load_state_dict(checkpoint["optimizer"])
                print(
                    "=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint["epoch"])
                )
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))

        resume()

    # Data loading code
    traindir = os.path.join(args.data, "train")
    valdir = os.path.join(args.data, "val")

    if args.arch == "inception_v3":
        crop_size = 299
        val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose(
            [
                transforms.RandomResizedCrop(crop_size),
                transforms.RandomHorizontalFlip(),
                # transforms.ToTensor(), Too slow
                # normalize,
            ]
        ),
    )
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose(
            [
                transforms.Resize(val_size),
                transforms.CenterCrop(crop_size),
            ]
        ),
    )

    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler,
        collate_fn=fast_collate,
    )

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
        sampler=val_sampler,
        collate_fn=fast_collate,
    )

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof:
            break
        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_prec1": best_prec1,
                    "optimizer": optimizer.state_dict(),
                },
                is_best,
            )


class data_prefetcher:
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self.mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1, 3, 1, 1)
        self.std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1, 3, 1, 1)
        # With Amp, it isn't necessary to manually convert data to half.
        # if args.fp16:
        #     self.mean = self.mean.half()
        #     self.std = self.std.half()
        self.preload()

    def preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)
            # With Amp, it isn't necessary to manually convert data to half.
            # if args.fp16:
            #     self.next_input = self.next_input.half()
            # else:
            self.next_input = self.next_input.float()
            self.next_input = self.next_input.sub_(self.mean).div_(self.std)

    def next(self):
        torch.cuda.current_stream().wait_stream(self.stream)
        input = self.next_input
        target = self.next_target
        self.preload()
        return input, target


def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    run_info_dict = {"Iteration": [], "Loss": [], "Speed": []}

    prefetcher = data_prefetcher(train_loader)
    input, target = prefetcher.next()
    i = -1
    while input is not None:
        i += 1

        # No learning rate warmup for this test, to expose bitwise inaccuracies more quickly
        # adjust_learning_rate(optimizer, epoch, i, len(train_loader))

        if args.prof:
            if i > 10:
                break
        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

        if args.distributed:
            reduced_loss = reduce_tensor(loss.data)
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
        else:
            reduced_loss = loss.data

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        # for param in model.parameters():
        #     print(param.data.double().sum().item(), param.grad.data.double().sum().item())

        # torch.cuda.synchronize()
        torch.cuda.nvtx.range_push("step")
        optimizer.step()
        torch.cuda.nvtx.range_pop()

        torch.cuda.synchronize()
        # measure elapsed time
        batch_time.update(time.time() - end)

        end = time.time()

        # If you decide to refactor this test, like examples/imagenet, to sample the loss every
        # print_freq iterations, make sure to move this prefetching below the accuracy calculation.
        input, target = prefetcher.next()

        if i % args.print_freq == 0 and i > 1:
            if args.local_rank == 0:
                print(
                    "Epoch: [{0}][{1}/{2}]\t"
                    "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                    "Speed {3:.3f} ({4:.3f})\t"
                    "Data {data_time.val:.3f} ({data_time.avg:.3f})\t"
                    "Loss {loss.val:.10f} ({loss.avg:.4f})\t"
                    "Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                    "Prec@5 {top5.val:.3f} ({top5.avg:.3f})".format(
                        epoch,
                        i,
                        len(train_loader),
                        args.world_size * args.batch_size / batch_time.val,
                        args.world_size * args.batch_size / batch_time.avg,
                        batch_time=batch_time,
                        data_time=data_time,
                        loss=losses,
                        top1=top1,
                        top5=top5,
                    )
                )
            run_info_dict["Iteration"].append(i)
            run_info_dict["Loss"].append(losses.val)
            run_info_dict["Speed"].append(args.world_size * args.batch_size / batch_time.val)
            if len(run_info_dict["Loss"]) == args.prints_to_process:
                if args.local_rank == 0:
                    torch.save(
                        run_info_dict,
                        str(args.has_ext)
                        + "_"
                        + str(args.opt_level)
                        + "_"
                        + str(args.loss_scale)
                        + "_"
                        + str(args.keep_batchnorm_fp32)
                        + "_"
                        + str(args.fused_adam),
                    )
                quit()


def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()

    prefetcher = data_prefetcher(val_loader)
    input, target = prefetcher.next()
    i = -1
    while input is not None:
        i += 1

        # compute output
        with torch.no_grad():
            output = model(input)
            loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

        if args.distributed:
            reduced_loss = reduce_tensor(loss.data)
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
        else:
            reduced_loss = loss.data

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if args.local_rank == 0 and i % args.print_freq == 0:
            print(
                "Test: [{0}/{1}]\t"
                "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                "Speed {2:.3f} ({3:.3f})\t"
                "Loss {loss.val:.4f} ({loss.avg:.4f})\t"
                "Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                "Prec@5 {top5.val:.3f} ({top5.avg:.3f})".format(
                    i,
                    len(val_loader),
                    args.world_size * args.batch_size / batch_time.val,
                    args.world_size * args.batch_size / batch_time.avg,
                    batch_time=batch_time,
                    loss=losses,
                    top1=top1,
                    top5=top5,
                )
            )

        input, target = prefetcher.next()

    print(" * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}".format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, "model_best.pth.tar")


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, step, len_epoch):
    """LR schedule that should yield 76% converged accuracy with batch size 256"""
    factor = epoch // 30

    if epoch >= 80:
        factor = factor + 1

    lr = args.lr * (0.1**factor)

    """Warmup"""
    if epoch < 5:
        lr = lr * float(1 + step + epoch * len_epoch) / (5.0 * len_epoch)

    # if(args.local_rank == 0):
    #     print("epoch = {}, step = {}, lr = {}".format(epoch, step, lr))

    for param_group in optimizer.param_groups:
        param_group["lr"] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= args.world_size
    return rt


if __name__ == "__main__":
    main()


================================================
FILE: tests/L1/common/run_test.sh
================================================
#!/bin/bash

print_banner() {
  printf "\n\n\n\e[30m\e[42m$1\e[0m\n\n\n\n"
}

print_banner "Distributed status:  $1"

echo $2
DATADIR=$2

if [ -n "$3" ]
then
  USE_BASELINE=""
else
  USE_BASELINE="--use_baseline"
fi

if [ "$1" == "single_gpu" ]
then
  BASE_CMD="python main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
fi

if [ "$1" == "distributed" ]
then
  BASE_CMD="python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
fi

ADAM_ARGS="--opt-level O2 --keep-batchnorm-fp32 False --fused-adam"

keep_batchnorms=(
""
"--keep-batchnorm-fp32 True"
"--keep-batchnorm-fp32 False"
)

loss_scales=(
""
"--loss-scale 1.0"
"--loss-scale 128.0"
"--loss-scale dynamic"
)

opt_levels=(
"O0"
"O1"
"O2"
"O3"
)

rm True*
rm False*

set -e

print_banner "Installing Apex with --cuda_ext and --cpp_ext"

pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd

for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR"
      set -x
      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR
      set +x
    done
  done
done

# Handle FusedAdam separately due to limited support.
# FusedAdam will not be tested for bitwise accuracy against the Python implementation.
# The L0 tests already do so.  These tests are here to ensure that it actually runs,
# and get an idea of performance.
for loss_scale in "${loss_scales[@]}"
do
  print_banner "${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR"
  set -x
  ${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR
  set +x
done

print_banner "Reinstalling apex without extensions"

pushd ../../..
pip install -v --no-cache-dir .
popd

for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR"
      set -x
      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR
      set +x
    done
  done
done

print_banner "Checking for bitwise accuracy between Python-only and cpp/cuda extension installs"

for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      echo ""
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        echo "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
      set -x
      python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
      set +x
    done
  done
done

print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"

pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd


================================================
FILE: tests/L1/cross_product/run.sh
================================================
#!/bin/bash

# DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
# DATADIR="/opt/home/apex/examples/imagenet/"
cp ../common/* .
bash run_test.sh single_gpu $1


================================================
FILE: tests/L1/cross_product_distributed/run.sh
================================================
#!/bin/bash

cp ../common/* .
bash run_test.sh distributed $1


================================================
FILE: tests/distributed/DDP/ddp_race_condition_test.py
================================================
import torch
from torch.nn import Parameter
from torch.nn import Module
from apex.parallel import DistributedDataParallel as DDP
import argparse
import os


parser = argparse.ArgumentParser(description="allreduce hook example")
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

args.distributed = False
if "WORLD_SIZE" in os.environ:
    args.distributed = int(os.environ["WORLD_SIZE"]) > 1

if args.distributed:
    args.gpu = args.local_rank % torch.cuda.device_count()
    torch.cuda.set_device(args.gpu)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
    args.world_size = torch.distributed.get_world_size()

torch.set_printoptions(precision=10)
torch.manual_seed(args.local_rank)


class Model(Module):
    def __init__(self):
        super(Model, self).__init__()
        self.a = Parameter(torch.cuda.FloatTensor(4096 * 4096).fill_(1.0))
        self.b = Parameter(torch.cuda.FloatTensor(4096 * 4096).fill_(2.0))

    def forward(self, input):
        return (input * self.a) * self.b


model = Model()
# model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
# model = DDP(model, delay_allreduce=True)
# model = DDP(model, message_size=1, allreduce_trigger_params=[model.b])
model = DDP(model, message_size=1, allreduce_trigger_params=[model.b], num_allreduce_streams=3)

x = torch.cuda.FloatTensor(4096 * 4096)

passed = True
torch.cuda.cudart().cudaProfilerStart()
for i in range(10):
    x.fill_(i + args.local_rank)  # fill x with new values every iteration for sanity
    model.zero_grad()
    out = model(x)
    loss = out.sum()
    # torch.cuda.nvtx.range_push("backward")
    loss.backward()
    # torch.cuda.nvtx.range_pop()

    # torch.cuda.nvtx.range_push("synchronize() + info")
    # torch.cuda.synchronize()
    print("i = {}".format(i))

    def info(name, param, val):
        expected = val * 4096 * 4096 * (2.0 * i + 1) / 2.0
        actual = param.grad.data.sum().item()
        print(
            name
            + ": grad.data_ptr() = {}, expected sum {}, got {}".format(
                param.grad.data_ptr(), expected, actual
            )
        )
        return expected == actual

    if not info("model.a", model.module.a, 2.0):
        passed = False
    if not info("model.b", model.module.b, 1.0):
        passed = False
    # torch.cuda.nvtx.range_pop()
torch.cuda.cudart().cudaProfilerStop()

print("passed = ", passed)


================================================
FILE: tests/distributed/DDP/run_race_test.sh
================================================
#!/bin/bash

CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py


================================================
FILE: tests/distributed/amp_master_params/amp_master_params.py
================================================
import torch
import argparse
import os
from apex import amp

# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if "WORLD_SIZE" in os.environ:
    args.distributed = int(os.environ["WORLD_SIZE"]) > 1

if args.distributed:
    # FOR DISTRIBUTED:  Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

    torch.manual_seed(torch.distributed.get_rank())

torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16

# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device="cuda")
y = torch.randn(N, D_out, device="cuda")

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

if args.distributed:
    # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)

torch.save(list(model.parameters()), "rank{}model.pth".format(torch.distributed.get_rank()))
torch.save(
    list(amp.master_params(optimizer)),
    "rank{}master.pth".format(torch.distributed.get_rank()),
)


================================================
FILE: tests/distributed/amp_master_params/compare.py
================================================
import torch

model_params_rank0 = torch.load("rank0model.pth", map_location=lambda storage, loc: storage.cuda(0))
model_params_rank1 = torch.load("rank1model.pth", map_location=lambda storage, loc: storage.cuda(0))
master_params_rank0 = torch.load(
    "rank0master.pth", map_location=lambda storage, loc: storage.cuda(0)
)
master_params_rank1 = torch.load(
    "rank1master.pth", map_location=lambda storage, loc: storage.cuda(0)
)

for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
    model_params_rank0, model_params_rank1, master_params_rank0, master_params_rank1
):
    assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
    assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
    # Some debugging/investigation assistance code:
    # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0)
    # offending_val_half = model_rank0.view(-1)[maxind.item()]
    # offending_val_float = master_rank0.view(-1)[maxind.item()]
    # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
    #       offending_val_float.half().item())
    # rtol needs to be > 2^-11 because of denormals...
    assert torch.allclose(model_rank0, master_rank0.half(), rtol=0.005), "Model-master mismatch"

print("OK:  Model and master params match across ranks.")


================================================
FILE: tests/distributed/amp_master_params/run.sh
================================================
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py

python compare.py


================================================
FILE: tests/distributed/synced_batchnorm/python_single_gpu_unit_test.py
================================================
import torch
import numpy as np


def compare(desc, inp1, inp2, error):
    a = inp1.clone().detach().cpu().numpy()
    b = inp2.clone().detach().cpu().numpy()
    close = np.allclose(a, b, error, error)
    if not close:
        print(desc, close)
        z = a - b
        index = (np.abs(z) >= error + error * np.abs(b)).nonzero()
        print("dif    : ", z[index])
        print("inp1   : ", a[index])
        print("inp2   : ", b[index])
    return close


feature_size = 10
space_size = 16
batch_size = 5


error = 1e-5

np.random.seed(1)
dtype = np.float32
inp = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype)
grad = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype)
weight = (np.random.randn(feature_size)).astype(dtype)
bias = (np.random.randn(feature_size)).astype(dtype)

type_tensor = torch.cuda.FloatTensor
ref_tensor = torch.cuda.DoubleTensor

inp_t = type_tensor(inp)
weight_t = type_tensor(weight)
bias_t = type_tensor(bias)

inp_r = ref_tensor(inp.transpose(1, 0, 2, 3).reshape(feature_size, -1))
inp2_r = ref_tensor(inp)
weight_r = ref_tensor(weight).view(-1, 1, 1)
bias_r = ref_tensor(bias).view(-1, 1, 1)

grad_output_t = type_tensor(grad)

m = inp_r.mean(1)
b_v = inp_r.var(1, unbiased=False)
unb_v = inp_r.var(1, unbiased=True)

eps = 1e-5

bn = torch.nn.BatchNorm2d(feature_size).cuda()
bn.momentum = 1.0
bn.weight.data = weight_t.clone()
bn.bias.data = bias_t.clone()
inp_bn = inp_t.clone().requires_grad_()
grad_bn = grad_output_t.clone().detach()
out_bn = bn(inp_bn)
out_bn.backward(grad_bn)

from apex.parallel.sync_batchnorm import SyncBatchNorm

sbn = SyncBatchNorm(feature_size).cuda()
sbn.momentum = 1.0
sbn.weight.data = weight_t.clone()
sbn.bias.data = bias_t.clone()
inp_sbn = inp_t.clone().requires_grad_()
grad_sbn = grad_output_t.clone().detach()
out_sbn = sbn(inp_sbn)
out_sbn.backward(grad_sbn)

sbn_result = True
sbn_result_c_last = True
bn_result = True

out_r = weight_r * (inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1, 1, 1) + eps) + bias_r

compare("comparing bn output: ", out_bn, out_r, error)

grad_output_t = type_tensor(grad)

grad_output_r = ref_tensor(grad.transpose(1, 0, 2, 3).reshape(feature_size, -1))
grad_output2_r = ref_tensor(grad)

grad_bias_r = grad_output_r.sum(1)
grad_weight_r = (
    ((inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1, 1, 1) + eps) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .sum(1)
)

mean_dy_r = grad_output_r.mean(1)
mean_dy_xmu_r = (
    ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .mean(1)
)

grad_input_r = (
    (
        grad_output2_r
        - mean_dy_r.view(-1, 1, 1)
        - (inp2_r - m.view(-1, 1, 1)) / (b_v.view(-1, 1, 1) + eps) * mean_dy_xmu_r.view(-1, 1, 1)
    )
    * torch.rsqrt(b_v.view(-1, 1, 1) + eps)
    * weight_r.view(-1, 1, 1)
)

compare("comparing bn input grad: ", inp_bn.grad, grad_input_r, error)
sbn_result = compare("comparing sbn input grad: ", inp_sbn.grad, grad_input_r, error) and sbn_result

compare("comparing bn/sbn output: ", out_bn, out_sbn, error)
sbn_result = (
    compare("comparing running_mean: ", bn.running_mean.data, sbn.running_mean.data, error)
    and sbn_result
)
sbn_result = (
    compare("comparing running_variance: ", bn.running_var.data, sbn.running_var.data, error)
    and sbn_result
)
compare("comparing grad_input: ", inp_bn.grad, inp_sbn.grad, error)
compare("comparing grad_bias: ", bn.bias.grad, sbn.bias.grad, error)
compare("comparing grad_bias bn to ref: ", bn.bias.grad, grad_bias_r, error)
sbn_result = (
    compare("comparing grad_bias sbn to ref: ", sbn.bias.grad, grad_bias_r, error) and sbn_result
)
compare("comparing grad_weight: ", bn.weight.grad, sbn.weight.grad, error)
compare("comparing grad_weight bn to ref: ", bn.weight.grad, grad_weight_r, error)
sbn_result = (
    compare("comparing grad_weight sbn to ref: ", sbn.weight.grad, grad_weight_r, error)
    and sbn_result
)

if sbn_result:
    print("====SBN single gpu passed tests")
else:
    print("*SBN single gpu failed*")


================================================
FILE: tests/distributed/synced_batchnorm/single_gpu_unit_test.py
================================================
import torch
import numpy as np
import apex

if True:
    print("using setup tools")
    import syncbn
else:
    print("using jit")
    from torch.utils.cpp_extension import load

    syncbn = load(name="syncbn", sources=["../../csrc/syncbn.cpp", "../../csrc/welford.cu"])


def compare(desc, inp1, inp2, error):
    a = inp1.clone().detach().cpu().numpy()
    b = inp2.clone().detach().cpu().numpy()
    close = np.allclose(a, b, error, error)
    if not close:
        print(desc, close)
        z = a - b
        index = (np.abs(z) >= error + error * np.abs(b)).nonzero()
        print("dif    : ", z[index])
        print("inp1   : ", a[index])
        print("inp2   : ", b[index])
    return close


feature_size = 10
space_size = 16
batch_size = 5


error = 1e-5

np.random.seed(1)
dtype = np.float32
inp = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype)
grad = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype)
weight = (np.random.randn(feature_size)).astype(dtype)
bias = (np.random.randn(feature_size)).astype(dtype)
count = torch.cuda.IntTensor([batch_size * space_size**2])

type_tensor = torch.cuda.FloatTensor
ref_tensor = torch.cuda.DoubleTensor

inp_t = type_tensor(inp)
weight_t = type_tensor(weight)
bias_t = type_tensor(bias)

inp_r = ref_tensor(inp.transpose(1, 0, 2, 3).reshape(feature_size, -1))
inp2_r = ref_tensor(inp)
weight_r = ref_tensor(weight).view(-1, 1, 1)
bias_r = ref_tensor(bias).view(-1, 1, 1)

grad_output_t = type_tensor(grad)

m = inp_r.mean(1)
b_v = inp_r.var(1, unbiased=False)
unb_v = inp_r.var(1, unbiased=True)

eps = 1e-5

# mean, var, var_biased = syncbn.welford_mean_var(inp_t)
mean, var_biased = syncbn.welford_mean_var(inp_t)
inv_std = 1.0 / torch.sqrt(var_biased + eps)

bn = torch.nn.BatchNorm2d(feature_size).cuda()
bn.momentum = 1.0
bn.weight.data = weight_t.clone()
bn.bias.data = bias_t.clone()
inp_bn = inp_t.clone().requires_grad_()
grad_bn = grad_output_t.clone().detach()
out_bn = bn(inp_bn)
out_bn.backward(grad_bn)

sbn = apex.parallel.SyncBatchNorm(feature_size).cuda()
sbn.momentum = 1.0
sbn.weight.data = weight_t.clone()
sbn.bias.data = bias_t.clone()
inp_sbn = inp_t.clone().requires_grad_()
grad_sbn = grad_output_t.clone().detach()
out_sbn = sbn(inp_sbn)
out_sbn.backward(grad_sbn)

sbn_c_last = apex.parallel.SyncBatchNorm(feature_size, channel_last=True).cuda()
sbn_c_last.momentum = 1.0
sbn_c_last.weight.data = weight_t.clone()
sbn_c_last.bias.data = bias_t.clone()
inp_sbn_c_last = inp_t.clone().transpose(-1, 1).contiguous().requires_grad_()
grad_sbn_c_last = grad_output_t.clone().transpose(-1, 1).contiguous().detach()
out_sbn_c_last = sbn_c_last(inp_sbn_c_last)
out_sbn_c_last.backward(grad_sbn_c_last)

sbn_result = True
sbn_result_c_last = True
bn_result = True

sbn_result = compare("comparing mean: ", mean, m, error) and sbn_result
# sbn_result = compare("comparing variance: ", var, unb_v, error) and sbn_result
sbn_result = compare("comparing biased variance: ", var_biased, b_v, error) and sbn_result


out = syncbn.batchnorm_forward(inp_t, mean, inv_std, weight_t, bias_t)
out_r = weight_r * (inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1, 1, 1) + eps) + bias_r

sbn_result = compare("comparing output: ", out, out_r, error) and sbn_result
compare("comparing bn output: ", out_bn, out_r, error)

grad_output_t = type_tensor(grad)

grad_output_r = ref_tensor(grad.transpose(1, 0, 2, 3).reshape(feature_size, -1))
grad_output2_r = ref_tensor(grad)

grad_bias_r = grad_output_r.sum(1)
grad_weight_r = (
    ((inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1, 1, 1) + eps) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .sum(1)
)

sum_dy_r = grad_output_r.sum(1)
mean_dy_r = grad_output_r.mean(1)
sum_dy_xmu_r = (
    ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .sum(1)
)
mean_dy_xmu_r = (
    ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .mean(1)
)

grad_input_r = (
    (
        grad_output2_r
        - mean_dy_r.view(-1, 1, 1)
        - (inp2_r - m.view(-1, 1, 1)) / (b_v.view(-1, 1, 1) + eps) * mean_dy_xmu_r.view(-1, 1, 1)
    )
    * torch.rsqrt(b_v.view(-1, 1, 1) + eps)
    * weight_r.view(-1, 1, 1)
)

sum_dy, sum_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(
    grad_output_t, inp_t, mean, inv_std, weight_t
)
grad_input = syncbn.batchnorm_backward(
    grad_output_t, inp_t, mean, inv_std, weight_t, sum_dy, sum_dy_xmu, count
)
sbn_result = compare("comparing bias grad: ", grad_bias, grad_bias_r, error) and sbn_result
sbn_result = compare("comparing weight grad: ", grad_weight, grad_weight_r, error) and sbn_result
sbn_result = compare("comparing sum_dy grad: ", sum_dy, sum_dy_r, error) and sbn_result
sbn_result = compare("comparing sum_dy_xmu grad: ", sum_dy_xmu, sum_dy_xmu_r, error) and sbn_result
sbn_result = compare("comparing input grad: ", grad_input, grad_input_r, error) and sbn_result
compare("comparing bn input grad: ", inp_bn.grad, grad_input_r, error)
sbn_result = compare("comparing sbn input grad: ", inp_sbn.grad, grad_input_r, error) and sbn_result

compare("comparing bn/sbn output: ", out_bn, out_sbn, error)
sbn_result = (
    compare("comparing running_mean: ", bn.running_mean.data, sbn.running_mean.data, error)
    and sbn_result
)
sbn_result = (
    compare("comparing running_variance: ", bn.running_var.data, sbn.running_var.data, error)
    and sbn_result
)
compare("comparing grad_input: ", inp_bn.grad, inp_sbn.grad, error)
compare("comparing grad_bias: ", bn.bias.grad, sbn.bias.grad, error)
compare("comparing grad_bias bn to ref: ", bn.bias.grad, grad_bias_r, error)
sbn_result = (
    compare("comparing grad_bias sbn to ref: ", sbn.bias.grad, grad_bias_r, error) and sbn_result
)
compare("comparing grad_weight: ", bn.weight.grad, sbn.weight.grad, error)
compare("comparing grad_weight bn to ref: ", bn.weight.grad, grad_weight_r, error)
sbn_result = (
    compare("comparing grad_weight sbn to ref: ", sbn.weight.grad, grad_weight_r, error)
    and sbn_result
)

compare(
    "comparing channel last bn/sbn output: ",
    out_bn,
    out_sbn_c_last.transpose(-1, 1).contiguous(),
    error,
)
sbn_result_c_last = (
    compare(
        "comparing channel last running_mean: ",
        bn.running_mean.data,
        sbn_c_last.running_mean.data,
        error,
    )
    and sbn_result_c_last
)
sbn_result_c_last = (
    compare(
        "comparing channel last running_variance: ",
        bn.running_var.data,
        sbn_c_last.running_var.data,
        error,
    )
    and sbn_result_c_last
)
compare(
    "comparing channel last grad_input: ",
    inp_bn.grad,
    inp_sbn_c_last.grad.transpose(-1, 1).contiguous(),
    error,
)
compare("comparing channel last grad_bias: ", bn.bias.grad, sbn_c_last.bias.grad, error)
sbn_result_c_last = (
    compare(
        "comparing channel last grad_bias sbn to ref: ",
        sbn_c_last.bias.grad,
        grad_bias_r,
        error,
    )
    and sbn_result_c_last
)
compare(
    "comparing channel last grad_weight: ",
    bn.weight.grad,
    sbn_c_last.weight.grad,
    error,
)
sbn_result_c_last = (
    compare(
        "comparing channel last grad_weight sbn to ref: ",
        sbn_c_last.weight.grad,
        grad_weight_r,
        error,
    )
    and sbn_result_c_last
)

if sbn_result:
    print("====SBN single gpu passed tests")
else:
    print("*SBN single gpu failed*")

if sbn_result_c_last:
    print("====SBN channel last single gpu passed tests")
else:
    print("*SBN channel last single gpu failed*")


================================================
FILE: tests/distributed/synced_batchnorm/test_batchnorm1d.py
================================================
import torch
import apex

model = apex.parallel.SyncBatchNorm(4).cuda()
model.weight.data.uniform_()
model.bias.data.uniform_()
data = torch.rand((8, 4)).cuda()

model_ref = torch.nn.BatchNorm1d(4).cuda()
model_ref.load_state_dict(model.state_dict())
data_ref = data.clone()

output = model(data)
output_ref = model_ref(data_ref)

assert output.allclose(output_ref)
assert model.running_mean.allclose(model_ref.running_mean)
assert model.running_var.allclose(model_ref.running_var)


================================================
FILE: tests/distributed/synced_batchnorm/test_groups.py
================================================
import torch
import numpy as np
import apex
import syncbn
import os
import argparse
import torch.optim as optim


def compare(desc, inp1, inp2, error):
    a = inp1.clone().detach().cpu().numpy()
    b = inp2.clone().detach().cpu().numpy()
    close = np.allclose(a, b, error, error)
    if not close:
        print(desc, close)
        z = a - b
        index = (np.abs(z) >= error + error * np.abs(b)).nonzero()
        print("dif    : ", z[index])
        print("inp1   : ", a[index])
        print("inp2   : ", b[index])
    return close


feature_size = 10
space_size = 40
batch_size = 32


from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
parser.add_argument("--fp16", action="store_true", default=False)
parser.add_argument("--fp64", action="store_true", default=False)
parser.add_argument("--group_size", default=0, type=int)
args = parser.parse_args()

try:
    args.world_size = int(os.environ["WORLD_SIZE"])
except:
    print(
        "This is a multi-gpu test. To run it please use 'python -m torch.distributed.launch --nproc_per_node=<num gpus> test_groups.py <more options>'"
    )
    exit(1)

torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend="nccl", init_method="env://")

start = (args.local_rank % args.group_size) * batch_size // args.group_size
finish = (args.local_rank % args.group_size + 1) * batch_size // args.group_size

error = 1e-5
dtype = np.float32
if args.fp16:
    error = 1e-3
    dtype = np.float16
elif args.fp64:
    error = 1e-8
    dtype = np.float64


np.random.seed(18 + args.local_rank // args.group_size)

inp = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
grad = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
weight = np.random.randn(feature_size).astype(dtype)
bias = np.random.randn(feature_size).astype(dtype)


type_tensor = torch.cuda.FloatTensor
if args.fp16:
    type_tensor = torch.cuda.HalfTensor
if args.fp64:
    type_tensor = torch.cuda.DoubleTensor

ref_tensor = torch.cuda.DoubleTensor

inp_t = type_tensor(inp)
weight_t = type_tensor(weight)
bias_t = type_tensor(bias)

inp_r = ref_tensor(inp.transpose(1, 0, 2, 3).reshape(feature_size, -1))
inp2_r = ref_tensor(inp)
weight_r = ref_tensor(weight).view(-1, 1, 1)
bias_r = ref_tensor(bias).view(-1, 1, 1)

grad_output_t = type_tensor(grad)

m = inp_r.mean(1)
b_v = inp_r.var(1, unbiased=False)
unb_v = inp_r.var(1, unbiased=True)

eps = 1e-5

mean, var_biased = syncbn.welford_mean_var(inp_t)
inv_std = 1.0 / torch.sqrt(var_biased + eps)

bn = torch.nn.BatchNorm2d(feature_size).cuda()
bn.momentum = 1.0
bn.weight.data = weight_t.clone()
bn.bias.data = bias_t.clone()
if args.fp16:
    bn.half()
if args.fp64:
    bn.double()
bn = DDP(bn)
inp_bn = inp_t.clone().requires_grad_()
grad_bn = grad_output_t.clone().detach()
out_bn = bn(inp_bn)
out_bn.backward(grad_bn)
# compensating the averaging over processes done by DDP
# in order to produce mathematically equivalent result
# https://github.com/NVIDIA/apex/issues/134#issuecomment-458307368
for param in bn.parameters():
    param.grad = param.grad / args.group_size
bn_opt = optim.SGD(bn.parameters(), lr=1.0)

sbn = apex.parallel.SyncBatchNorm(
    feature_size,
    process_group=apex.parallel.create_syncbn_process_group(args.group_size),
).cuda()
sbn.momentum = 1.0
sbn.weight.data = weight_t.clone()
sbn.bias.data = bias_t.clone()
if args.fp16:
    sbn.half()
if args.fp64:
    sbn.double()
sbn = DDP(sbn)
sbn_opt = optim.SGD(sbn.parameters(), lr=1.0)
inp_sbn = inp_t.clone().requires_grad_()
grad_sbn = grad_output_t.clone().detach()
out_sbn = sbn(inp_sbn[start:finish])
out_sbn.backward(grad_sbn[start:finish])

sbn_result = True
bn_result = True

if args.local_rank == 0:
    sbn_result = compare("comparing mean: ", mean, m, error) and sbn_result
    sbn_result = compare("comparing biased variance: ", var_biased, b_v, error) and sbn_result

out = syncbn.batchnorm_forward(inp_t, mean, inv_std, weight_t, bias_t)
out_r = weight_r * (inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1, 1, 1) + eps) + bias_r

if args.local_rank == 0:
    sbn_result = compare("comparing output: ", out, out_r, error) and sbn_result
    compare("comparing bn output: ", out_bn, out_r, error)

grad_output_t = type_tensor(grad)

grad_output_r = ref_tensor(grad.transpose(1, 0, 2, 3).reshape(feature_size, -1))
grad_output2_r = ref_tensor(grad)

grad_bias_r = grad_output_r.sum(1)
grad_weight_r = (
    ((inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1, 1, 1) + eps) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .sum(1)
)

mean_dy_r = grad_output_r.mean(1)
mean_dy_xmu_r = (
    ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .mean(1)
)

grad_input_r = (
    (
        grad_output2_r
        - mean_dy_r.view(-1, 1, 1)
        - (inp2_r - m.view(-1, 1, 1)) / (b_v.view(-1, 1, 1) + eps) * mean_dy_xmu_r.view(-1, 1, 1)
    )
    * torch.rsqrt(b_v.view(-1, 1, 1) + eps)
    * weight_r.view(-1, 1, 1)
)

mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(
    grad_output_t, inp_t, mean, inv_std, weight_t
)
grad_input = syncbn.batchnorm_backward(
    grad_output_t, inp_t, mean, inv_std, weight_t, mean_dy, mean_dy_xmu
)

if args.local_rank == 0:
    sbn_result = compare("comparing bias grad: ", grad_bias, grad_bias_r, error) and sbn_result
    sbn_result = (
        compare("comparing weight grad: ", grad_weight, grad_weight_r, error) and sbn_result
    )
    sbn_result = compare("comparing mean_dy grad: ", mean_dy, mean_dy_r, error) and sbn_result
    sbn_result = (
        compare("comparing mean_dy_xmu grad: ", mean_dy_xmu, mean_dy_xmu_r, error) and sbn_result
    )
    sbn_result = compare("comparing input grad: ", grad_input, grad_input_r, error) and sbn_result
    compare("comparing bn input grad: ", inp_bn.grad, grad_input_r, error)

if args.local_rank == 0:
    sbn_result = (
        compare(
            "comparing running_mean: ",
            bn.module.running_mean.data,
            sbn.module.running_mean.data,
            error,
        )
        and sbn_result
    )
    sbn_result = (
        compare(
            "comparing running_variance: ",
            bn.module.running_var.data,
            sbn.module.running_var.data,
            error,
        )
        and sbn_result
    )

# execute by both
compare("comparing layers output: ", out_bn[start:finish], out_sbn, error) and sbn_result
compare(
    "comparing layers grad_input: ",
    inp_bn.grad[start:finish],
    inp_sbn.grad[start:finish],
    error,
) and sbn_result

bn_opt.step()
sbn_opt.step()

if args.local_rank == 0:
    compare("comparing bn vs sbn bias: ", bn.module.bias, sbn.module.bias, error)
    compare("comparing bn vs sbn weight: ", bn.module.weight, sbn.module.weight, error)


if sbn_result:
    print("====SBN group test passed")
else:
    print("*SBN group test failed*")


================================================
FILE: tests/distributed/synced_batchnorm/two_gpu_test_different_batch_size.py
================================================
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from apex.parallel import SyncBatchNorm as ApexSyncBatchNorm

import argparse
import os
import numpy as np

var_batch = 16


def compare(desc, inp1, inp2, error=1e-5):
    a = inp1.clone().detach().cpu().numpy()
    b = inp2.clone().detach().cpu().numpy()
    close = np.allclose(a, b, error, error)
    if not close:
        print(desc, close)
        z = a - b
        index = (np.abs(z) >= error + error * np.abs(b)).nonzero()
        print("dif    : ", z[index])
        print("inp1   : ", a[index])
        print("inp2   : ", b[index])
    return close


parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument("--apex", action="store_true")
args = parser.parse_args()


torch.manual_seed(2809)
# Setup DDP
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda:{}".format(args.local_rank))

torch.distributed.init_process_group(
    "nccl",
    init_method="env://",
    rank=args.local_rank,
)

# Setup model
if args.apex:
    model = nn.Sequential(nn.Conv2d(3, 6, 3, 1, 1), ApexSyncBatchNorm(6))
else:
    model = nn.Sequential(nn.Conv2d(3, 6, 3, 1, 1), nn.SyncBatchNorm(6))

# Setup reference model
model_reference = nn.Sequential(nn.Conv2d(3, 6, 3, 1, 1), nn.BatchNorm2d(6))

with torch.no_grad():
    model_reference[0].weight.copy_(model[0].weight)
    model_reference[0].bias.copy_(model[0].bias)
model_reference.to(device)

model = model.to(device)
model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank)

global_batch_size = var_batch + 8
# Create random data
if args.local_rank == 0:
    data = torch.randn(var_batch, 3, 8, 8, device=device, dtype=torch.float) * 50.0
    grad = torch.randint(0, 10, (var_batch, 6, 8, 8), device=device, dtype=torch.float) / 10.0
else:
    data = torch.randn(8, 3, 8, 8, device=device)
    grad = torch.randint(0, 10, (8, 6, 8, 8), device=device, dtype=torch.float) / 10.0

data.requires_grad_()
data.retain_grad = True

weighted_gradient = True

# DDP forward/backward
output = model(data)

if weighted_gradient:
    output.backward(grad * 2 / global_batch_size)
else:
    output.backward(grad / output.size(0))

d_list = [torch.randn(8, 3, 8, 8, device=device) for i in range(int(os.environ["WORLD_SIZE"]))]
y_list = [torch.randn(8, 6, 8, 8, device=device) for i in range(int(os.environ["WORLD_SIZE"]))]
dgrad_list = [torch.randn(8, 3, 8, 8, device=device) for i in range(int(os.environ["WORLD_SIZE"]))]
grad_list = [torch.randn(8, 6, 8, 8, device=device) for i in range(int(os.environ["WORLD_SIZE"]))]
if args.local_rank == 0:
    # placeholder, these random data will later be discarded.
    torch.distributed.all_gather(d_list, torch.randn(8, 3, 8, 8, device=device))
    torch.distributed.all_gather(y_list, torch.randn(8, 6, 8, 8, device=device))
    torch.distributed.all_gather(dgrad_list, torch.randn(8, 3, 8, 8, device=device))
    torch.distributed.all_gather(grad_list, torch.randn(8, 6, 8, 8, device=device))
else:
    torch.distributed.all_gather(d_list, data)
    torch.distributed.all_gather(y_list, output)
    torch.distributed.all_gather(dgrad_list, data.grad)
    torch.distributed.all_gather(grad_list, grad)

torch.distributed.barrier()

if args.local_rank == 0:
    ref_tensor = d_list[1:]
    ref_tensor.insert(0, data)
    assert ref_tensor[0].equal(data)
    ref_tensor = torch.cat(ref_tensor, 0)
    ref_tensor = ref_tensor.detach()
    ref_tensor.requires_grad_()
    ref_tensor.retain_grad()

    # Reference forward/backward
    output_reference = model_reference(ref_tensor)
    grad_tensor = grad_list[1:]
    grad_tensor.insert(0, grad)
    assert grad_tensor[0].equal(grad)
    grad_tensor = torch.cat(grad_tensor, 0)
    if weighted_gradient:
        output_reference.backward(grad_tensor / output_reference.size(0))
    else:
        output_reference.backward(grad_tensor / output_reference.size(0))

    dgrad_tensor = dgrad_list[1:]
    dgrad_tensor.insert(0, data.grad)
    dgrad_tensor = torch.cat(dgrad_tensor, 0)
    # check output
    output_tensor = y_list[1:]
    output_tensor.insert(0, output)
    output_tensor = torch.cat(output_tensor, 0)
    passed = True
    passed = passed and compare("check output", output_tensor, output_reference)
    # check stats
    passed = passed and compare(
        "check running mean failed",
        model_reference[1].running_mean,
        model.module[1].running_mean,
    )
    passed = passed and compare(
        "check running var failed",
        model_reference[1].running_var,
        model.module[1].running_var,
    )
    passed = passed and compare(
        "bn wgrad check failed!",
        model_reference[1].weight.grad,
        model.module[1].weight.grad,
        1e-6,
    )
    passed = passed and compare(
        "conv wgrad check failed!",
        model_reference[0].weight.grad,
        model.module[0].weight.grad,
    )
    # can't really compare dgrad directly, as we need to scale it to account for
    # DDP
    # passed = passed and compare("dgrad check failed!", ref_tensor.grad, dgrad_tensor)
    if passed:
        print("====SBN two gpu with different batches test passed")
    else:
        assert "*failed two gpu with different batches tests*"


================================================
FILE: tests/distributed/synced_batchnorm/two_gpu_unit_test.py
================================================
import torch
import numpy as np
import apex
import syncbn
import os
import argparse
import torch.optim as optim


def compare(desc, inp1, inp2, error):
    a = inp1.clone().detach().cpu().numpy()
    b = inp2.clone().detach().cpu().numpy()
    close = np.allclose(a, b, error, error)
    if not close:
        print(desc, close)
        z = a - b
        index = (np.abs(z) >= error + error * np.abs(b)).nonzero()
        print("dif    : ", z[index])
        print("inp1   : ", a[index])
        print("inp2   : ", b[index])
    return close


feature_size = 10
space_size = 40
batch_size = 32


from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
parser.add_argument("--fp16", action="store_true", default=False)
parser.add_argument("--fp64", action="store_true", default=False)
args = parser.parse_args()
args.world_size = int(os.environ["WORLD_SIZE"])
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend="nccl", init_method="env://")
start = args.local_rank * batch_size // args.world_size
finish = (args.local_rank + 1) * batch_size // args.world_size

error = 1e-5
dtype = np.float32
if args.fp16:
    error = 1e-3
    dtype = np.float16
elif args.fp64:
    error = 1e-8
    dtype = np.float64

np.random.seed(18)
inp = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
grad = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
weight = np.random.randn(feature_size).astype(dtype)
bias = np.random.randn(feature_size).astype(dtype)


type_tensor = torch.cuda.FloatTensor
if args.fp16:
    type_tensor = torch.cuda.HalfTensor
if args.fp64:
    type_tensor = torch.cuda.DoubleTensor

ref_tensor = torch.cuda.DoubleTensor

inp_t = type_tensor(inp)
weight_t = type_tensor(weight)
bias_t = type_tensor(bias)

inp_r = ref_tensor(inp.transpose(1, 0, 2, 3).reshape(feature_size, -1))
inp2_r = ref_tensor(inp)
weight_r = ref_tensor(weight).view(-1, 1, 1)
bias_r = ref_tensor(bias).view(-1, 1, 1)

grad_output_t = type_tensor(grad)

m = inp_r.mean(1)
b_v = inp_r.var(1, unbiased=False)
unb_v = inp_r.var(1, unbiased=True)

eps = 1e-5

mean, var_biased = syncbn.welford_mean_var(inp_t)
inv_std = 1.0 / torch.sqrt(var_biased + eps)

bn = torch.nn.BatchNorm2d(feature_size).cuda()
bn.momentum = 1.0
bn.weight.data = weight_t.clone()
bn.bias.data = bias_t.clone()
if args.fp16:
    bn.half()
if args.fp64:
    bn.double()
inp_bn = inp_t.clone().requires_grad_()
grad_bn = grad_output_t.clone().detach()
out_bn = bn(inp_bn)
out_bn.backward(grad_bn)
# compensating the averaging over processes done by DDP
# in order to produce mathematically equivalent result
# https://github.com/NVIDIA/apex/issues/134#issuecomment-458307368
for param in bn.parameters():
    param.grad = param.grad / args.world_size
bn_opt = optim.SGD(bn.parameters(), lr=1.0)

sbn = apex.parallel.SyncBatchNorm(feature_size).cuda()
sbn.momentum = 1.0
sbn.weight.data = weight_t.clone()
sbn.bias.data = bias_t.clone()
if args.fp16:
    sbn.half()
if args.fp64:
    sbn.double()
sbn = DDP(sbn)
sbn_opt = optim.SGD(sbn.parameters(), lr=1.0)
inp_sbn = inp_t.clone().requires_grad_()
grad_sbn = grad_output_t.clone().detach()
out_sbn = sbn(inp_sbn[start:finish])
out_sbn.backward(grad_sbn[start:finish])

count = [
    space_size**2 * ((i + 1) * batch_size // args.world_size - i * batch_size // args.world_size)
    for i in range(0, args.world_size)
]
count = torch.cuda.IntTensor(count)

print("--- count : ", count)

sbn_result = True
bn_result = True

if args.local_rank == 0:
    sbn_result = compare("comparing mean: ", mean, m, error) and sbn_result
    sbn_result = compare("comparing biased variance: ", var_biased, b_v, error) and sbn_result

out = syncbn.batchnorm_forward(inp_t, mean, inv_std, weight_t, bias_t)
out_r = weight_r * (inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1, 1, 1) + eps) + bias_r

if args.local_rank == 0:
    sbn_result = compare("comparing output: ", out, out_r, error) and sbn_result
    compare("comparing bn output: ", out_bn, out_r, error)

grad_output_t = type_tensor(grad)

grad_output_r = ref_tensor(grad.transpose(1, 0, 2, 3).reshape(feature_size, -1))
grad_output2_r = ref_tensor(grad)

grad_bias_r = grad_output_r.sum(1)
grad_weight_r = (
    ((inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1, 1, 1) + eps) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .sum(1)
)

sum_dy_r = grad_output_r.sum(1)
mean_dy_r = grad_output_r.mean(1)
mean_dy_xmu_r = (
    ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .mean(1)
)
sum_dy_xmu_r = (
    ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r)
    .transpose(1, 0)
    .contiguous()
    .view(feature_size, -1)
    .sum(1)
)

grad_input_r = (
    (
        grad_output2_r
        - mean_dy_r.view(-1, 1, 1)
        - (inp2_r - m.view(-1, 1, 1)) / (b_v.view(-1, 1, 1) + eps) * mean_dy_xmu_r.view(-1, 1, 1)
    )
    * torch.rsqrt(b_v.view(-1, 1, 1) + eps)
    * weight_r.view(-1, 1, 1)
)

sum_dy, sum_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(
    grad_output_t, inp_t, mean, inv_std, weight_t
)
grad_input = syncbn.batchnorm_backward(
    grad_output_t, inp_t, mean, inv_std, weight_t, sum_dy, sum_dy_xmu, count
)
if args.local_rank == 0:
    sbn_result = compare("comparing bias grad: ", grad_bias, grad_bias_r, error) and sbn_result
    sbn_result = (
        compare("comparing weight grad: ", grad_weight, grad_weight_r, error) and sbn_result
    )
    sbn_result = compare("comparing sum_dy grad: ", sum_dy, sum_dy_r, error) and sbn_result
    sbn_result = (
        compare("comparing sum_dy_xmu grad: ", sum_dy_xmu, sum_dy_xmu_r, error) and sbn_result
    )
    sbn_result = compare("comparing input grad: ", grad_input, grad_input_r, error) and sbn_result
    compare("comparing bn input grad: ", inp_bn.grad, grad_input_r, error)

if args.local_rank == 0:
    sbn_result = (
        compare(
            "comparing running_mean: ",
            bn.running_mean.data,
            sbn.module.running_mean.data,
            error,
        )
        and sbn_result
    )
    sbn_result = (
        compare(
            "comparing running_variance: ",
            bn.running_var.data,
            sbn.module.running_var.data,
            error,
        )
        and sbn_result
    )

# execute by both
compare("comparing layers output: ", out_bn[start:finish], out_sbn, error) and sbn_result
compare(
    "comparing layers grad_input: ",
    inp_bn.grad[start:finish],
    inp_sbn.grad[start:finish],
    error,
) and sbn_result

bn_opt.step()
sbn_opt.step()

if args.local_rank == 0:
    compare("comparing bn vs sbn bias: ", bn.bias, sbn.module.bias, error)
    compare("comparing bn vs sbn weight: ", bn.weight, sbn.module.weight, error)


if sbn_result:
    print("====SBN two gpu passed tests")
else:
    print("*SBN two gpu failed*")


================================================
FILE: tests/distributed/synced_batchnorm/unit_test.sh
================================================
python python_single_gpu_unit_test.py
python single_gpu_unit_test.py
python test_batchnorm1d.py
python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py
python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp16
python -m torch.distributed.launch --nproc_per_node=2 two_gpu_test_different_batch_size.py --apex
#beware, you need a system with at least 4 gpus to test group_size<world_size
#python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2


================================================
FILE: tests/docker_extension_builds/run.sh
================================================
#!/bin/bash

print_banner() {
  printf "\n\n\n\e[30m\e[42m$1\e[0m\n\n\n\n"
}

print_green() {
  printf "\e[30m\e[42m$1\e[0m\n"
}

print_red() {
  printf "\e[30m\e[41m$1\e[0m\n"
}

images=(
"pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
"pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel"
"pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
"pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
"pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
)

branch="master"

# Associative array for exit codes
declare -A exit_codes
for image in images
do
  exit_codes[$image]="None"
done

for image in "${images[@]}"
do
  print_banner "$image"
  set -x
  docker pull $image
  # Trying python setup.py install instead of pip install to ensure direct access to error codes.
  # Maybe pip install would be ok too but this works.
  docker run --runtime=nvidia --rm $image /bin/bash -c "yes | pip uninstall apex; yes | pip uninstall apex; git clone https://github.com/NVIDIA/apex.git; cd apex; git checkout $branch; set -e;  python setup.py install --cuda_ext --cpp_ext"
  exit_code=$?
  set +x
  if [ $exit_code != 0 ]
  then
    print_red "Exit code: $exit_code"
  else
    print_green "Exit code: $exit_code"
  fi
  exit_codes[$image]=$exit_code
done

success=0
for image in "${images[@]}"
do
  exit_code=${exit_codes[$image]}
  if [ $exit_code != 0 ]
  then
    print_red "$image : $exit_code"
    success=1
  else
    print_green "$image : $exit_code"
  fi
done

if [ $success != 0 ]
then
  print_red "Overall status:  failure"
else
  print_green "Overall status:  success"
fi

exit $success