Repository: BM-K/Sentence-Embedding-Is-All-You-Need
Branch: main
Commit: 32cec8ea887f
Files: 522
Total size: 4.0 MB
Directory structure:
gitextract_y9jvowoy/
├── KoSBERT/
│ ├── Clustering.py
│ ├── README.md
│ ├── SemanticSearch.py
│ ├── con_training_sts.py
│ ├── output/
│ │ └── empty.txt
│ ├── run_example.sh
│ └── training_nli.py
├── KoSentenceT5/
│ ├── README.md
│ ├── apex/
│ │ ├── RNN/
│ │ │ ├── README.md
│ │ │ ├── RNNBackend.py
│ │ │ ├── __init__.py
│ │ │ ├── cells.py
│ │ │ └── models.py
│ │ ├── __init__.py
│ │ ├── amp/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── __version__.py
│ │ │ ├── _amp_state.py
│ │ │ ├── _initialize.py
│ │ │ ├── _process_optimizer.py
│ │ │ ├── amp.py
│ │ │ ├── compat.py
│ │ │ ├── frontend.py
│ │ │ ├── handle.py
│ │ │ ├── lists/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── functional_overrides.py
│ │ │ │ ├── tensor_overrides.py
│ │ │ │ └── torch_overrides.py
│ │ │ ├── opt.py
│ │ │ ├── rnn_compat.py
│ │ │ ├── scaler.py
│ │ │ ├── utils.py
│ │ │ └── wrap.py
│ │ ├── contrib/
│ │ │ ├── __init__.py
│ │ │ ├── bottleneck/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bottleneck.py
│ │ │ │ └── test.py
│ │ │ ├── csrc/
│ │ │ │ ├── bottleneck/
│ │ │ │ │ └── bottleneck.cpp
│ │ │ │ ├── fmha/
│ │ │ │ │ ├── fmha_api.cpp
│ │ │ │ │ └── src/
│ │ │ │ │ ├── fmha/
│ │ │ │ │ │ ├── gemm.h
│ │ │ │ │ │ ├── gmem_tile.h
│ │ │ │ │ │ ├── kernel_traits.h
│ │ │ │ │ │ ├── mask.h
│ │ │ │ │ │ ├── smem_tile.h
│ │ │ │ │ │ ├── softmax.h
│ │ │ │ │ │ └── utils.h
│ │ │ │ │ ├── fmha.h
│ │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h
│ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h
│ │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_kernel_1xN.h
│ │ │ │ │ ├── fmha_fprop_kernel_1xN_nl.h
│ │ │ │ │ ├── fmha_fprop_kernel_1xN_reload_v.h
│ │ │ │ │ ├── fmha_kernel.h
│ │ │ │ │ ├── fmha_noloop_reduce.cu
│ │ │ │ │ └── fmha_utils.h
│ │ │ │ ├── groupbn/
│ │ │ │ │ ├── batch_norm.cu
│ │ │ │ │ ├── batch_norm.h
│ │ │ │ │ ├── batch_norm_add_relu.cu
│ │ │ │ │ ├── batch_norm_add_relu.h
│ │ │ │ │ ├── cuda_utils.h
│ │ │ │ │ ├── interface.cpp
│ │ │ │ │ ├── ipc.cu
│ │ │ │ │ └── nhwc_batch_norm_kernel.h
│ │ │ │ ├── layer_norm/
│ │ │ │ │ ├── ln_api.cpp
│ │ │ │ │ ├── ln_bwd_semi_cuda_kernel.cu
│ │ │ │ │ ├── ln_fwd_cuda_kernel.cu
│ │ │ │ │ ├── ln_kernel_traits.h
│ │ │ │ │ └── utils.cuh
│ │ │ │ ├── multihead_attn/
│ │ │ │ │ ├── additive_masked_softmax_dropout.cpp
│ │ │ │ │ ├── additive_masked_softmax_dropout_cuda.cu
│ │ │ │ │ ├── dropout.h
│ │ │ │ │ ├── encdec_multihead_attn.cpp
│ │ │ │ │ ├── encdec_multihead_attn_cuda.cu
│ │ │ │ │ ├── encdec_multihead_attn_norm_add.cpp
│ │ │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu
│ │ │ │ │ ├── layer_norm.h
│ │ │ │ │ ├── masked_softmax_dropout.cpp
│ │ │ │ │ ├── masked_softmax_dropout_cuda.cu
│ │ │ │ │ ├── philox.h
│ │ │ │ │ ├── self_multihead_attn.cpp
│ │ │ │ │ ├── self_multihead_attn_bias.cpp
│ │ │ │ │ ├── self_multihead_attn_bias_additive_mask.cpp
│ │ │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_bias_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_norm_add.cpp
│ │ │ │ │ ├── self_multihead_attn_norm_add_cuda.cu
│ │ │ │ │ ├── softmax.h
│ │ │ │ │ └── strided_batched_gemm.h
│ │ │ │ ├── optimizers/
│ │ │ │ │ ├── fused_adam_cuda.cpp
│ │ │ │ │ ├── fused_adam_cuda_kernel.cu
│ │ │ │ │ ├── fused_lamb_cuda.cpp
│ │ │ │ │ ├── fused_lamb_cuda_kernel.cu
│ │ │ │ │ ├── multi_tensor_distopt_adam.cpp
│ │ │ │ │ ├── multi_tensor_distopt_adam_kernel.cu
│ │ │ │ │ ├── multi_tensor_distopt_lamb.cpp
│ │ │ │ │ └── multi_tensor_distopt_lamb_kernel.cu
│ │ │ │ ├── transducer/
│ │ │ │ │ ├── transducer_joint.cpp
│ │ │ │ │ ├── transducer_joint_kernel.cu
│ │ │ │ │ ├── transducer_loss.cpp
│ │ │ │ │ └── transducer_loss_kernel.cu
│ │ │ │ └── xentropy/
│ │ │ │ ├── interface.cpp
│ │ │ │ └── xentropy_kernel.cu
│ │ │ ├── examples/
│ │ │ │ └── multihead_attn/
│ │ │ │ ├── func_test_multihead_attn.py
│ │ │ │ └── perf_test_multihead_attn.py
│ │ │ ├── fmha/
│ │ │ │ ├── __init__.py
│ │ │ │ └── fmha.py
│ │ │ ├── groupbn/
│ │ │ │ ├── __init__.py
│ │ │ │ └── batch_norm.py
│ │ │ ├── layer_norm/
│ │ │ │ ├── __init__.py
│ │ │ │ └── layer_norm.py
│ │ │ ├── multihead_attn/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── encdec_multihead_attn.py
│ │ │ │ ├── encdec_multihead_attn_func.py
│ │ │ │ ├── fast_encdec_multihead_attn_func.py
│ │ │ │ ├── fast_encdec_multihead_attn_norm_add_func.py
│ │ │ │ ├── fast_self_multihead_attn_func.py
│ │ │ │ ├── fast_self_multihead_attn_norm_add_func.py
│ │ │ │ ├── mask_softmax_dropout_func.py
│ │ │ │ ├── self_multihead_attn.py
│ │ │ │ └── self_multihead_attn_func.py
│ │ │ ├── optimizers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── distributed_fused_adam.py
│ │ │ │ ├── distributed_fused_adam_v2.py
│ │ │ │ ├── distributed_fused_adam_v3.py
│ │ │ │ ├── distributed_fused_lamb.py
│ │ │ │ ├── fp16_optimizer.py
│ │ │ │ ├── fused_adam.py
│ │ │ │ ├── fused_lamb.py
│ │ │ │ └── fused_sgd.py
│ │ │ ├── sparsity/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── asp.py
│ │ │ │ ├── sparse_masklib.py
│ │ │ │ └── test/
│ │ │ │ ├── checkpointing_test_part1.py
│ │ │ │ ├── checkpointing_test_part2.py
│ │ │ │ ├── checkpointing_test_reference.py
│ │ │ │ └── toy_problem.py
│ │ │ ├── test/
│ │ │ │ ├── fmha/
│ │ │ │ │ └── test_fmha.py
│ │ │ │ ├── layer_norm/
│ │ │ │ │ └── test_fast_layer_norm.py
│ │ │ │ ├── multihead_attn/
│ │ │ │ │ ├── test_encdec_multihead_attn.py
│ │ │ │ │ ├── test_encdec_multihead_attn_norm_add.py
│ │ │ │ │ ├── test_fast_self_multihead_attn_bias.py
│ │ │ │ │ ├── test_mha_fused_softmax.py
│ │ │ │ │ ├── test_self_multihead_attn.py
│ │ │ │ │ └── test_self_multihead_attn_norm_add.py
│ │ │ │ ├── test_label_smoothing.py
│ │ │ │ └── transducer/
│ │ │ │ ├── test_transducer_joint.py
│ │ │ │ ├── test_transducer_loss.py
│ │ │ │ └── transducer_ref.py
│ │ │ ├── transducer/
│ │ │ │ ├── __init__.py
│ │ │ │ └── transducer.py
│ │ │ └── xentropy/
│ │ │ ├── __init__.py
│ │ │ └── softmax_xentropy.py
│ │ ├── fp16_utils/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── fp16_optimizer.py
│ │ │ ├── fp16util.py
│ │ │ └── loss_scaler.py
│ │ ├── mlp/
│ │ │ ├── __init__.py
│ │ │ └── mlp.py
│ │ ├── multi_tensor_apply/
│ │ │ ├── __init__.py
│ │ │ └── multi_tensor_apply.py
│ │ ├── normalization/
│ │ │ ├── __init__.py
│ │ │ └── fused_layer_norm.py
│ │ ├── optimizers/
│ │ │ ├── __init__.py
│ │ │ ├── fused_adagrad.py
│ │ │ ├── fused_adam.py
│ │ │ ├── fused_lamb.py
│ │ │ ├── fused_novograd.py
│ │ │ └── fused_sgd.py
│ │ ├── parallel/
│ │ │ ├── LARC.py
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── distributed.py
│ │ │ ├── multiproc.py
│ │ │ ├── optimized_sync_batchnorm.py
│ │ │ ├── optimized_sync_batchnorm_kernel.py
│ │ │ ├── sync_batchnorm.py
│ │ │ └── sync_batchnorm_kernel.py
│ │ ├── pyprof/
│ │ │ ├── FAQs.md
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── examples/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── apex/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── fused_adam.py
│ │ │ │ │ ├── fused_layer_norm.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── custom_func_module/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── custom_function.py
│ │ │ │ │ ├── custom_module.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── imagenet/
│ │ │ │ │ ├── imagenet.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── jit/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── jit_script_function.py
│ │ │ │ │ ├── jit_script_method.py
│ │ │ │ │ ├── jit_trace_function.py
│ │ │ │ │ ├── jit_trace_method.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── lenet.py
│ │ │ │ ├── operators.py
│ │ │ │ ├── simple.py
│ │ │ │ └── user_annotation/
│ │ │ │ ├── README.md
│ │ │ │ ├── resnet.py
│ │ │ │ └── test.sh
│ │ │ ├── nvtx/
│ │ │ │ ├── __init__.py
│ │ │ │ └── nvmarker.py
│ │ │ ├── parse/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── db.py
│ │ │ │ ├── kernel.py
│ │ │ │ ├── nvvp.py
│ │ │ │ └── parse.py
│ │ │ └── prof/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── activation.py
│ │ │ ├── base.py
│ │ │ ├── blas.py
│ │ │ ├── conv.py
│ │ │ ├── convert.py
│ │ │ ├── data.py
│ │ │ ├── dropout.py
│ │ │ ├── embedding.py
│ │ │ ├── index_slice_join_mutate.py
│ │ │ ├── linear.py
│ │ │ ├── loss.py
│ │ │ ├── misc.py
│ │ │ ├── normalization.py
│ │ │ ├── optim.py
│ │ │ ├── output.py
│ │ │ ├── pointwise.py
│ │ │ ├── pooling.py
│ │ │ ├── prof.py
│ │ │ ├── randomSample.py
│ │ │ ├── recurrentCell.py
│ │ │ ├── reduction.py
│ │ │ ├── softmax.py
│ │ │ ├── usage.py
│ │ │ └── utility.py
│ │ └── reparameterization/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── reparameterization.py
│ │ └── weight_norm.py
│ ├── data/
│ │ └── dataloader.py
│ ├── main.py
│ ├── model/
│ │ ├── loss.py
│ │ ├── setting.py
│ │ ├── simcse/
│ │ │ ├── kost5.py
│ │ │ └── processor.py
│ │ └── utils.py
│ └── run_example.sh
├── KoSimCSE/
│ ├── README.md
│ ├── SemanticSearch.py
│ ├── apex/
│ │ ├── RNN/
│ │ │ ├── README.md
│ │ │ ├── RNNBackend.py
│ │ │ ├── __init__.py
│ │ │ ├── cells.py
│ │ │ └── models.py
│ │ ├── __init__.py
│ │ ├── amp/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── __version__.py
│ │ │ ├── _amp_state.py
│ │ │ ├── _initialize.py
│ │ │ ├── _process_optimizer.py
│ │ │ ├── amp.py
│ │ │ ├── compat.py
│ │ │ ├── frontend.py
│ │ │ ├── handle.py
│ │ │ ├── lists/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── functional_overrides.py
│ │ │ │ ├── tensor_overrides.py
│ │ │ │ └── torch_overrides.py
│ │ │ ├── opt.py
│ │ │ ├── rnn_compat.py
│ │ │ ├── scaler.py
│ │ │ ├── utils.py
│ │ │ └── wrap.py
│ │ ├── contrib/
│ │ │ ├── __init__.py
│ │ │ ├── bottleneck/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bottleneck.py
│ │ │ │ └── test.py
│ │ │ ├── csrc/
│ │ │ │ ├── bottleneck/
│ │ │ │ │ └── bottleneck.cpp
│ │ │ │ ├── fmha/
│ │ │ │ │ ├── fmha_api.cpp
│ │ │ │ │ └── src/
│ │ │ │ │ ├── fmha/
│ │ │ │ │ │ ├── gemm.h
│ │ │ │ │ │ ├── gmem_tile.h
│ │ │ │ │ │ ├── kernel_traits.h
│ │ │ │ │ │ ├── mask.h
│ │ │ │ │ │ ├── smem_tile.h
│ │ │ │ │ │ ├── softmax.h
│ │ │ │ │ │ └── utils.h
│ │ │ │ │ ├── fmha.h
│ │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h
│ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h
│ │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_kernel_1xN.h
│ │ │ │ │ ├── fmha_fprop_kernel_1xN_nl.h
│ │ │ │ │ ├── fmha_fprop_kernel_1xN_reload_v.h
│ │ │ │ │ ├── fmha_kernel.h
│ │ │ │ │ ├── fmha_noloop_reduce.cu
│ │ │ │ │ └── fmha_utils.h
│ │ │ │ ├── groupbn/
│ │ │ │ │ ├── batch_norm.cu
│ │ │ │ │ ├── batch_norm.h
│ │ │ │ │ ├── batch_norm_add_relu.cu
│ │ │ │ │ ├── batch_norm_add_relu.h
│ │ │ │ │ ├── cuda_utils.h
│ │ │ │ │ ├── interface.cpp
│ │ │ │ │ ├── ipc.cu
│ │ │ │ │ └── nhwc_batch_norm_kernel.h
│ │ │ │ ├── layer_norm/
│ │ │ │ │ ├── ln_api.cpp
│ │ │ │ │ ├── ln_bwd_semi_cuda_kernel.cu
│ │ │ │ │ ├── ln_fwd_cuda_kernel.cu
│ │ │ │ │ ├── ln_kernel_traits.h
│ │ │ │ │ └── utils.cuh
│ │ │ │ ├── multihead_attn/
│ │ │ │ │ ├── additive_masked_softmax_dropout.cpp
│ │ │ │ │ ├── additive_masked_softmax_dropout_cuda.cu
│ │ │ │ │ ├── dropout.h
│ │ │ │ │ ├── encdec_multihead_attn.cpp
│ │ │ │ │ ├── encdec_multihead_attn_cuda.cu
│ │ │ │ │ ├── encdec_multihead_attn_norm_add.cpp
│ │ │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu
│ │ │ │ │ ├── layer_norm.h
│ │ │ │ │ ├── masked_softmax_dropout.cpp
│ │ │ │ │ ├── masked_softmax_dropout_cuda.cu
│ │ │ │ │ ├── philox.h
│ │ │ │ │ ├── self_multihead_attn.cpp
│ │ │ │ │ ├── self_multihead_attn_bias.cpp
│ │ │ │ │ ├── self_multihead_attn_bias_additive_mask.cpp
│ │ │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_bias_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_norm_add.cpp
│ │ │ │ │ ├── self_multihead_attn_norm_add_cuda.cu
│ │ │ │ │ ├── softmax.h
│ │ │ │ │ └── strided_batched_gemm.h
│ │ │ │ ├── optimizers/
│ │ │ │ │ ├── fused_adam_cuda.cpp
│ │ │ │ │ ├── fused_adam_cuda_kernel.cu
│ │ │ │ │ ├── fused_lamb_cuda.cpp
│ │ │ │ │ ├── fused_lamb_cuda_kernel.cu
│ │ │ │ │ ├── multi_tensor_distopt_adam.cpp
│ │ │ │ │ ├── multi_tensor_distopt_adam_kernel.cu
│ │ │ │ │ ├── multi_tensor_distopt_lamb.cpp
│ │ │ │ │ └── multi_tensor_distopt_lamb_kernel.cu
│ │ │ │ ├── transducer/
│ │ │ │ │ ├── transducer_joint.cpp
│ │ │ │ │ ├── transducer_joint_kernel.cu
│ │ │ │ │ ├── transducer_loss.cpp
│ │ │ │ │ └── transducer_loss_kernel.cu
│ │ │ │ └── xentropy/
│ │ │ │ ├── interface.cpp
│ │ │ │ └── xentropy_kernel.cu
│ │ │ ├── examples/
│ │ │ │ └── multihead_attn/
│ │ │ │ ├── func_test_multihead_attn.py
│ │ │ │ └── perf_test_multihead_attn.py
│ │ │ ├── fmha/
│ │ │ │ ├── __init__.py
│ │ │ │ └── fmha.py
│ │ │ ├── groupbn/
│ │ │ │ ├── __init__.py
│ │ │ │ └── batch_norm.py
│ │ │ ├── layer_norm/
│ │ │ │ ├── __init__.py
│ │ │ │ └── layer_norm.py
│ │ │ ├── multihead_attn/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── encdec_multihead_attn.py
│ │ │ │ ├── encdec_multihead_attn_func.py
│ │ │ │ ├── fast_encdec_multihead_attn_func.py
│ │ │ │ ├── fast_encdec_multihead_attn_norm_add_func.py
│ │ │ │ ├── fast_self_multihead_attn_func.py
│ │ │ │ ├── fast_self_multihead_attn_norm_add_func.py
│ │ │ │ ├── mask_softmax_dropout_func.py
│ │ │ │ ├── self_multihead_attn.py
│ │ │ │ └── self_multihead_attn_func.py
│ │ │ ├── optimizers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── distributed_fused_adam.py
│ │ │ │ ├── distributed_fused_adam_v2.py
│ │ │ │ ├── distributed_fused_adam_v3.py
│ │ │ │ ├── distributed_fused_lamb.py
│ │ │ │ ├── fp16_optimizer.py
│ │ │ │ ├── fused_adam.py
│ │ │ │ ├── fused_lamb.py
│ │ │ │ └── fused_sgd.py
│ │ │ ├── sparsity/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── asp.py
│ │ │ │ ├── sparse_masklib.py
│ │ │ │ └── test/
│ │ │ │ ├── checkpointing_test_part1.py
│ │ │ │ ├── checkpointing_test_part2.py
│ │ │ │ ├── checkpointing_test_reference.py
│ │ │ │ └── toy_problem.py
│ │ │ ├── test/
│ │ │ │ ├── fmha/
│ │ │ │ │ └── test_fmha.py
│ │ │ │ ├── layer_norm/
│ │ │ │ │ └── test_fast_layer_norm.py
│ │ │ │ ├── multihead_attn/
│ │ │ │ │ ├── test_encdec_multihead_attn.py
│ │ │ │ │ ├── test_encdec_multihead_attn_norm_add.py
│ │ │ │ │ ├── test_fast_self_multihead_attn_bias.py
│ │ │ │ │ ├── test_mha_fused_softmax.py
│ │ │ │ │ ├── test_self_multihead_attn.py
│ │ │ │ │ └── test_self_multihead_attn_norm_add.py
│ │ │ │ ├── test_label_smoothing.py
│ │ │ │ └── transducer/
│ │ │ │ ├── test_transducer_joint.py
│ │ │ │ ├── test_transducer_loss.py
│ │ │ │ └── transducer_ref.py
│ │ │ ├── transducer/
│ │ │ │ ├── __init__.py
│ │ │ │ └── transducer.py
│ │ │ └── xentropy/
│ │ │ ├── __init__.py
│ │ │ └── softmax_xentropy.py
│ │ ├── fp16_utils/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── fp16_optimizer.py
│ │ │ ├── fp16util.py
│ │ │ └── loss_scaler.py
│ │ ├── mlp/
│ │ │ ├── __init__.py
│ │ │ └── mlp.py
│ │ ├── multi_tensor_apply/
│ │ │ ├── __init__.py
│ │ │ └── multi_tensor_apply.py
│ │ ├── normalization/
│ │ │ ├── __init__.py
│ │ │ └── fused_layer_norm.py
│ │ ├── optimizers/
│ │ │ ├── __init__.py
│ │ │ ├── fused_adagrad.py
│ │ │ ├── fused_adam.py
│ │ │ ├── fused_lamb.py
│ │ │ ├── fused_novograd.py
│ │ │ └── fused_sgd.py
│ │ ├── parallel/
│ │ │ ├── LARC.py
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── distributed.py
│ │ │ ├── multiproc.py
│ │ │ ├── optimized_sync_batchnorm.py
│ │ │ ├── optimized_sync_batchnorm_kernel.py
│ │ │ ├── sync_batchnorm.py
│ │ │ └── sync_batchnorm_kernel.py
│ │ ├── pyprof/
│ │ │ ├── FAQs.md
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── examples/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── apex/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── fused_adam.py
│ │ │ │ │ ├── fused_layer_norm.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── custom_func_module/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── custom_function.py
│ │ │ │ │ ├── custom_module.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── imagenet/
│ │ │ │ │ ├── imagenet.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── jit/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── jit_script_function.py
│ │ │ │ │ ├── jit_script_method.py
│ │ │ │ │ ├── jit_trace_function.py
│ │ │ │ │ ├── jit_trace_method.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── lenet.py
│ │ │ │ ├── operators.py
│ │ │ │ ├── simple.py
│ │ │ │ └── user_annotation/
│ │ │ │ ├── README.md
│ │ │ │ ├── resnet.py
│ │ │ │ └── test.sh
│ │ │ ├── nvtx/
│ │ │ │ ├── __init__.py
│ │ │ │ └── nvmarker.py
│ │ │ ├── parse/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── db.py
│ │ │ │ ├── kernel.py
│ │ │ │ ├── nvvp.py
│ │ │ │ └── parse.py
│ │ │ └── prof/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── activation.py
│ │ │ ├── base.py
│ │ │ ├── blas.py
│ │ │ ├── conv.py
│ │ │ ├── convert.py
│ │ │ ├── data.py
│ │ │ ├── dropout.py
│ │ │ ├── embedding.py
│ │ │ ├── index_slice_join_mutate.py
│ │ │ ├── linear.py
│ │ │ ├── loss.py
│ │ │ ├── misc.py
│ │ │ ├── normalization.py
│ │ │ ├── optim.py
│ │ │ ├── output.py
│ │ │ ├── pointwise.py
│ │ │ ├── pooling.py
│ │ │ ├── prof.py
│ │ │ ├── randomSample.py
│ │ │ ├── recurrentCell.py
│ │ │ ├── reduction.py
│ │ │ ├── softmax.py
│ │ │ ├── usage.py
│ │ │ └── utility.py
│ │ └── reparameterization/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── reparameterization.py
│ │ └── weight_norm.py
│ ├── data/
│ │ └── dataloader.py
│ ├── main.py
│ ├── model/
│ │ ├── loss.py
│ │ ├── setting.py
│ │ ├── simcse/
│ │ │ ├── bert.py
│ │ │ └── processor.py
│ │ └── utils.py
│ ├── output/
│ │ └── empty.txt
│ ├── requirements.txt
│ └── run_example.sh
├── LICENSE
├── README.md
├── get_model_checkpoint.sh
└── get_model_dataset.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: KoSBERT/Clustering.py
================================================
from sentence_transformers import SentenceTransformer, util
import numpy as np
model_path = '../Checkpoint/KoSBERT/kosbert-klue-bert-base'
embedder = SentenceTransformer(model_path)
# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
'한 남자가 빵 한 조각을 먹는다.',
'그 여자가 아이를 돌본다.',
'한 남자가 말을 탄다.',
'한 여자가 바이올린을 연주한다.',
'두 남자가 수레를 숲 솦으로 밀었다.',
'한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
'원숭이 한 마리가 드럼을 연주한다.',
'치타 한 마리가 먹이 뒤에서 달리고 있다.',
'한 남자가 파스타를 먹는다.',
'고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
'치타가 들판을 가로 질러 먹이를 쫓는다.']
corpus_embeddings = embedder.encode(corpus)
# Then, we perform k-means clustering using sklearn:
from sklearn.cluster import KMeans
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
clustered_sentences[cluster_id].append(corpus[sentence_id])
for i, cluster in enumerate(clustered_sentences):
print("Cluster ", i+1)
print(cluster)
print("")
================================================
FILE: KoSBERT/README.md
================================================
# KoSentenceBERT
[[Github]](https://github.com/UKPLab/sentence-transformers) Official implementation of SBERT.
Korean SentenceBERT : Korean Sentence Embeddings using Siamese BERT-Networks.
## Quick start
- To run inference quickly, download the pre-trained models and then run the downstream tasks.
```
bash get_model_checkpoint.sh
python SemanticSearch.py
```
## Training
- Before training or evaluation, please download the datasets by running
```
bash get_model_dataset.sh
```
- Two-stage training
  - First step: train on the NLI dataset
```
python training_nli.py --model klue/bert-base --batch 32 --evaluation_steps 1000 --epochs 1
```
  - Second step: continue training on the STS dataset
```
python con_training_sts.py --model klue/bert-base --batch 32 --evaluation_steps 1000 --epochs 4
```
- Run Examples
```
bash run_example.sh
```
### Hyperparameters
- Training NLI
1. Pooling Method: MEAN strategy
2. Batch Size: 32
3. Evaluation Steps: 1000
4. Epochs: 1 (BERT), 2 (RoBERTa)
- Continued Training STS
1. Pooling Method: MEAN strategy
2. Batch Size: 32
3. Evaluation Steps: 1000
4. Epochs: 4
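
For reference, a condensed sketch of how these hyperparameters map onto the `sentence-transformers` training call (this mirrors `training_nli.py` in this folder; the STS dev evaluator used by the full script is omitted here for brevity):

```python
import math
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, models, losses, InputExample

# MEAN pooling on top of the base encoder (Pooling Method above)
word_embedding_model = models.Transformer('klue/bert-base')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# NLI training pairs, read as in training_nli.py
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = []
with open('../Dataset/snli_1.0_train.ko.tsv', 'rt', encoding='utf-8') as fIn:
    for line in fIn:
        s1, s2, label = line.split('\t')
        train_samples.append(InputExample(texts=[s1, s2], label=label2int[label.strip()]))

train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)   # Batch Size: 32
train_loss = losses.SoftmaxLoss(model=model,
                                sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                num_labels=len(label2int))

warmup_steps = math.ceil(len(train_dataset) * 1 / 32 * 0.1)  # 10% of train steps for warm-up
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,                # Epochs: 1 (BERT)
          evaluation_steps=1000,   # Evaluation Steps: 1000 (the full script attaches an STS dev evaluator)
          warmup_steps=warmup_steps,
          output_path='output/training_nli_klue-bert-base')
```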
### Semantic Search
```
python SemanticSearch.py
```
```python
from sentence_transformers import SentenceTransformer, util
import numpy as np
model_path = '../Checkpoint/KoSBERT/kosbert-klue-bert-base'
embedder = SentenceTransformer(model_path)
# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
'한 남자가 빵 한 조각을 먹는다.',
'그 여자가 아이를 돌본다.',
'한 남자가 말을 탄다.',
'한 여자가 바이올린을 연주한다.',
'두 남자가 수레를 숲 솦으로 밀었다.',
'한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
'원숭이 한 마리가 드럼을 연주한다.',
'치타 한 마리가 먹이 뒤에서 달리고 있다.']
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
# Query sentences:
queries = ['한 남자가 파스타를 먹는다.',
'고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
'치타가 들판을 가로 질러 먹이를 쫓는다.']
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = 5
for query in queries:
query_embedding = embedder.encode(query, convert_to_tensor=True)
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
cos_scores = cos_scores.cpu()
    # We use np.argpartition to only partially sort the top_k results
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
print("\n\n======================\n\n")
print("Query:", query)
print("\nTop 5 most similar sentences in corpus:")
for idx in top_results[0:top_k]:
print(corpus[idx].strip(), "(Score: %.4f)" % (cos_scores[idx]))
```
- Results are as follows:
```
Query: 한 남자가 파스타를 먹는다.
Top 5 most similar sentences in corpus:
한 남자가 음식을 먹는다. (Score: 0.6141)
한 남자가 빵 한 조각을 먹는다. (Score: 0.5952)
한 남자가 말을 탄다. (Score: 0.1231)
한 남자가 담으로 싸인 땅에서 백마를 타고 있다. (Score: 0.0752)
두 남자가 수레를 숲 솦으로 밀었다. (Score: 0.0486)
======================
Query: 고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.
Top 5 most similar sentences in corpus:
원숭이 한 마리가 드럼을 연주한다. (Score: 0.6656)
치타 한 마리가 먹이 뒤에서 달리고 있다. (Score: 0.2988)
한 여자가 바이올린을 연주한다. (Score: 0.1566)
한 남자가 말을 탄다. (Score: 0.1112)
한 남자가 담으로 싸인 땅에서 백마를 타고 있다. (Score: 0.0262)
======================
Query: 치타가 들판을 가로 질러 먹이를 쫓는다.
Top 5 most similar sentences in corpus:
치타 한 마리가 먹이 뒤에서 달리고 있다. (Score: 0.7570)
두 남자가 수레를 숲 솦으로 밀었다. (Score: 0.3658)
원숭이 한 마리가 드럼을 연주한다. (Score: 0.3583)
한 남자가 말을 탄다. (Score: 0.0505)
그 여자가 아이를 돌본다. (Score: -0.0087)
```
### Clustering
```
python Clustering.py
```
```python
from sentence_transformers import SentenceTransformer, util
import numpy as np
model_path = '../Checkpoint/KoSBERT/kosbert-klue-bert-base'
embedder = SentenceTransformer(model_path)
# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
'한 남자가 빵 한 조각을 먹는다.',
'그 여자가 아이를 돌본다.',
'한 남자가 말을 탄다.',
'한 여자가 바이올린을 연주한다.',
'두 남자가 수레를 숲 솦으로 밀었다.',
'한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
'원숭이 한 마리가 드럼을 연주한다.',
'치타 한 마리가 먹이 뒤에서 달리고 있다.',
'한 남자가 파스타를 먹는다.',
'고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
'치타가 들판을 가로 질러 먹이를 쫓는다.']
corpus_embeddings = embedder.encode(corpus)
# Then, we perform k-means clustering using sklearn:
from sklearn.cluster import KMeans
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
clustered_sentences[cluster_id].append(corpus[sentence_id])
for i, cluster in enumerate(clustered_sentences):
print("Cluster ", i+1)
print(cluster)
print("")
```
- Results are as follows:
```
Cluster 1
['한 남자가 음식을 먹는다.', '한 남자가 빵 한 조각을 먹는다.', '한 남자가 파스타를 먹는다.']
Cluster 2
['원숭이 한 마리가 드럼을 연주한다.', '고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.']
Cluster 3
['한 남자가 말을 탄다.', '두 남자가 수레를 숲 솦으로 밀었다.', '한 남자가 담으로 싸인 땅에서 백마를 타고 있다.']
Cluster 4
['치타 한 마리가 먹이 뒤에서 달리고 있다.', '치타가 들판을 가로 질러 먹이를 쫓는다.']
Cluster 5
['그 여자가 아이를 돌본다.', '한 여자가 바이올린을 연주한다.']
```
================================================
FILE: KoSBERT/SemanticSearch.py
================================================
from sentence_transformers import SentenceTransformer, util
import numpy as np
model_path = '../Checkpoint/KoSBERT/kosbert-klue-bert-base'
embedder = SentenceTransformer(model_path)
# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
'한 남자가 빵 한 조각을 먹는다.',
'그 여자가 아이를 돌본다.',
'한 남자가 말을 탄다.',
'한 여자가 바이올린을 연주한다.',
'두 남자가 수레를 숲 솦으로 밀었다.',
'한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
'원숭이 한 마리가 드럼을 연주한다.',
'치타 한 마리가 먹이 뒤에서 달리고 있다.']
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
# Query sentences:
queries = ['한 남자가 파스타를 먹는다.',
'고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
'치타가 들판을 가로 질러 먹이를 쫓는다.']
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = 5
for query in queries:
query_embedding = embedder.encode(query, convert_to_tensor=True)
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
cos_scores = cos_scores.cpu()
    # We use np.argpartition to only partially sort the top_k results
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
print("\n\n======================\n\n")
print("Query:", query)
print("\nTop 5 most similar sentences in corpus:")
for idx in top_results[0:top_k]:
print(corpus[idx].strip(), "(Score: %.4f)" % (cos_scores[idx]))
================================================
FILE: KoSBERT/con_training_sts.py
================================================
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='klue/bert-base')
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--evaluation_steps', type=int, default=1000)
parser.add_argument('--epochs', type=int, default=4)
args = parser.parse_args()
logging.basicConfig(format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.INFO,
handlers=[LoggingHandler()])
model_name = './output/training_nli_'+args.model.replace("/", "-")
train_batch_size = args.batch
num_epochs = args.epochs
model_save_path = 'output/kosbert-'+args.model.replace("/", "-")
model = SentenceTransformer(model_name)
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with open('../Dataset/tune_sts_dev.tsv', 'rt', encoding='utf-8') as fIn:
lines = fIn.readlines()
for line in lines:
s1, s2, score = line.split('\t')
score = score.strip()
score = float(score) / 5.0
dev_samples.append(InputExample(texts= [s1,s2], label=score))
with open('../Dataset/tune_sts_test.tsv', 'rt', encoding='utf-8') as fIn:
lines = fIn.readlines()
for line in lines:
s1, s2, score = line.split('\t')
score = score.strip()
score = float(score) / 5.0
test_samples.append(InputExample(texts= [s1,s2], label=score))
with open('../Dataset/tune_sts_train.tsv', 'rt', encoding='utf-8') as fIn:
lines = fIn.readlines()
for line in lines:
s1, s2, score = line.split('\t')
score = score.strip()
score = float(score) / 5.0
train_samples.append(InputExample(texts= [s1,s2], label=score))
train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
# Development set: Measure correlation between cosine score and gold labels
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=args.evaluation_steps,
warmup_steps=warmup_steps,
output_path=model_save_path)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)
================================================
FILE: KoSBERT/output/empty.txt
================================================
.
================================================
FILE: KoSBERT/run_example.sh
================================================
#!/bin/bash
# bert-base
echo "First Step Training NLI Dataset (BERT-BASE)"
CUDA_VISIBLE_DEVICES=0 python training_nli.py --model klue/bert-base --batch 32 --evaluation_steps 1000 --epochs 1
echo "Second Step Continuously Training STS Dataset (BERT-BASE)"
CUDA_VISIBLE_DEVICES=0 python con_training_sts.py --model klue/bert-base --batch 32 --evaluation_steps 1000 --epochs 4
# roberta-base
echo "First Step Training NLI Dataset (ROBERTA-BASE)"
CUDA_VISIBLE_DEVICES=0 python training_nli.py --model klue/roberta-base --batch 32 --evaluation_steps 1000 --epochs 1
echo "Second Step Continuously Training STS Dataset (ROBERTA-BASE)"
CUDA_VISIBLE_DEVICES=0 python con_training_sts.py --model klue/roberta-base --batch 32 --evaluation_steps 1000 --epochs 4
# roberta-large
echo "First Step Training NLI Dataset (ROBERAT-LARGE)"
CUDA_VISIBLE_DEVICES=0 python training_nli.py --model klue/roberta-large --batch 32 --evaluation_steps 1000 --epochs 1
echo "Second Step Continuously Training STS Dataset (ROBERTA-LARGE)"
CUDA_VISIBLE_DEVICES=0 python con_training_sts.py --model klue/roberta-large --batch 32 --evaluation_steps 1000 --epochs 4
================================================
FILE: KoSBERT/training_nli.py
================================================
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='klue/bert-base')
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--evaluation_steps', type=int, default=1000)
parser.add_argument('--epochs', type=int, default=1)
args = parser.parse_args()
model_name = args.model
train_batch_size = args.batch
model_save_path = 'output/training_nli_'+model_name.replace("/", "-")#+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
logging.info("Read AllNLI train dataset")
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = []
with open('../Dataset/snli_1.0_train.ko.tsv', "rt", encoding="utf-8") as fIn:
lines = fIn.readlines()
for line in lines:
s1, s2, label = line.split('\t')
label = label2int[label.strip()]
train_samples.append(InputExample(texts=[s1, s2], label=label))
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))
#Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with open('../Dataset/tune_sts_dev.tsv', 'rt', encoding='utf-8') as fIn:
lines = fIn.readlines()
for line in lines:
s1, s2, score = line.split('\t')
score = score.strip()
score = float(score) / 5.0
dev_samples.append(InputExample(texts= [s1,s2], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev')
num_epochs = args.epochs
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=args.evaluation_steps,
warmup_steps=warmup_steps,
output_path=model_save_path
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
test_samples = []
with open('../Dataset/tune_sts_test.tsv', 'rt', encoding='utf-8') as fIn:
lines = fIn.readlines()
for line in lines:
s1, s2, score = line.split('\t')
score = score.strip()
score = float(score) / 5.0
test_samples.append(InputExample(texts=[s1,s2], label=score))
print("\n\n\n")
print("======================TEST===================")
print("\n\n\n")
model = SentenceTransformer(model_save_path)
print(f"model save path > {model_save_path}")
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test')
test_evaluator(model, output_path=model_save_path)
================================================
FILE: KoSentenceT5/README.md
================================================
# KoSentenceT5
KoSentenceT5 : Korean Sentence Embeddings using T5.
> **Warning**
> This repository uses the ETRI T5 model but does not provide it. You can download the T5 model from [here](https://aiopen.etri.re.kr/service_dataset.php).
## Training
- Before training or evaluation, please download the datasets by running
```
bash get_model_dataset.sh
```
### Train KoSentenceT5
```
python main.py \
--model etri-t5 \
--multi_gpu True \
--test False \
--max_len 110 \
--batch_size 64 \
--epochs 2 \
--eval_steps 125 \
--lr 0.0001 \
--warmup_ratio 0.01 \
--temperature 0.05 \
--path_to_data ../Dataset/ \
--train_data train_nli.tsv \
--valid_data valid_sts.tsv
```
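
The `--temperature` flag scales the in-batch contrastive (SimCSE-style) objective. Below is a minimal sketch of that objective, assuming the standard SimCSE formulation; the repository's own implementation lives in `model/loss.py`.

```python
import torch
import torch.nn.functional as F

def simcse_loss(anchor_emb, positive_emb, temperature=0.05):
    """In-batch contrastive loss: each anchor's positive is the matching row;
    every other row in the batch acts as a negative."""
    anchor = F.normalize(anchor_emb, dim=-1)
    positive = F.normalize(positive_emb, dim=-1)
    # Cosine similarity matrix scaled by the temperature (--temperature 0.05 above)
    scores = anchor @ positive.t() / temperature            # [batch, batch]
    labels = torch.arange(scores.size(0), device=scores.device)
    return F.cross_entropy(scores, labels)

# Toy usage with random embeddings of shape [batch, hidden]
a, p = torch.randn(8, 768), torch.randn(8, 768)
print(simcse_loss(a, p).item())
```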
### Evaluation
```
python main.py \
--model etri-t5 \
--train False \
--test True \
--max_len 110 \
--batch_size 64 \
--temperature 0.05 \
--path_to_data ../Dataset/ \
--test_data test_sts.tsv
```
### Run Examples
```
bash run_example.sh
```
================================================
FILE: KoSentenceT5/apex/RNN/README.md
================================================
Under construction...
================================================
FILE: KoSentenceT5/apex/RNN/RNNBackend.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import math
def is_iterable(maybe_iterable):
return isinstance(maybe_iterable, list) or isinstance(maybe_iterable, tuple)
def flatten_list(tens_list):
"""
flatten_list
"""
if not is_iterable(tens_list):
return tens_list
return torch.cat(tens_list, dim=0).view(len(tens_list), *tens_list[0].size() )
#These modules always assumes batch_first
class bidirectionalRNN(nn.Module):
"""
bidirectionalRNN
"""
def __init__(self, inputRNN, num_layers=1, dropout = 0):
super(bidirectionalRNN, self).__init__()
self.dropout = dropout
self.fwd = stackedRNN(inputRNN, num_layers=num_layers, dropout = dropout)
self.bckwrd = stackedRNN(inputRNN.new_like(), num_layers=num_layers, dropout = dropout)
self.rnns = nn.ModuleList([self.fwd, self.bckwrd])
#collect hidden option will return all hidden/cell states from entire RNN
def forward(self, input, collect_hidden=False):
"""
forward()
"""
seq_len = input.size(0)
bsz = input.size(1)
fwd_out, fwd_hiddens = list(self.fwd(input, collect_hidden = collect_hidden))
bckwrd_out, bckwrd_hiddens = list(self.bckwrd(input, reverse=True, collect_hidden = collect_hidden))
output = torch.cat( [fwd_out, bckwrd_out], -1 )
hiddens = tuple( torch.cat(hidden, -1) for hidden in zip( fwd_hiddens, bckwrd_hiddens) )
return output, hiddens
def reset_parameters(self):
"""
reset_parameters()
"""
for rnn in self.rnns:
rnn.reset_parameters()
def init_hidden(self, bsz):
"""
init_hidden()
"""
for rnn in self.rnns:
rnn.init_hidden(bsz)
def detach_hidden(self):
"""
detach_hidden()
"""
for rnn in self.rnns:
            rnn.detach_hidden()
def reset_hidden(self, bsz):
"""
reset_hidden()
"""
for rnn in self.rnns:
rnn.reset_hidden(bsz)
def init_inference(self, bsz):
"""
init_inference()
"""
for rnn in self.rnns:
rnn.init_inference(bsz)
#assumes hidden_state[0] of inputRNN is output hidden state
#constructor either takes an RNNCell or list of RNN layers
class stackedRNN(nn.Module):
"""
stackedRNN
"""
def __init__(self, inputRNN, num_layers=1, dropout=0):
super(stackedRNN, self).__init__()
self.dropout = dropout
if isinstance(inputRNN, RNNCell):
self.rnns = [inputRNN]
for i in range(num_layers-1):
self.rnns.append(inputRNN.new_like(inputRNN.output_size))
elif isinstance(inputRNN, list):
assert len(inputRNN) == num_layers, "RNN list length must be equal to num_layers"
self.rnns=inputRNN
else:
raise RuntimeError()
self.nLayers = len(self.rnns)
self.rnns = nn.ModuleList(self.rnns)
'''
Returns output as hidden_state[0] Tensor([sequence steps][batch size][features])
If collect hidden will also return Tuple(
[n_hidden_states][sequence steps] Tensor([layer][batch size][features])
)
If not collect hidden will also return Tuple(
[n_hidden_states] Tensor([layer][batch size][features])
'''
def forward(self, input, collect_hidden=False, reverse=False):
"""
forward()
"""
seq_len = input.size(0)
bsz = input.size(1)
inp_iter = reversed(range(seq_len)) if reverse else range(seq_len)
hidden_states = [[] for i in range(self.nLayers)]
outputs = []
for seq in inp_iter:
for layer in range(self.nLayers):
if layer == 0:
prev_out = input[seq]
outs = self.rnns[layer](prev_out)
if collect_hidden:
hidden_states[layer].append(outs)
elif seq == seq_len-1:
hidden_states[layer].append(outs)
prev_out = outs[0]
outputs.append(prev_out)
if reverse:
outputs = list(reversed(outputs))
'''
At this point outputs is in format:
list( [seq_length] x Tensor([bsz][features]) )
need to convert it to:
list( Tensor([seq_length][bsz][features]) )
'''
output = flatten_list(outputs)
'''
hidden_states at this point is in format:
list( [layer][seq_length][hidden_states] x Tensor([bsz][features]) )
need to convert it to:
For not collect hidden:
list( [hidden_states] x Tensor([layer][bsz][features]) )
For collect hidden:
list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
'''
if not collect_hidden:
seq_len = 1
n_hid = self.rnns[0].n_hidden_states
new_hidden = [ [ [ None for k in range(self.nLayers)] for j in range(seq_len) ] for i in range(n_hid) ]
for i in range(n_hid):
for j in range(seq_len):
for k in range(self.nLayers):
new_hidden[i][j][k] = hidden_states[k][j][i]
hidden_states = new_hidden
#Now in format list( [hidden_states][seq_length][layer] x Tensor([bsz][features]) )
#Reverse seq_length if reverse
if reverse:
hidden_states = list( list(reversed(list(entry))) for entry in hidden_states)
#flatten layer dimension into tensor
hiddens = list( list(
flatten_list(seq) for seq in hidden )
for hidden in hidden_states )
#Now in format list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
#Remove seq_length dimension if not collect_hidden
if not collect_hidden:
hidden_states = list( entry[0] for entry in hidden_states)
return output, hidden_states
def reset_parameters(self):
"""
reset_parameters()
"""
for rnn in self.rnns:
rnn.reset_parameters()
def init_hidden(self, bsz):
"""
init_hidden()
"""
for rnn in self.rnns:
rnn.init_hidden(bsz)
def detach_hidden(self):
"""
detach_hidden()
"""
for rnn in self.rnns:
rnn.detach_hidden()
def reset_hidden(self, bsz):
"""
reset_hidden()
"""
for rnn in self.rnns:
rnn.reset_hidden(bsz)
def init_inference(self, bsz):
"""
init_inference()
"""
for rnn in self.rnns:
rnn.init_inference(bsz)
class RNNCell(nn.Module):
"""
RNNCell
gate_multiplier is related to the architecture you're working with
For LSTM-like it will be 4 and GRU-like will be 3.
Always assumes input is NOT batch_first.
Output size that's not hidden size will use output projection
Hidden_states is number of hidden states that are needed for cell
if one will go directly to cell as tensor, if more will go as list
"""
def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_hidden_states = 2, bias = False, output_size = None):
super(RNNCell, self).__init__()
self.gate_multiplier = gate_multiplier
self.input_size = input_size
self.hidden_size = hidden_size
self.cell = cell
self.bias = bias
self.output_size = output_size
if output_size is None:
self.output_size = hidden_size
self.gate_size = gate_multiplier * self.hidden_size
self.n_hidden_states = n_hidden_states
self.w_ih = nn.Parameter(torch.Tensor(self.gate_size, self.input_size))
self.w_hh = nn.Parameter(torch.Tensor(self.gate_size, self.output_size))
#Check if there's recurrent projection
if(self.output_size != self.hidden_size):
self.w_ho = nn.Parameter(torch.Tensor(self.output_size, self.hidden_size))
self.b_ih = self.b_hh = None
if self.bias:
self.b_ih = nn.Parameter(torch.Tensor(self.gate_size))
self.b_hh = nn.Parameter(torch.Tensor(self.gate_size))
#hidden states for forward
self.hidden = [ None for states in range(self.n_hidden_states)]
self.reset_parameters()
def new_like(self, new_input_size=None):
"""
new_like()
"""
if new_input_size is None:
new_input_size = self.input_size
return type(self)(self.gate_multiplier,
new_input_size,
self.hidden_size,
self.cell,
self.n_hidden_states,
self.bias,
self.output_size)
#Use xavier where we can (weights), otherwise use uniform (bias)
def reset_parameters(self, gain=1):
"""
reset_parameters()
"""
stdev = 1.0 / math.sqrt(self.hidden_size)
for param in self.parameters():
param.data.uniform_(-stdev, stdev)
'''
Xavier reset:
def reset_parameters(self, gain=1):
stdv = 1.0 / math.sqrt(self.gate_size)
for param in self.parameters():
if (param.dim() > 1):
torch.nn.init.xavier_normal(param, gain)
else:
param.data.uniform_(-stdv, stdv)
'''
def init_hidden(self, bsz):
"""
init_hidden()
"""
for param in self.parameters():
if param is not None:
a_param = param
break
for i, _ in enumerate(self.hidden):
if(self.hidden[i] is None or self.hidden[i].data.size()[0] != bsz):
if i==0:
hidden_size = self.output_size
else:
hidden_size = self.hidden_size
tens = a_param.data.new(bsz, hidden_size).zero_()
self.hidden[i] = Variable(tens, requires_grad=False)
def reset_hidden(self, bsz):
"""
reset_hidden()
"""
for i, _ in enumerate(self.hidden):
self.hidden[i] = None
self.init_hidden(bsz)
def detach_hidden(self):
"""
detach_hidden()
"""
for i, _ in enumerate(self.hidden):
if self.hidden[i] is None:
raise RuntimeError("Must initialize hidden state before you can detach it")
for i, _ in enumerate(self.hidden):
self.hidden[i] = self.hidden[i].detach()
def forward(self, input):
"""
forward()
if not inited or bsz has changed this will create hidden states
"""
self.init_hidden(input.size()[0])
hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
self.hidden = self.cell(input, hidden_state, self.w_ih, self.w_hh, b_ih=self.b_ih, b_hh=self.b_hh)
if(self.n_hidden_states > 1):
self.hidden = list(self.hidden)
else:
self.hidden=[self.hidden]
if self.output_size != self.hidden_size:
self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
return tuple(self.hidden)
================================================
FILE: KoSentenceT5/apex/RNN/__init__.py
================================================
from .models import LSTM, GRU, ReLU, Tanh, mLSTM
__all__ = ['models']
================================================
FILE: KoSentenceT5/apex/RNN/cells.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from .RNNBackend import RNNCell
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
import math
class mLSTMRNNCell(RNNCell):
"""
mLSTMRNNCell
"""
def __init__(self, input_size, hidden_size, bias = False, output_size = None):
gate_multiplier = 4
super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size)
self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size))
self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size))
self.reset_parameters()
def forward(self, input):
"""
mLSTMRNNCell.forward()
"""
#if not inited or bsz has changed this will create hidden states
self.init_hidden(input.size()[0])
hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
self.hidden = list(
self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh,
b_ih=self.b_ih, b_hh=self.b_hh)
)
if self.output_size != self.hidden_size:
self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
return tuple(self.hidden)
def new_like(self, new_input_size=None):
if new_input_size is None:
new_input_size = self.input_size
return type(self)(
new_input_size,
self.hidden_size,
self.bias,
self.output_size)
def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
"""
mLSTMCell
"""
if input.is_cuda:
igates = F.linear(input, w_ih)
m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
hgates = F.linear(m, w_hh)
state = fusedBackend.LSTMFused.apply
return state(igates, hgates, hidden[1], b_ih, b_hh)
hx, cx = hidden
m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
ingate = F.sigmoid(ingate)
forgetgate = F.sigmoid(forgetgate)
cellgate = F.tanh(cellgate)
outgate = F.sigmoid(outgate)
cy = (forgetgate * cx) + (ingate * cellgate)
hy = outgate * F.tanh(cy)
return hy, cy
================================================
FILE: KoSentenceT5/apex/RNN/models.py
================================================
import torch
from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell
from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
from .cells import mLSTMRNNCell, mLSTMCell
def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
"""
:class:`toRNNBackend`
"""
if bidirectional:
return bidirectionalRNN(inputRNN, num_layers, dropout = dropout)
else:
return stackedRNN(inputRNN, num_layers, dropout = dropout)
def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
"""
:class:`LSTM`
"""
inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size)
return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
"""
:class:`GRU`
"""
inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size)
return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
"""
:class:`ReLU`
"""
inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size)
return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
"""
:class:`Tanh`
"""
inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size)
return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
"""
:class:`mLSTM`
"""
inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size)
return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
================================================
FILE: KoSentenceT5/apex/__init__.py
================================================
# May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
import torch
import warnings
if torch.distributed.is_available():
from . import parallel
from . import amp
from . import fp16_utils
# For optimizers and normalization there is no Python fallback.
# Absence of cuda backend is a hard error.
# I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
# to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
# so they expect those backends to be available, but for some reason they actually aren't
# available (for example because they built improperly in a way that isn't revealed until
# load time) the error message is timely and visible.
from . import optimizers
from . import normalization
from . import pyprof
================================================
FILE: KoSentenceT5/apex/amp/README.md
================================================
# amp: Automatic Mixed Precision
## Annotating User Functions
Nearly all PyTorch user code needs nothing more than the two steps
above to use amp. After all, custom layers are built out of simpler
PyTorch components, and amp already can see those.
However, any custom C++ or CUDA code is outside of amp's (default)
view of things. For example, suppose I implemented a new recurrent
cell called a "forgetful recurrent unit" that calls directly into a
CUDA backend:
```python
from backend import FRUBackend
def fru(input, hidden, weight, bias):
# call to CUDA code
FRUBackend(input, hidden, weight, bias)
```
In this case, it is possible to get a runtime type mismatch. For
example, you might have `input` in fp16, and `weight` in fp32, and amp
doesn't have the visibility to insert an appropriate cast.
amp exposes two ways to handle "invisible" backend code: function
annotations and explicit registration.
#### Function annotation
The first way to handle backend code is a set of function annotations:
- `@amp.half_function`
- `@amp.float_function`
- `@amp.promote_function`
These correspond to:
- Cast all arguments to fp16
- Cast all arguments to fp32
- If there are any type mismatches, cast everything to the widest type
In our example, we believe that the FRU unit is fp16-safe and will get
performance gains from casting its arguments to fp16, so we write:
```python
@amp.half_function
def fru(input, hidden, weight, bias):
#...
```
#### Explicit registration
The other way to handle backend code is with explicit function
registration:
- `amp.register_half_function(module, function_name)`
- `amp.register_float_function(module, function_name)`
- `amp.register_promote_function(module, function_name)`
When using this API, `module` is the containing class or module for
the function, and `function_name` is the _string_ name of the
function. Note that the function must be registered before the call to
`amp.initialize()`.
For our FRU unit, we can register the backend function directly:
```python
import backend
amp.register_half_function(backend, 'FRUBackend')
```
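
Putting the two together, the ordering looks roughly like this. This is a sketch only: `backend` is the hypothetical module from the example above, and the model and data are toy placeholders.

```python
import torch
from apex import amp
import backend  # hypothetical module exposing the FRUBackend CUDA call

# Registration must happen BEFORE amp.initialize()
amp.register_half_function(backend, 'FRUBackend')

model = torch.nn.Linear(10, 10).cuda()                    # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

# Toy data standing in for a real data loader
loader = [(torch.randn(4, 10).cuda(), torch.randn(4, 10).cuda()) for _ in range(2)]

for input, target in loader:
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(input), target)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
```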
================================================
FILE: KoSentenceT5/apex/amp/__init__.py
================================================
from .amp import init, half_function, float_function, promote_function,\
register_half_function, register_float_function, register_promote_function
from .handle import scale_loss, disable_casts
from .frontend import initialize, state_dict, load_state_dict
from ._amp_state import master_params, _amp_state
================================================
FILE: KoSentenceT5/apex/amp/__version__.py
================================================
VERSION = (0, 1, 0)
__version__ = '.'.join(map(str, VERSION))
================================================
FILE: KoSentenceT5/apex/amp/_amp_state.py
================================================
# This is a "header object" that allows different amp modules to communicate.
# I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like.
# But apparently it's ok:
# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
import os
import torch
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
from torch._six import container_abcs
else:
import collections.abc as container_abcs
class AmpState(object):
def __init__(self):
self.hard_override=False
self.allow_incoming_model_not_fp32 = False
self.verbosity=1
# Attribute stash. Could also just stash things as global module attributes.
_amp_state = AmpState()
def warn_or_err(msg):
if _amp_state.hard_override:
print("Warning: " + msg)
else:
raise RuntimeError(msg)
# I'm not sure if allowing hard_override is a good idea.
# + " If you're sure you know what you're doing, supply " +
# "hard_override=True to amp.initialize.")
def maybe_print(msg, rank0=False):
distributed = torch.distributed.is_available() and \
torch.distributed.is_initialized() and \
torch.distributed.get_world_size() > 1
if _amp_state.verbosity > 0:
if rank0:
if distributed:
if torch.distributed.get_rank() == 0:
print(msg)
else:
print(msg)
else:
print(msg)
# def iter_params(param_groups):
# for group in param_groups:
# for p in group['params']:
# yield p
def master_params(optimizer):
"""
Generator expression that iterates over the params owned by ``optimizer``.
Args:
optimizer: An optimizer previously returned from ``amp.initialize``.
"""
for group in optimizer.param_groups:
for p in group['params']:
yield p
================================================
FILE: KoSentenceT5/apex/amp/_initialize.py
================================================
import torch
from torch._six import string_classes
import functools
import numpy as np
import sys
from types import MethodType
import warnings
from ._amp_state import _amp_state, warn_or_err, container_abcs
from .handle import disable_casts
from .scaler import LossScaler
from ._process_optimizer import _process_optimizer
from apex.fp16_utils import convert_network
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..contrib.optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
if torch.distributed.is_available():
from ..parallel import DistributedDataParallel as apex_DDP
from ..parallel.LARC import LARC
def to_type(dtype, t):
if isinstance(t, torch.Tensor):
if not t.is_cuda:
# This should not be a hard error, since it may be legitimate.
warnings.warn("An input tensor was not cuda.")
# GANs require this.
# if t.requires_grad:
# warn_or_err("input data requires grad. Since input data is not a model parameter,\n"
# "its gradients will not be properly allreduced by DDP.")
if t.is_floating_point():
return t.to(dtype)
return t
else:
# Trust the user's custom batch type, that's all I can do here.
return t.to(dtype)
# Modified from torch.optim.optimizer.py. This is a bit more general than casted_args in utils.py.
def applier(value, fn):
if isinstance(value, torch.Tensor):
return fn(value)
elif isinstance(value, string_classes):
return value
elif isinstance(value, np.ndarray):
return value
elif hasattr(value, "to"): # Allow handling of custom batch classes
return fn(value)
elif isinstance(value, container_abcs.Mapping):
return {applier(k, fn) : applier(v, fn) for k, v in value.items()}
elif isinstance(value, container_abcs.Iterable):
return type(value)(applier(v, fn) for v in value)
else:
# Do I want this to fire off even if someone chooses to pass something ordinary like
# an int or float? May be more annoying than it's worth.
# print("Warning: unrecognized type in applier. If your input data is a custom class, "
# "provide it with a .to(dtype) method which converts its floating-point Tensors to dtype. "
# "Amp will check for your custom to() and invoke it to cast the batch's "
# "floating-point Tensors to the appropriate type. "
# "Also, if your data is a custom class, it is your responsibility to ensure that "
# "any Tensors you want to be cuda are already cuda."
return value
def check_models(models):
for model in models:
parallel_type = None
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
parallel_type = "torch.nn.parallel.DistributedDataParallel"
if ('apex_DDP' in sys.modules) and isinstance(model, apex_DDP):
parallel_type = "apex.parallel.DistributedDataParallel"
if isinstance(model, torch.nn.parallel.DataParallel):
parallel_type = "torch.nn.parallel.DataParallel"
if parallel_type is not None:
raise RuntimeError("Incoming model is an instance of {}. ".format(parallel_type) +
"Parallel wrappers should only be applied to the model(s) AFTER \n"
"the model(s) have been returned from amp.initialize.")
def check_params_fp32(models):
for model in models:
for name, param in model.named_parameters():
if param.is_floating_point():
if 'Half' in param.type():
warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you do not need to call .half() on your model\n"
"before passing it, no matter what optimization level you choose.".format(
name, param.type()))
elif not param.is_cuda:
warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you need to provide a model with parameters\n"
"located on a CUDA device before passing it no matter what optimization level\n"
"you chose. Use model.to('cuda') to use the default device.".format(
name, param.type()))
# Backward compatibility for PyTorch 0.4
if hasattr(model, 'named_buffers'):
buf_iter = model.named_buffers()
else:
buf_iter = model._buffers
for obj in buf_iter:
if type(obj)==tuple:
name, buf = obj
else:
name, buf = obj, buf_iter[obj]
if buf.is_floating_point():
if 'Half' in buf.type():
warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you do not need to call .half() on your model\n"
"before passing it, no matter what optimization level you choose.".format(
name, buf.type()))
elif not buf.is_cuda:
warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you need to provide a model with buffers\n"
"located on a CUDA device before passing it no matter what optimization level\n"
"you chose. Use model.to('cuda') to use the default device.".format(
name, buf.type()))
def check_optimizers(optimizers):
for optim in optimizers:
bad_optim_type = None
if isinstance(optim, FP16_Optimizer_general):
bad_optim_type = "apex.fp16_utils.FP16_Optimizer"
if isinstance(optim, FP16_Optimizer_for_fused):
bad_optim_type = "apex.optimizers.FP16_Optimizer"
if bad_optim_type is not None:
raise RuntimeError("An incoming optimizer is an instance of {}. ".format(bad_optim_type) +
"The optimizer(s) passed to amp.initialize() must be bare \n"
"instances of either ordinary Pytorch optimizers, or Apex fused \n"
"optimizers.\n")
class O2StateDictHook(object):
def __init__(self, fn):
self.fn = fn
def __call__(self, module, state_dict, prefix, local_metadata):
for key in state_dict:
param = state_dict[key]
if 'Half' in param.type():
param = param.to(torch.float32)
state_dict[key] = param
def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
from .amp import init as amp_init
optimizers_was_list = False
if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
optimizers = [optimizers]
elif optimizers is None:
optimizers = []
elif isinstance(optimizers, list):
optimizers_was_list = True
check_optimizers(optimizers)
else:
check_optimizers([optimizers])
raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")
if isinstance(models, torch.nn.Module):
models_was_list = False
models = [models]
elif isinstance(models, list):
models_was_list = True
else:
raise TypeError("models must be either a single model or a list of models.")
check_models(models)
if not _amp_state.allow_incoming_model_not_fp32:
check_params_fp32(models)
# In the future, when FP16_Optimizer can be deprecated and master weights can
# become an attribute, remember to stash master weights before casting the model.
if properties.cast_model_type:
if properties.keep_batchnorm_fp32:
for model in models:
convert_network(model, properties.cast_model_type)
else:
for model in models:
model.to(properties.cast_model_type)
input_caster = functools.partial(to_type, properties.cast_model_type)
if cast_model_outputs is not None:
output_caster = functools.partial(to_type, cast_model_outputs)
else:
output_caster = functools.partial(to_type, torch.float32)
for model in models:
# Patch the forward method to cast incoming data to the correct type, and
# outgoing data to float32, so "the user never needs to call .half()."
# I like writing things explicitly more than decorators.
def patch_forward(old_fwd):
def new_fwd(*args, **kwargs):
output = old_fwd(*applier(args, input_caster),
**applier(kwargs, input_caster))
return applier(output, output_caster)
return new_fwd
model.forward = patch_forward(model.forward)
# State dict trick to recast any preexisting per-param state tensors
for optimizer in optimizers:
optimizer.load_state_dict(optimizer.state_dict())
# patch model.state_dict() to return float32 params
for model in models:
for module in model.modules():
module._register_state_dict_hook(O2StateDictHook(functools.partial(to_type, torch.float32)))
elif cast_model_outputs is not None:
output_caster = functools.partial(to_type, cast_model_outputs)
for model in models:
def patch_forward(old_fwd):
def new_fwd(*args, **kwargs):
output = old_fwd(*args, **kwargs)
return applier(output, output_caster)
return new_fwd
model.forward = patch_forward(model.forward)
for i, optimizer in enumerate(optimizers):
optimizers[i] = _process_optimizer(optimizer, properties)
_amp_state.loss_scalers = []
for _ in range(num_losses):
_amp_state.loss_scalers.append(LossScaler(properties.loss_scale,
min_loss_scale=_amp_state.min_loss_scale,
max_loss_scale=_amp_state.max_loss_scale))
if properties.patch_torch_functions:
# handle is unused here. It's accessible later through a global value anyway.
handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
for optimizer in optimizers:
# Disable Amp casting for the optimizer step, because it should only be
# applied to FP32 master params anyway.
def patch_step(old_step):
def new_step(self, *args, **kwargs):
with disable_casts():
output = old_step(*args, **kwargs)
return output
return new_step
optimizer.step = MethodType(patch_step(optimizer.step), optimizer)
if optimizers_was_list:
if models_was_list:
return models, optimizers
else:
return models[0], optimizers
else:
if models_was_list:
if len(optimizers) == 0:
return models
else:
return models, optimizers[0]
else:
if len(optimizers) == 0:
return models[0]
else:
return models[0], optimizers[0]
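def _patch_forward_sketch():
    # Illustrative sketch, never called by the library: replays the input/output
    # casting that _initialize() installs under cast_model_type (O2/O3) on a tiny
    # elementwise CPU module, so it stays runnable without a GPU. All names below
    # are hypothetical.
    class _Scale(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.weight = torch.nn.Parameter(torch.ones(4))
        def forward(self, x):
            return x * self.weight
    model = _Scale().half()                                    # fp16 "model"
    input_caster = functools.partial(to_type, torch.float16)
    output_caster = functools.partial(to_type, torch.float32)
    old_fwd = model.forward
    def new_fwd(*args, **kwargs):
        output = old_fwd(*applier(args, input_caster),
                         **applier(kwargs, input_caster))
        return applier(output, output_caster)
    model.forward = new_fwd
    out = model(torch.randn(2, 4))  # fp32 input is cast to fp16 inside the model...
    return out.dtype                # ...and the output comes back as torch.float32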
================================================
FILE: KoSentenceT5/apex/amp/_process_optimizer.py
================================================
import types
from ..fp16_utils import master_params_to_model_params
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import maybe_print
import torch
from ..optimizers import FusedSGD
class AmpOptimizerState(object):
def __init__(self):
pass
def _master_params_to_model_params(self):
stash = self._amp_stash
if multi_tensor_applier.available:
if len(stash.all_fp16_params) > 0:
multi_tensor_applier(
stash.multi_tensor_scale,
stash.dummy_overflow_buf,
[stash.all_fp32_from_fp16_params, stash.all_fp16_params],
1.0)
else:
for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
master_params_to_model_params(fp16_group, fp32_from_fp16_group)
def lazy_init_with_master_weights(self):
stash = self._amp_stash
stash.fp16_groups = []
stash.fp32_from_fp16_groups = []
stash.fp32_from_fp32_groups = []
for i, param_group in enumerate(self.param_groups):
# maybe_print("FP16_Optimizer processing param group {}:".format(i))
fp16_params_this_group = []
fp32_params_this_group = []
fp32_from_fp16_params_this_group = []
for i, param in enumerate(param_group['params']):
if param.requires_grad:
if param.type() == 'torch.cuda.HalfTensor':
# maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
# .format(param.size()))
fp16_params_this_group.append(param)
master_param = param.detach().clone().float()
master_param.requires_grad = True
param_group['params'][i] = master_param
fp32_from_fp16_params_this_group.append(master_param)
# Reset existing state dict key to the new master param.
# We still need to recast per-param state tensors, if any, to FP32.
if param in self.state:
self.state[master_param] = self.state.pop(param)
elif param.type() == 'torch.cuda.FloatTensor':
# maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
# .format(param.size()))
fp32_params_this_group.append(param)
param_group['params'][i] = param
else:
raise TypeError("Optimizer's parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
stash.fp16_groups.append(fp16_params_this_group)
stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
stash.fp32_from_fp32_groups.append(fp32_params_this_group)
stash.all_fp16_params = []
for group in stash.fp16_groups:
stash.all_fp16_params += group
stash.all_fp32_from_fp16_params = []
for group in stash.fp32_from_fp16_groups:
stash.all_fp32_from_fp16_params += group
stash.all_fp32_from_fp32_params = []
for group in stash.fp32_from_fp32_groups:
stash.all_fp32_from_fp32_params += group
# all_fp16_grad_stash is only needed for fused optimizers.
stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
# stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
for param in stash.all_fp32_from_fp16_params:
param.grad = None
for param in stash.all_fp32_from_fp32_params:
param.grad = None
# Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
self.load_state_dict(self.state_dict())
def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None):
grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0
# not much to do if scale == 1.0 and static scaling
if scaler.loss_scale() == 1.0 and not scaler.dynamic:
# Clear the stash.
for i in range(len(stashed_grads)):
stashed_grads[i] = None
return
if scale_override is not None:
grads_have_scale, stashed_have_scale, out_scale = scale_override
# This is a lot of python overhead...
grads_needing_unscale = []
grads_needing_unscale_with_stash = []
stashed = []
for param, stashed_grad in zip(params, stashed_grads):
if param.grad is None and stashed_grad is not None:
param.grad = stashed_grad
elif param.grad is not None and stashed_grad is None:
grads_needing_unscale.append(param.grad)
elif param.grad is not None and stashed_grad is not None:
grads_needing_unscale_with_stash.append(param.grad)
stashed.append(stashed_grad)
else: # param.grad is None and stashed_grad is None
continue
# unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale.
if len(grads_needing_unscale) > 0:
scaler.unscale(
grads_needing_unscale,
grads_needing_unscale,
None, # unused_scale, currently present to avoid API breakage elsewhere
models_are_masters=True,
scale_override=grads_have_scale/out_scale)
if len(grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
grads_needing_unscale_with_stash,
stashed,
grads_needing_unscale_with_stash,
scale_override=(grads_have_scale, stashed_have_scale, out_scale))
# Clear the stash.
for i in range(len(stashed_grads)):
stashed_grads[i] = None
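def _unscale_arithmetic_sketch():
    # Illustrative sketch, never called by the library: the arithmetic behind
    # scaler.unscale(), i.e. grads * (1/scale), spelled out for one CPU tensor.
    # The loss_scale value is hypothetical.
    loss_scale = 128.0
    true_grad = torch.full((3,), 0.5)
    scaled_grad = true_grad * loss_scale           # what backward() produces under scaling
    recovered = scaled_grad * (1.0 / loss_scale)   # unscale back to the true gradient
    return torch.allclose(recovered, true_grad)    # True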
def prepare_backward_with_master_weights(self):
stash = self._amp_stash
self._amp_lazy_init()
for i, param in enumerate(stash.all_fp16_params):
# Set up to leverage grad copy elision.
# This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused.
param.grad = None
# for i, param in enumerate(stash.all_fp32_from_fp16_params):
# stash.all_fp32_from_fp16_grad_stash[i] = param.grad
for i, param in enumerate(stash.all_fp32_from_fp32_params):
stash.all_fp32_from_fp32_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
def post_backward_with_master_weights(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
# This is a lot of python overhead...
fp16_grads_needing_unscale = []
new_fp32_grads = []
fp16_grads_needing_unscale_with_stash = []
preexisting_fp32_grads = []
for fp16_param, fp32_param in zip(stash.all_fp16_params,
stash.all_fp32_from_fp16_params):
if fp16_param.grad is None and fp32_param.grad is not None:
continue
elif fp16_param.grad is not None and fp32_param.grad is None:
fp32_param.grad = torch.empty_like(fp32_param)
fp16_grads_needing_unscale.append(fp16_param.grad)
new_fp32_grads.append(fp32_param.grad)
elif fp16_param.grad is not None and fp32_param.grad is not None:
fp16_grads_needing_unscale_with_stash.append(fp16_param.grad)
preexisting_fp32_grads.append(fp32_param.grad)
else: # fp16_param.grad is None and fp32_param.grad is None:
continue
if len(fp16_grads_needing_unscale) > 0:
scaler.unscale(
fp16_grads_needing_unscale,
new_fp32_grads,
scaler.loss_scale(),
models_are_masters=False)
if len(fp16_grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
fp16_grads_needing_unscale_with_stash,
preexisting_fp32_grads,
preexisting_fp32_grads)
# fp32 params can be treated as they would be in the "no_master_weights" case.
post_backward_models_are_masters(
scaler,
stash.all_fp32_from_fp32_params,
stash.all_fp32_from_fp32_grad_stash)
def lazy_init_no_master_weights(self):
stash = self._amp_stash
stash.all_fp16_params = []
stash.all_fp32_params = []
for i, param_group in enumerate(self.param_groups):
for i, param in enumerate(param_group['params']):
if param.type() == 'torch.cuda.HalfTensor':
stash.all_fp16_params.append(param)
elif param.type() == 'torch.cuda.FloatTensor':
stash.all_fp32_params.append(param)
else:
raise TypeError("Optimizer's parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params]
def prepare_backward_no_master_weights(self):
stash = self._amp_stash
self._amp_lazy_init()
for i, param in enumerate(stash.all_fp16_params):
stash.all_fp16_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
for i, param in enumerate(stash.all_fp32_params):
stash.all_fp32_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
def post_backward_no_master_weights(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
(stash.all_fp32_params, stash.all_fp32_grad_stash))
for params, stashed_grads in split_types:
post_backward_models_are_masters(scaler, params, stashed_grads)
#####################################################################################
# FusedSGD versions
#####################################################################################
# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
# outside the kernel, so we must accumulate directly into the model grads.
def prepare_backward_with_master_weights_FusedSGD(self):
if self.materialize_master_grads:
prepare_backward_with_master_weights(self)
else:
stash = self._amp_stash
self._amp_lazy_init()
for i, param in enumerate(stash.all_fp16_params):
stash.all_fp16_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
for i, param in enumerate(stash.all_fp32_from_fp32_params):
stash.all_fp32_from_fp32_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
def post_backward_with_master_weights_FusedSGD(self, scaler):
if self.materialize_master_grads:
post_backward_with_master_weights(self, scaler)
else:
stash = self._amp_stash
self._amp_lazy_init()
grads_have_scale = scaler.loss_scale()
stashed_have_scale = self.most_recent_scale
out_scale = grads_have_scale
if self.scale_set_by_backward:
out_scale = min(grads_have_scale, self.most_recent_scale)
split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
(stash.all_fp32_from_fp32_params, stash.all_fp32_from_fp32_grad_stash))
# unscale_with_stashed() implements grads*1/scale + stashed_grads*1.
# stashed_grads are scaled by self.most_recent_scale.
for params, stashed_grads in split_types:
post_backward_models_are_masters(scaler, params, stashed_grads,
(grads_have_scale, stashed_have_scale, out_scale))
self.most_recent_scale = out_scale
self.scale_set_by_backward = True
def prepare_backward_no_master_weights_FusedSGD(self):
prepare_backward_no_master_weights(self)
def post_backward_no_master_weights_FusedSGD(self, scaler):
post_backward_no_master_weights(self, scaler)
def _amp_lazy_init(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
def _process_optimizer(optimizer, properties):
if hasattr(optimizer, "_amp_stash"):
raise RuntimeError("A given optimizer should only be passed through amp.initialize once.")
else:
optimizer._amp_stash = AmpOptimizerState()
optimizer._amp_stash.lazy_init_called = False
optimizer._amp_stash.already_patched = False
optimizer._amp_stash.params_have_scaled_gradients = False
for name in ("_lazy_init_maybe_master_weights",
"_master_params_to_model_params",
"_prepare_amp_backward",
"_post_amp_backward",
"_amp_lazy_init"):
if hasattr(optimizer, name):
raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
# TODO: Centralize exposure and import error checking for the C backend.
if multi_tensor_applier.available:
import amp_C
optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
        optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0])
if properties.master_weights:
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_with_master_weights, optimizer)
optimizer._master_params_to_model_params = types.MethodType(
_master_params_to_model_params, optimizer)
old_step = optimizer.step
def new_step(self, closure=None):
if closure is not None:
raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
retval = old_step()
if not isinstance(self, FusedSGD):
self._master_params_to_model_params()
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in self._amp_stash.all_fp32_from_fp16_params:
param.grad = None
return retval
optimizer.step = types.MethodType(new_step, optimizer)
old_zero_grad = optimizer.zero_grad
def new_zero_grad(self):
stash = self._amp_stash
self._amp_lazy_init()
# Zero the model grads.
for param in stash.all_fp16_params:
if param.grad is not None:
param.grad.detach_()
param.grad.zero_()
for param in stash.all_fp32_from_fp32_params:
if param.grad is not None:
param.grad.detach_()
param.grad.zero_()
# Clear the master grads that are independent of model grads
for param in self._amp_stash.all_fp32_from_fp16_params:
param.grad = None
optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
if isinstance(optimizer, FusedSGD):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights_FusedSGD, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_with_master_weights_FusedSGD, optimizer)
else:
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_with_master_weights, optimizer)
else:
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_no_master_weights, optimizer)
if isinstance(optimizer, FusedSGD):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights_FusedSGD, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_no_master_weights_FusedSGD, optimizer)
else:
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_no_master_weights, optimizer)
optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)
old_add_param_group = optimizer.add_param_group
def new_add_param_group(self, new_group):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
assert isinstance(new_group, dict), "param group must be a dict"
new_params = new_group['params']
if isinstance(new_params, torch.Tensor):
new_group['params'] = [new_params]
elif isinstance(new_params, set):
raise TypeError('optimizer parameters need to be organized in ordered collections, but '
'the ordering of tensors in sets will change between runs. Please use a list instead.')
else:
new_group['params'] = list(new_params)
if properties.master_weights:
# Mutate new_group in-place to use FP32 master params
fp16_params_this_group = []
fp32_params_this_group = []
fp32_from_fp16_params_this_group = []
for i, param in enumerate(new_group['params']):
if param.requires_grad:
if param.type() == 'torch.cuda.HalfTensor':
fp16_params_this_group.append(param)
master_param = param.detach().clone().float()
master_param.requires_grad = True
new_group['params'][i] = master_param
fp32_from_fp16_params_this_group.append(master_param)
elif param.type() == 'torch.cuda.FloatTensor':
fp32_params_this_group.append(param)
new_group['params'][i] = param
else:
raise TypeError("Optimizer's parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
stash.fp16_groups.append(fp16_params_this_group)
stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
stash.fp32_from_fp32_groups.append(fp32_params_this_group)
stash.all_fp16_params += fp16_params_this_group
stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group
stash.all_fp32_from_fp32_params += fp32_params_this_group
# stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group]
# It should be ok to let params be added with existing .grad attributes.
# for param in fp16_params_this_group:
# param.grad = None
# for param in fp32_from_fp16_params_this_group:
# param.grad = None
# for param in stash.fp32_params_this_group:
# param.grad = None
else:
for param in new_group['params']:
if param.type() == 'torch.cuda.HalfTensor':
stash.all_fp16_params.append(param)
stash.all_fp16_grad_stash.append(None)
elif param.type() == 'torch.cuda.FloatTensor':
stash.all_fp32_params.append(param)
stash.all_fp32_grad_stash.append(None)
else:
raise TypeError("Optimizer's parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
old_add_param_group(new_group)
optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer)
return optimizer
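def _master_weights_roundtrip_sketch():
    # Illustrative sketch, never called by the library: the FP32 "master weights"
    # round trip that lazy_init_with_master_weights() and the patched step() above
    # implement, written out by hand for a single CPU tensor. Names are hypothetical.
    model_param = torch.randn(4).half()                    # fp16 model weight
    master_param = model_param.detach().clone().float()    # fp32 master copy
    master_param.requires_grad = True
    # Pretend backward() produced an fp16 gradient on the model weight; Amp
    # copies/casts it onto the fp32 master before the optimizer step.
    master_param.grad = torch.full_like(model_param, 0.5).float()
    torch.optim.SGD([master_param], lr=0.1).step()         # update the fp32 master
    with torch.no_grad():
        # Mirror _master_params_to_model_params(): copy masters back into fp16 weights.
        model_param.copy_(master_param.detach().half())
    return model_param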
================================================
FILE: KoSentenceT5/apex/amp/amp.py
================================================
from . import compat, rnn_compat, utils, wrap
from .handle import AmpHandle, NoOpHandle
from .lists import functional_overrides, torch_overrides, tensor_overrides
from ._amp_state import _amp_state
from .frontend import *
import functools
import itertools
import torch
_DECORATOR_HANDLE = None
_USER_CAST_REGISTRY = set()
_USER_PROMOTE_REGISTRY = set()
def _decorator_helper(orig_fn, cast_fn, wrap_fn):
def wrapper(*args, **kwargs):
handle = _DECORATOR_HANDLE
if handle is None or not handle.is_active():
return orig_fn(*args, **kwargs)
inner_cast_fn = utils.verbosify(cast_fn, orig_fn.__name__,
handle.verbose)
return wrap_fn(orig_fn, inner_cast_fn, handle)(*args, **kwargs)
return wrapper
# Decorator form
def half_function(fn):
wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=True)
return _decorator_helper(fn, utils.maybe_half, wrap_fn)
def float_function(fn):
wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=False)
return _decorator_helper(fn, utils.maybe_float, wrap_fn)
def promote_function(fn):
wrap_fn = functools.partial(wrap.make_promote_wrapper)
return _decorator_helper(fn, utils.maybe_float, wrap_fn)
# Registry form
def register_half_function(module, name):
if not hasattr(module, name):
raise ValueError('No function named {} in module {}.'.format(
name, module))
_USER_CAST_REGISTRY.add((module, name, utils.maybe_half))
def register_float_function(module, name):
if not hasattr(module, name):
raise ValueError('No function named {} in module {}.'.format(
name, module))
_USER_CAST_REGISTRY.add((module, name, utils.maybe_float))
def register_promote_function(module, name):
if not hasattr(module, name):
raise ValueError('No function named {} in module {}.'.format(
name, module))
_USER_PROMOTE_REGISTRY.add((module, name))
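def _registry_sketch():
    # Illustrative sketch, never called by the library: force torch.sigmoid to
    # always run in FP32 under amp. Registrations take effect the next time
    # init() below installs its hooks.
    register_float_function(torch, 'sigmoid')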
# Top-level function to insert _all_ the hooks.
def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False):
global _DECORATOR_HANDLE
if not enabled:
handle = NoOpHandle()
_DECORATOR_HANDLE = handle
return handle
handle = AmpHandle(loss_scale, enable_caching, verbose)
# 0) Force-{fp16, fp32} for user-annotated functions
for mod, fn, cast_fn in _USER_CAST_REGISTRY:
try_caching = (cast_fn == utils.maybe_half)
wrap.cached_cast(mod, fn, cast_fn, handle,
try_caching, verbose)
_USER_CAST_REGISTRY.clear()
# 0.5) Force-promote for user-annotated functions
for mod, fn in _USER_PROMOTE_REGISTRY:
wrap.promote(mod, fn, handle, verbose)
_USER_PROMOTE_REGISTRY.clear()
# 1) Force-{fp16, fp32} on white- / black-list functions
override_modules = [functional_overrides,
torch_overrides,
tensor_overrides]
cast_table = [('FP16_FUNCS', utils.maybe_half),
('FP32_FUNCS', utils.maybe_float)]
for module, (list_name, cast_fn) in itertools.product(override_modules,
cast_table):
for fn in getattr(module, list_name):
try_caching = (cast_fn == utils.maybe_half)
wrap.cached_cast(module.MODULE, fn, cast_fn, handle,
try_caching, verbose)
# 1.5) Pre-0.4, put the blacklist methods on HalfTensor and whitelist
# methods on FloatTensor, since they're distinct types.
if compat.tensor_is_float_tensor():
for fn in tensor_overrides.FP16_FUNCS:
wrap.cached_cast(torch.cuda.FloatTensor, fn, utils.maybe_half,
handle, try_caching=True, verbose=verbose)
for fn in tensor_overrides.FP32_FUNCS:
wrap.cached_cast(torch.cuda.HalfTensor, fn, utils.maybe_float,
handle, try_caching=False, verbose=verbose)
# 2) Enable type-promotion on multi-arg functions and methods.
# NB: special handling for sequence fns (e.g. `torch.cat`).
promote_modules = [torch_overrides, tensor_overrides]
promote_table = [('CASTS', wrap.promote),
('SEQUENCE_CASTS', wrap.sequence_promote)]
for promote_mod, (list_name, promote_fn) in itertools.product(promote_modules,
promote_table):
for fn in getattr(promote_mod, list_name):
promote_fn(promote_mod.MODULE, fn, handle, verbose)
# 2.5) Pre-0.4, add blacklist methods directly to HalfTensor and FloatTensor types
if compat.tensor_is_float_tensor():
for cls, (list_name, promote_fn) in itertools.product([torch.cuda.FloatTensor,
torch.cuda.HalfTensor],
promote_table):
for fn in getattr(tensor_overrides, list_name):
promote_fn(cls, fn, handle, verbose)
# 3) For any in-place version of a blacklist function, error if any input is fp16.
# NB: this is overly conservative.
for fn in utils.as_inplace(torch_overrides.FP32_FUNCS):
wrap.err_if_any_half(torch_overrides.MODULE, fn, handle)
# 3.5) For any in-place blacklist method, error if called on fp16 tensor
for fn in utils.as_inplace(tensor_overrides.FP32_FUNCS):
wrap.err_if_arg0_half(tensor_overrides.MODULE, fn, handle, verbose)
if compat.tensor_is_float_tensor():
wrap.err_if_arg0_half(torch.cuda.HalfTensor, fn, handle, verbose)
# 4) For other in-place methods, match the type of self tensor
for fn in utils.as_inplace(itertools.chain(
tensor_overrides.FP16_FUNCS,
tensor_overrides.CASTS)):
wrap.promote_match_arg0(tensor_overrides.MODULE, fn, handle, verbose)
if compat.tensor_is_float_tensor():
wrap.promote_match_arg0(torch.cuda.HalfTensor, fn, handle, verbose)
wrap.promote_match_arg0(torch.cuda.FloatTensor, fn, handle, verbose)
# 5) RNNs + RNN cells are whitelisted specially
if rnn_compat.has_old_rnns():
wrap.rnn_cast(torch.nn.backends.thnn.backend, 'RNN', handle, verbose)
if not rnn_compat.has_old_rnns():
# Patch in our own indirection of `_VF` in modules/rnn s.t. it is mutable.
torch.nn.modules.rnn._VF = rnn_compat.VariableFunctionsShim()
# Wrap all the rnns
for x in rnn_compat.RNN_NAMES:
wrap.new_rnn_cast(x.upper(), handle, verbose)
# Wrap all the RNN cells
rnn_compat.whitelist_rnn_cells(handle, verbose)
# 6) Place error+print message on banned functions.
# Or, if allow_banned, then cast to FP32.
for fn, err_msg in functional_overrides.BANNED_FUNCS:
if allow_banned:
wrap.cached_cast(functional_overrides.MODULE, fn, utils.maybe_float,
handle, try_caching=True, verbose=verbose)
else:
wrap.err_if_any_half(functional_overrides.MODULE, fn, handle, err_msg)
_DECORATOR_HANDLE = handle
_amp_state.handle = handle
return handle
================================================
FILE: KoSentenceT5/apex/amp/compat.py
================================================
import torch
# True for post-0.4, when Variables/Tensors merged.
def variable_is_tensor():
v = torch.autograd.Variable()
return isinstance(v, torch.Tensor)
def tensor_is_variable():
x = torch.Tensor()
return type(x) == torch.autograd.Variable
# False for post-0.4
def tensor_is_float_tensor():
x = torch.Tensor()
return type(x) == torch.FloatTensor
# Akin to `torch.is_tensor`, but returns True for Variable
# objects in pre-0.4.
def is_tensor_like(x):
return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable)
# Wraps `torch.is_floating_point` if present, otherwise checks
# the suffix of `x.type()`.
def is_floating_point(x):
if hasattr(torch, 'is_floating_point'):
return torch.is_floating_point(x)
try:
torch_type = x.type()
return torch_type.endswith('FloatTensor') or \
torch_type.endswith('HalfTensor') or \
torch_type.endswith('DoubleTensor')
except AttributeError:
return False
def scalar_python_val(x):
if hasattr(x, 'item'):
return x.item()
else:
if isinstance(x, torch.autograd.Variable):
return x.data[0]
else:
return x[0]
# Accounts for the possibility that some ops may be removed from a namespace.
def filter_attrs(module, attrs):
return list(attrname for attrname in attrs if hasattr(module, attrname))
================================================
FILE: KoSentenceT5/apex/amp/frontend.py
================================================
import torch
from ._initialize import _initialize
from ._amp_state import _amp_state, warn_or_err, maybe_print
from collections import OrderedDict
class Properties(object):
"""
This class has two purposes: to establish a set of default properties,
and to route setting of these attributes through __setattr__ so that (in theory)
they can be checked for consistency with other existing args.
"""
def __init__(self):
self.options = {
"enabled" : False,
"opt_level" : None,
"cast_model_type" : None,
"patch_torch_functions" : False,
"keep_batchnorm_fp32" : None,
"master_weights" : None,
"loss_scale" : 1.0,
# Reserved for future functionality
# "fused_optimizer" : False,
# "enable_ddp_interop" : False,
}
"""
This function allows updating several options at a time without routing through
__setattr__ checks, to avoid "you can't get there from here" scenarios.
Currently not intended to be exposed; users are expected to select an opt_level
and apply consistent modifications.
"""
def _update_options_dict(self, new_options):
for k, v in new_options:
if k in self.options:
self.options[k] = v
else:
raise ValueError("Tried to set unexpected option {}".format(k))
"""
The members of "options" are not direct attributes of self, so access attempts
will roll down to __getattr__. This borrows from the logic in torch.nn.Module.
"""
def __getattr__(self, name):
if "options" in self.__dict__:
options = self.__dict__["options"]
if name in options:
return options[name]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, name))
def __setattr__(self, name, value):
if "options" in self.__dict__:
if name in self.options:
# print("setting {} {}".format(name, value))
if name == "cast_model_type":
if self.opt_level == "O1" and value is not None:
if value is not False:
if value is not torch.float32:
warn_or_err("O1 inserts casts around Torch functions rather than "
"model weights, so with O1, the model weights themselves "
"should remain FP32. If you wish to cast the model to a "
"different type, use opt_level='O2' or 'O3'. " +
"cast_model_type was {}".format(value))
self.options[name] = value
elif name == "patch_torch_functions":
if self.opt_level != "O1" and value:
warn_or_err("Currently, patch_torch_functions=True should only be set by "
"selecting opt_level='O1'.")
self.options[name] = value
elif name == "keep_batchnorm_fp32":
if self.opt_level == "O1" and value is not None:
warn_or_err("With opt_level O1, batchnorm functions are automatically patched "
"to run in FP32, so keep_batchnorm_fp32 should be None." +
" keep_batchnorm_fp32 was {}".format(value))
if value == "False":
self.options[name] = False
elif value == "True":
self.options[name] = True
else:
assert (value is True or value is False or value is None),\
"keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\
"or None, found keep_batchnorm_fp32={}".format(value)
self.options[name] = value
elif name == "master_weights":
if self.opt_level == "O1" and value is not None:
warn_or_err("It doesn't make sense to use master_weights with O1. "
"With O1, your model weights themselves should be FP32.")
self.options[name] = value
elif name == "loss_scale":
if value == "dynamic":
self.options[name] = value
else:
self.options[name] = float(value)
else:
self.options[name] = value
else:
super(Properties, self).__setattr__(name, value)
""" O0-O3 are convenience wrappers to establish defaults for typically used mixed precision options. """
class O3:
brief = "O3: Pure FP16 training."
more = "Calls .half() on your model, converting the entire model to FP16.\n"\
"A casting operation is also inserted to cast incoming Tensors to FP16,\n"\
"so you don't need to change your data pipeline.\n"\
"This mode is useful for establishing a performance ceiling.\n"\
"It's also possible training may 'just work' in this mode.\n"\
"If not, try other optimization levels."
def __call__(self, properties):
properties.enabled = True
properties.opt_level = "O3"
properties.cast_model_type = torch.float16
properties.patch_torch_functions = False
properties.keep_batchnorm_fp32 = False
properties.master_weights = False
properties.loss_scale = 1.0
# properties.fused_optimizer = False
# properties.enable_ddp_interop = False
return properties # modified in place so this isn't really necessary
class O2:
brief = "O2: FP16 training with FP32 batchnorm and FP32 master weights.\n"
more = "Calls .half() on your model, converting the entire model (except for batchnorms)\n"\
"to FP16. Batchnorms are retained in FP32 for additional stability.\n"\
"The forward pass is patched to cast incoming Tensors to FP16, so you don't need to change\n"\
"your data pipeline.\n"\
"O2 creates FP32 master weights outside the model and patches any optimizers to update\n"\
"these master weights, then copy the master weights into the FP16 model weights.\n"\
"Master weights can also improve convergence and stability."
def __call__(self, properties):
properties.enabled = True
properties.opt_level = "O2"
properties.cast_model_type = torch.float16
properties.patch_torch_functions = False
properties.keep_batchnorm_fp32 = True
properties.master_weights = True
properties.loss_scale = "dynamic"
# properties.fused_optimizer = False
# properties.enable_ddp_interop = False
return properties # modified in place so this isn't really necessary
class O1:
brief = "O1: Insert automatic casts around Pytorch functions and Tensor methods.\n"
more = "The type of your model's weights is not altered. However, internally,\n"\
"Pytorch functions are patched to cast any Tensor Core-friendly ops to FP16 for speed,\n"\
"while operations that might benefit from the additional stability of FP32 are patched\n"\
"to cast their inputs to fp32.\n"\
"O1 is the safest way to try mixed precision training, and is recommended when\n"\
"trying mixed precision training for the first time."
def __call__(self, properties):
properties.enabled = True
properties.opt_level = "O1"
properties.cast_model_type = None
properties.patch_torch_functions = True
properties.keep_batchnorm_fp32 = None
properties.master_weights = None
properties.loss_scale = "dynamic"
# properties.fused_optimizer = False
# properties.enable_ddp_interop = False
return properties # modified in place so this isn't really necessary
class O0:
brief = "O0: Pure FP32 training.\n"
more = "Your models are checked to make sure parameters are FP32, but otherwise the\n"\
"types of weights and internal Pytorch operations are not altered. This mode disables any\n"\
"FP16 arithmetic, although other optimizations like DDP interop may still be requested.\n"
def __call__(self, properties):
properties.enabled = True
properties.opt_level = "O0"
properties.cast_model_type = torch.float32
properties.patch_torch_functions = False
properties.keep_batchnorm_fp32 = None
properties.master_weights = False
properties.loss_scale = 1.0
# properties.fused_optimizer = False
# properties.enable_ddp_interop = False
return properties # modified in place so this isn't really necessary
opt_levels = {"O3": O3(),
"O2": O2(),
"O1": O1(),
"O0": O0()}
# allow user to directly pass Properties struct as well?
def initialize(
models,
optimizers=None,
enabled=True,
opt_level="O1",
cast_model_type=None,
patch_torch_functions=None,
keep_batchnorm_fp32=None,
master_weights=None,
loss_scale=None,
cast_model_outputs=None,
num_losses=1,
verbosity=1,
min_loss_scale=None,
max_loss_scale=2.**24
):
"""
Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
chosen ``opt_level`` and overridden properties, if any.
``amp.initialize`` should be called **after** you have finished
constructing your model(s) and
optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper.
See `Distributed training`_ in the Imagenet example.
Currently, ``amp.initialize`` should only be called **once**,
although it can process an arbitrary number of
models and optimizers (see the corresponding `Advanced Amp Usage topic`_).
If you think your use case requires ``amp.initialize`` to be called more than once,
`let us know`_.
Any property keyword argument that is not ``None`` will be interpreted as a manual override.
To prevent having to rewrite anything else in your script, name the returned models/optimizers
to replace the passed models/optimizers, as in the code sample below.
Args:
models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast.
optimizers (optional, torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
REQUIRED for training, optional for inference.
enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
should run as if Amp were not present.
opt_level (str, optional, default="O1"): Pure or mixed precision optimization level. Accepted values are
"O0", "O1", "O2", and "O3", explained in detail above.
cast_model_type (``torch.dtype``, optional, default=None): Optional property override, see
above.
patch_torch_functions (bool, optional, default=None): Optional property override.
keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
passed as a string, must be the string "True" or "False".
master_weights (bool, optional, default=None): Optional property override.
loss_scale (float or str, optional, default=None): Optional property override. If passed as a string,
must be a string representing a number, e.g., "128.0", or the string "dynamic".
cast_model_outputs (torch.dtype, optional, default=None): Option to ensure that the outputs
of your model(s) are always cast to a particular type regardless of ``opt_level``.
num_losses (int, optional, default=1): Option to tell Amp in advance how many losses/backward
passes you plan to use. When used in conjunction with the ``loss_id`` argument to
``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass,
which can improve stability. See "Multiple models/optimizers/losses"
under `Advanced Amp Usage`_ for examples. If ``num_losses`` is left to 1, Amp will still
support multiple losses/backward passes, but use a single global loss scale
for all of them.
verbosity (int, default=1): Set to 0 to suppress Amp-related output.
min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic
loss scaling. The default value of None means that no floor is imposed.
If dynamic loss scaling is not used, `min_loss_scale` is ignored.
max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by
dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored.
Returns:
Model(s) and optimizer(s) modified according to the ``opt_level``.
If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
also be a list.
Permissible invocations::
model, optim = amp.initialize(model, optim,...)
model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
[model1, model2], optim = amp.initialize([model1, model2], optim,...)
[model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
# This is not an exhaustive list of the cross product of options that are possible,
# just a set of examples.
model, optim = amp.initialize(model, optim, opt_level="O0")
model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O1") # uses "loss_scale="dynamic" default
model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O2") # uses "loss_scale="dynamic" default
model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")
model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")
The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.
.. _`Distributed training`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet#distributed-training
.. _`Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
.. _`Advanced Amp Usage`:
https://nvidia.github.io/apex/advanced.html
.. _`Advanced Amp Usage topic`:
https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses
.. _`let us know`:
https://github.com/NVIDIA/apex/issues
"""
_amp_state.opt_properties = Properties()
_amp_state.verbosity = verbosity
if not enabled:
if optimizers is None:
return models
else:
return models, optimizers
if not torch.backends.cudnn.enabled:
raise RuntimeError(
"Amp requires torch.backends.cudnn.enabled = True")
if opt_level not in opt_levels:
raise RuntimeError(
"Unexpected optimization level {}. ".format(opt_level) +
"Options are 'O0', 'O1', 'O2', 'O3'. Note that in `O0`, `O1`, etc., the prefix O is the letter O, " +
"not the number zero.")
else:
_amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties)
maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True)
maybe_print("Defaults for this optimization level are:", True)
for k, v in _amp_state.opt_properties.options.items():
maybe_print("{:22} : {}".format(k, v), True)
_amp_state.min_loss_scale = min_loss_scale
_amp_state.max_loss_scale = max_loss_scale
maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
# I chose to have the keyword arguments listed directly in the argument list,
# instead of **kwargs, so I can't use kwargs.items() here.
if enabled is not None:
_amp_state.opt_properties.enabled = enabled
if opt_level is not None:
_amp_state.opt_properties.opt_level = opt_level
if cast_model_type is not None:
_amp_state.opt_properties.cast_model_type = cast_model_type
if patch_torch_functions is not None:
_amp_state.opt_properties.patch_torch_functions = patch_torch_functions
if keep_batchnorm_fp32 is not None:
_amp_state.opt_properties.keep_batchnorm_fp32 = keep_batchnorm_fp32
if master_weights is not None:
_amp_state.opt_properties.master_weights = master_weights
if loss_scale is not None:
_amp_state.opt_properties.loss_scale = loss_scale
maybe_print("After processing overrides, optimization options are:", True)
for k, v in _amp_state.opt_properties.options.items():
maybe_print("{:22} : {}".format(k, v), True)
return _initialize(models, optimizers, _amp_state.opt_properties, num_losses, cast_model_outputs)
def state_dict(destination=None):
if destination is None:
destination = OrderedDict()
for idx, loss_scaler in enumerate(_amp_state.loss_scalers):
destination['loss_scaler%d' % idx] = {
'loss_scale': loss_scaler.loss_scale(),
'unskipped': loss_scaler._unskipped,
}
return destination
def load_state_dict(state_dict):
    # Check if state_dict contains the same number of loss_scalers as current setup
if len(state_dict) != len(_amp_state.loss_scalers):
print('Warning: state_dict contains {} entries, while {} loss_scalers are used'.format(
len(state_dict), len(_amp_state.loss_scalers)))
state_dict = state_dict.copy()
nb_loss_scalers = len(_amp_state.loss_scalers)
unexpected_keys = []
    # Track idx manually instead of using enumerate, so that unexpected keys do not advance the loss_scaler index
idx = 0
for key in state_dict:
if 'loss_scaler' not in key:
unexpected_keys.append(key)
else:
if idx > (nb_loss_scalers - 1):
print('Skipping loss_scaler[{}], since num_losses was set to {}'.format(
idx, nb_loss_scalers))
break
_amp_state.loss_scalers[idx]._loss_scale = state_dict[key]['loss_scale']
_amp_state.loss_scalers[idx]._unskipped = state_dict[key]['unskipped']
idx += 1
if len(unexpected_keys) > 0:
raise RuntimeError(
'Error(s) in loading state_dict. Unexpected key(s) in state_dict: {}. '.format(
', '.join('"{}"'.format(k) for k in unexpected_keys)))
# TODO: is this necessary/useful?
# def check_option_consistency(enabled=True,
# opt_level=None,
# cast_model_type=None,
# patch_torch_functions=None,
# keep_batchnorm_fp32=None,
# master_weights=None,
# loss_scale=None,
# enable_ddp_interop=None,
# hard_override=False):
# """
# Utility function that enables users to quickly check if the option combination they intend
# to use is permitted. ``check_option_consistency`` does not require models or optimizers
# to be constructed, and can be called at any point in the script. ``check_option_consistency``
# is totally self-contained; it does not set any amp global state or affect anything outside
# of itself.
# """
#
# if not enabled:
# return
#
# if opt_level not in opt_levels:
# raise RuntimeError("Unexpected optimization level. Options are 'O0', 'O1', 'O2', 'O3'.")
# else:
# opt_properties = opt_levels[opt_level](Properties())
# print("Selected optimization level {}", opt_levels[opt_level].brief)
# print("Defaults for this optimization level are:")
# for k, v in opt_properties.options:
# print("{:22} : {}".format(k, v))
#
# print("Processing user overrides (additional kwargs that are not None)...")
# for k, v in kwargs:
# if k not in _amp_state.opt_properties.options:
# raise RuntimeError("Unexpected kwarg {}".format(k))
# if v is not None:
# setattr(opt_properties, k, v)
#
# print("After processing overrides, optimization options are:")
# for k, v in opt_properties.options:
# print("{:22} : {}".format(k, v))
================================================
FILE: KoSentenceT5/apex/amp/handle.py
================================================
import contextlib
import warnings
import sys
import torch
from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params, maybe_print
if torch.distributed.is_available():
from ..parallel.LARC import LARC
# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
@contextlib.contextmanager
def scale_loss(loss,
optimizers,
loss_id=0,
model=None,
delay_unscale=False,
delay_overflow_check=False):
"""
On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
and unscaled, so that ``optimizer.step()`` can be called.
.. note::
If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
any FP16 gradients are copied to FP32 master gradients before being unscaled.
``optimizer.step()`` will then apply the unscaled master gradients to the master params.
.. warning::
If Amp is using explicit FP32 master params, only the FP32 master gradients will be
unscaled. The direct ``.grad`` attributes of any FP16
model params will remain scaled after context manager exit.
This subtlety affects gradient clipping. See "Gradient clipping" under
`Advanced Amp Usage`_ for best practices.
Args:
loss(Tensor): Typically a scalar Tensor. The ``scaled_loss`` that the context
manager yields is simply ``loss.float()*loss_scale``, so in principle
``loss`` could have more than one element, as long as you call
``backward()`` on ``scaled_loss`` appropriately within the context manager body.
optimizers: All optimizer(s) for which the current backward pass is creating gradients.
Must be an optimizer or list of optimizers returned from an earlier call
to ``amp.initialize``. For example use with multiple optimizers, see
"Multiple models/optimizers/losses" under `Advanced Amp Usage`_.
loss_id(int, optional, default=0): When used in conjunction with the ``num_losses`` argument
to ``amp.initialize``, enables Amp to use a different loss scale per loss. ``loss_id``
must be an integer between 0 and ``num_losses`` that tells Amp which loss is
being used for the current backward pass. See "Multiple models/optimizers/losses"
under `Advanced Amp Usage`_ for examples. If ``loss_id`` is left unspecified, Amp
will use the default global loss scaler for this backward pass.
model(torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
optimizations.
delay_unscale(bool, optional, default=False): ``delay_unscale`` is never necessary, and
the default value of ``False`` is strongly recommended.
If ``True``, Amp will not unscale the gradients or perform model->master
gradient copies on context manager exit.
``delay_unscale=True`` is a minor ninja performance optimization and can result
in weird gotchas (especially with multiple models/optimizers/losses),
so only use it if you know what you're doing.
"Gradient accumulation across iterations" under `Advanced Amp Usage`_
illustrates a situation where this CAN (but does not need to) be used.
.. warning::
If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be
called yet after context manager exit, and must wait for another, later backward context
manager invocation with ``delay_unscale`` left to False.
.. _`Advanced Amp Usage`:
https://nvidia.github.io/apex/advanced.html
"""
if not hasattr(_amp_state, "opt_properties"):
raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized. "
"model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called "
"before `with amp.scale_loss`.")
if not _amp_state.opt_properties.enabled:
yield loss
return
if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
optimizers = [optimizers]
loss_scaler = _amp_state.loss_scalers[loss_id]
loss_scale = loss_scaler.loss_scale()
if ((not _amp_state.opt_properties.master_weights)
and (not loss_scaler.dynamic)
and loss_scale == 1.0):
yield loss.float()
# Needing to drop the cache here as well is an ugly gotcha.
# But for now I think it's necessary to short-circuit.
# Probably ok to skip this if not delay_unscale
if _amp_state.opt_properties.patch_torch_functions:
_amp_state.handle._clear_cache()
return
if not delay_unscale:
if isinstance(optimizers, list):
for optimizer in optimizers:
if not optimizer._amp_stash.params_have_scaled_gradients:
optimizer._prepare_amp_backward()
yield (loss.float())*loss_scale
if delay_unscale:
for optimizer in optimizers:
optimizer._amp_stash.params_have_scaled_gradients = True
else:
# FusedSGD may take care of unscaling as part of their step() methods.
# if not isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scaler.clear_overflow_state()
for optimizer in optimizers:
optimizer._post_amp_backward(loss_scaler)
optimizer._amp_stash.params_have_scaled_gradients = False
# For future fused optimizers that enable sync-free dynamic loss scaling,
# should_skip will always be False.
should_skip = False if delay_overflow_check else loss_scaler.update_scale()
if should_skip:
for optimizer in optimizers:
if not optimizer._amp_stash.already_patched:
# Close on loss_scaler and loss_id as well, to be safe. Probably not
# necessary because amp.scale_loss is already creating a temporary scope.
def patch_step(opt, loss_scaler, loss_id):
opt_step = opt.step
def skip_step(closure=None):
if closure is not None:
raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
maybe_print(("Gradient overflow. Skipping step, loss scaler " +
"{} reducing loss scale to {}").format(loss_id,
loss_scaler.loss_scale()))
# TODO: I don't like the special casing for different optimizer implementations.
# Maybe skip should delegate to a method owned by the optimizers themselves.
if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in opt._amp_stash.all_fp32_from_fp16_params:
param.grad = None
if hasattr(opt, "most_recent_scale"):
opt.most_recent_scale = 1.0
opt.scale_set_by_backward = False
opt.step = opt_step
opt._amp_stash.already_patched = False
return skip_step
optimizer.step = patch_step(optimizer, loss_scaler, loss_id)
optimizer._amp_stash.already_patched = True
# Probably ok to skip this if not delay_unscale
if _amp_state.opt_properties.patch_torch_functions:
_amp_state.handle._clear_cache()
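def _per_loss_scaler_sketch(loss0, loss1, optimizer):
    # Illustrative sketch, never called by the library (helper name is hypothetical):
    # with amp.initialize(..., num_losses=2), each backward pass can use its own
    # loss scaler by passing loss_id, as described in the docstring above. Assumes
    # the two losses come from independent forward passes.
    with scale_loss(loss0, optimizer, loss_id=0) as scaled_loss0:
        scaled_loss0.backward()
    with scale_loss(loss1, optimizer, loss_id=1) as scaled_loss1:
        scaled_loss1.backward()
    optimizer.step()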
# Free function version of AmpHandle.disable_casts, another step on the
# path to removing the concept of "AmpHandle"
@contextlib.contextmanager
def disable_casts():
_amp_state.handle._is_active = False
yield
_amp_state.handle._is_active = True
class AmpHandle(object):
def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False):
self._enable_caching = enable_caching
self._verbose = verbose
self._cache = dict()
self._default_scaler = LossScaler(loss_scale)
self._is_active = True
self._all_wrappers = []
def is_active(self):
return self._is_active
@contextlib.contextmanager
def _disable_casts(self):
self._is_active = False
yield
self._is_active = True
def wrap_optimizer(self, optimizer, num_loss=1):
self._default_scaler = None
return OptimWrapper(optimizer, self, num_loss)
@contextlib.contextmanager
def scale_loss(self, loss, optimizer):
raise RuntimeError("The old Amp API is no longer supported. Please move to the new API, "
"documented here: https://nvidia.github.io/apex/amp.html. Transition guide: "
"https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users")
if not self.is_active():
yield loss
return
if self._default_scaler is None:
raise RuntimeError(
'After calling `handle.wrap_optimizer()`, you must explicitly ' +
'use `optimizer.scale_loss(loss)`.')
# TODO: this code block is duplicated here and `opt.py`. Unify.
loss_scale = self._default_scaler.loss_scale()
yield loss * loss_scale
self._default_scaler.clear_overflow_state()
self._default_scaler.unscale(
master_params(optimizer),
master_params(optimizer),
loss_scale)
should_skip = self._default_scaler.update_scale()
if should_skip:
optimizer_step = optimizer.step
def skip_step():
maybe_print('Gradient overflow, skipping update')
optimizer.step = optimizer_step
optimizer.step = skip_step
self._clear_cache()
def _clear_cache(self):
self._cache.clear()
# Experimental support for saving / restoring uncasted versions of functions
def _save_func(self, mod, fn, func):
self._all_wrappers.append((mod, fn, func))
def _deactivate(self):
for mod, fn, func in self._all_wrappers:
utils.set_func(mod, fn, func)
self._all_wrappers = []
@property
def has_cache(self):
return self._enable_caching
@property
def cache(self):
return self._cache
def remove_cache(self, param):
if self.has_cache and param in self.cache:
del self.cache[param]
@property
def verbose(self):
return self._verbose
class NoOpHandle(object):
def is_active(self):
return False
@contextlib.contextmanager
def _disable_casts(self):
yield
def wrap_optimizer(self, optimizer, num_loss=1):
return OptimWrapper(optimizer, self, num_loss)
@contextlib.contextmanager
def scale_loss(self, loss, optimizer):
yield loss
@property
def has_cache(self):
return False
@property
def verbose(self):
return False
def _clear_cache(self):
pass
def _deactivate(self):
pass
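# Illustrative sketch, not part of the original file: a minimal example of the
# newer amp API that the RuntimeError in AmpHandle.scale_loss points users to
# (https://nvidia.github.io/apex/amp.html). `model`, `optimizer`, `data_loader`
# and `criterion` are placeholders.
def _example_new_amp_api(model, optimizer, data_loader, criterion):
    from apex import amp
    # Patch the model and optimizer once, before the training loop.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    for inputs, targets in data_loader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        # scale_loss handles loss scaling and overflow-driven step skipping.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()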
================================================
FILE: KoSentenceT5/apex/amp/lists/__init__.py
================================================
================================================
FILE: KoSentenceT5/apex/amp/lists/functional_overrides.py
================================================
# TODO: think about the following two. They do weird things.
# - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
# - torch.nn.utils.weight_norm
# Notes:
# F.instance_norm uses batch_norm internally, which correctly handles
# fp16 in/out with fp32 weights, so we don't need to do anything for
# either of those functions.
# F.normalize calls `input.norm()` internally, so listing it here is
# redundant, but it is kept in case the implementation changes.
# F.cosine_similarity is the same: it calls `x.norm()` internally.
import torch.nn.functional
MODULE = torch.nn.functional
FP16_FUNCS = [
'conv1d',
'conv2d',
'conv3d',
'conv_transpose1d',
'conv_transpose2d',
'conv_transpose3d',
'conv_tbc', # Undocumented / maybe new?
'linear',
]
FP32_FUNCS = [
# Interpolation/Upsampling TODO: Remove for 1.2
'interpolate',
'grid_sample',
# Pointwise
'softplus',
'softmin',
'log_softmax',
'softmax',
'gelu',
# Normalization
'layer_norm',
'group_norm',
'local_response_norm',
'normalize',
'cosine_similarity',
# Loss functions
# TODO: which of these can be fp16?
'poisson_nll_loss',
'cosine_embedding_loss',
'cross_entropy',
'hinge_embedding_loss',
'kl_div',
'l1_loss',
'mse_loss',
'margin_ranking_loss',
'multilabel_margin_loss',
'multilabel_soft_margin_loss',
'multi_margin_loss',
'nll_loss',
'binary_cross_entropy_with_logits',
'smooth_l1_loss',
'soft_margin_loss',
'triplet_margin_loss',
'ctc_loss'
]
BANNED_FUNCS = [
('binary_cross_entropy',
("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
"It requires that the output of the previous function be already a FloatTensor. \n\n"
"Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
" torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
"that is compatible with amp.\nAnother option is to add\n"
" amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
"If you _really_ know what you are doing, you can disable this warning by passing "
"allow_banned=True to `amp.init()`."))
]
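# Illustrative sketch, not part of the original file: the two alternatives
# suggested by the BANNED_FUNCS message above. The tensors and criterion here
# are placeholders.
def _example_bce_alternatives():
    import torch
    # Preferred: fuse Sigmoid + BCELoss into one amp-friendly layer.
    criterion = torch.nn.BCEWithLogitsLoss()
    logits = torch.randn(4, 1)
    targets = torch.ones(4, 1)
    loss = criterion(logits, targets)
    # Alternative (run before amp.init() / amp.initialize()): force sigmoid to
    # produce fp32 output so plain BCELoss sees a FloatTensor:
    #     from apex import amp
    #     amp.register_float_function(torch, 'sigmoid')
    return loss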
================================================
FILE: KoSentenceT5/apex/amp/lists/tensor_overrides.py
================================================
from .. import compat
from . import torch_overrides
import importlib
import torch
# if compat.variable_is_tensor() and not compat.tensor_is_variable():
MODULE = torch.Tensor
# else:
# MODULE = torch.autograd.Variable
FP16_FUNCS = compat.filter_attrs(MODULE, [
'__matmul__',
])
FP32_FUNCS = compat.filter_attrs(MODULE, [
'__ipow__',
'__pow__',
'__rpow__',
# Cast to fp32 before transfer to CPU
'cpu',
])
CASTS = compat.filter_attrs(MODULE, [
'__add__',
'__div__',
'__eq__',
'__ge__',
'__gt__',
'__iadd__',
'__idiv__',
'__imul__',
'__isub__',
'__itruediv__',
'__le__',
'__lt__',
'__mul__',
'__ne__',
'__radd__',
'__rdiv__',
'__rmul__',
'__rsub__',
'__rtruediv__',
'__sub__',
'__truediv__',
])
# None of these, but here to make code cleaner.
SEQUENCE_CASTS = []
# We need to grab all the methods from torch_overrides and add them to
# the Tensor lists as well, as almost all methods are duplicated
# between `torch` and `torch.Tensor` (and check with `hasattr`,
# because a few random ones aren't defined on Tensor)
_self_mod = importlib.import_module(__name__)
for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
lst = getattr(_self_mod, attrname)
for fn in getattr(torch_overrides, attrname):
if hasattr(MODULE, fn):
lst.append(fn)
================================================
FILE: KoSentenceT5/apex/amp/lists/torch_overrides.py
================================================
import torch
from .. import utils
MODULE = torch
FP16_FUNCS = [
# Low level functions wrapped by torch.nn layers.
# The wrapper layers contain the weights which are then passed in as a parameter
# to these functions.
'conv1d',
'conv2d',
'conv3d',
'conv_transpose1d',
'conv_transpose2d',
'conv_transpose3d',
'conv_tbc',
'prelu',
# BLAS
'addmm',
'addmv',
'addr',
'matmul',
'mm',
'mv',
]
FP32_FUNCS = [
# Pointwise
'acos',
'asin',
'cosh',
'erfinv',
'exp',
'expm1',
'log',
'log10',
'log2',
'reciprocal',
'rsqrt',
'sinh',
'tan',
# Other math
'pow',
# Reduction
'cumprod',
'cumsum',
'dist',
# 'mean',
'norm',
'prod',
'std',
'sum',
'var',
# Misc
'renorm'
]
version_strings = torch.__version__.split('.')
version_major = version_strings[0]
version_minor = version_strings[1]
version_num = float(version_major + "." + version_minor)
# Before torch 1.1, mean must be blacklisted.
if version_num < 1.1:
FP32_FUNCS.append('mean')
# Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We
# check the CUDA version -- if at least 9.1, then put the bmm
# functions on the fp16 list. Otherwise, put them on the fp32 list.
_bmms = ['addbmm',
'baddbmm',
'bmm']
if utils.is_cuda_enabled():
# workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802
if utils.get_cuda_version() >= (9, 1, 0):
FP16_FUNCS.extend(_bmms)
else:
FP32_FUNCS.extend(_bmms)
# Multi-tensor fns that may need type promotion
CASTS = [
# Multi-tensor math
'addcdiv',
'addcmul',
'atan2',
'cross',
'bilinear',
'dot',
# Element-wise _or_ tensor-wise math
'add',
'div',
'mul',
# Comparison
'eq',
'equal',
'ge',
'gt',
'le',
'lt',
'ne'
]
# Functions that take sequence arguments. We need to inspect the whole
# sequence and cast to the widest type.
SEQUENCE_CASTS = [
'cat',
'stack'
]
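# Illustrative sketch, not part of the original file: what "cast to the widest
# type" means for SEQUENCE_CASTS. If a sequence passed to torch.cat/torch.stack
# mixes HalfTensor and FloatTensor, every element is promoted to fp32 before
# the call. The tensors below are placeholders.
def _example_sequence_cast():
    import torch
    half = torch.randn(2, 3).half()
    full = torch.randn(2, 3)
    promoted = [t.float() for t in (half, full)]  # fp32 is the widest type here
    return torch.cat(promoted, dim=0)  # safe: all elements share one dtype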
================================================
FILE: KoSentenceT5/apex/amp/opt.py
================================================
import contextlib
import warnings
from .scaler import LossScaler, master_params
from ._amp_state import maybe_print
class OptimWrapper(object):
def __init__(self, optimizer, amp_handle, num_loss):
self._optimizer = optimizer
self._amp_handle = amp_handle
self._num_loss = num_loss
self._loss_idx = 0
self._skip_next = [False] * num_loss
self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)]
@contextlib.contextmanager
def scale_loss(self, loss):
if not self._amp_handle.is_active():
yield loss
return
# When there are multiple losses per optimizer, we need
# to save out the current grad accumulation, since we won't be
# able to unscale this particular loss once the grads are
# all mixed together.
cached_grads = []
if self._loss_idx > 0:
for p in master_params(self._optimizer):
if p.grad is not None:
cached_grads.append(p.grad.data.detach().clone())
else:
cached_grads.append(None)
self._optimizer.zero_grad()
loss_scale = self._cur_loss_scaler().loss_scale()
yield loss * loss_scale
self._cur_loss_scaler().clear_overflow_state()
self._cur_loss_scaler().unscale(
master_params(self._optimizer),
master_params(self._optimizer),
loss_scale)
self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
self._loss_idx += 1
if len(cached_grads) > 0:
for p, cached_grad in zip(master_params(self._optimizer),
cached_grads):
if cached_grad is not None:
p.grad.data.add_(cached_grad)
cached_grads = []
def _cur_loss_scaler(self):
assert 0 <= self._loss_idx < self._num_loss
return self._loss_scaler[self._loss_idx]
def step(self, closure=None):
if not self._amp_handle.is_active():
return self._optimizer.step(closure=closure)
self._loss_idx = 0
for group in self._optimizer.param_groups:
for p in group['params']:
self._amp_handle.remove_cache(p)
if closure is not None:
raise NotImplementedError(
'The `closure` argument is unsupported by the amp ' +
'optimizer wrapper.')
if any(self._skip_next):
maybe_print('Gradient overflow, skipping update')
self._skip_next = [False] * self._num_loss
else:
return self._optimizer.step(closure=closure)
# Forward any attribute lookups
def __getattr__(self, attr):
return getattr(self._optimizer, attr)
# Forward all torch.optim.Optimizer methods
def __getstate__(self):
return self._optimizer.__getstate__()
def __setstate__(self, state):
return self._optimizer.__setstate__(state)
def __repr__(self):
return self._optimizer.__repr__()
def state_dict(self):
return self._optimizer.state_dict()
def load_state_dict(self, state_dict):
return self._optimizer.load_state_dict(state_dict)
def zero_grad(self):
return self._optimizer.zero_grad()
def add_param_group(self, param_group):
return self._optimizer.add_param_group(param_group)
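# Illustrative usage sketch, not part of the original file: how OptimWrapper
# was meant to be driven through the old, handle-based API (now deprecated in
# favor of amp.initialize) with two losses per optimizer. `handle`, the losses,
# and retain_graph are placeholders.
def _example_optim_wrapper(handle, optimizer, loss_a, loss_b):
    wrapped = handle.wrap_optimizer(optimizer, num_loss=2)
    for loss in (loss_a, loss_b):
        # scale_loss stashes grads from earlier losses, backprops the scaled
        # loss, unscales, and then re-accumulates the stashed grads.
        with wrapped.scale_loss(loss) as scaled_loss:
            scaled_loss.backward(retain_graph=True)
    # step() is skipped (and the corresponding scale lowered) if any loss overflowed.
    wrapped.step()
    wrapped.zero_grad()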
================================================
FILE: KoSentenceT5/apex/amp/rnn_compat.py
================================================
from . import utils, wrap
import torch
_VF = torch._C._VariableFunctions
RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
def _gen_VF_wrapper(name):
def wrapper(*args, **kwargs):
return getattr(_VF, name)(*args, **kwargs)
return wrapper
# Some python magic to generate an object that has the rnn cell functions
# defined on it, all of which call into corresponding _VF version.
# Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF"
# imported at module scope within torch.nn.modules.rnn). This should
# not affect third-party importers of _VF.py.
class VariableFunctionsShim(object):
def __init__(self):
for name in RNN_NAMES:
for suffix in ['', '_cell']:
fn_name = name + suffix
setattr(self, fn_name, _gen_VF_wrapper(fn_name))
def has_old_rnns():
try:
torch.nn.backends.thnn.backend.LSTMCell
return True
except:
return False
def whitelist_rnn_cells(handle, verbose):
# Different module + function names in old/new RNN cases
if has_old_rnns():
fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
mod = torch.nn.backends.thnn.backend
else:
fn_names = [x + '_cell' for x in RNN_NAMES]
mod = torch.nn.modules.rnn._VF
assert isinstance(mod, VariableFunctionsShim)
# Insert casts on cell functions
for fn in fn_names:
wrap.cached_cast(mod, fn, utils.maybe_half, handle,
try_caching=True, verbose=verbose)
if has_old_rnns():
# Special handling of `backward` for fused gru / lstm:
# The `backward` method calls Tensor.sum() (blacklist) internally,
# and then the resulting grad_input has the wrong type.
# TODO: where else is this a problem?
for rnn_type in ['GRUFused', 'LSTMFused']:
mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type)
wrap.disable_casts(mod, 'backward', handle)
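# Illustrative sketch, not part of the original file: how the shim above is
# intended to be installed. Replace the "_VF" reference cached inside
# torch.nn.modules.rnn, then let whitelist_rnn_cells patch the per-cell
# functions. `handle` is assumed to be an active amp handle.
def _example_install_vf_shim(handle, verbose=False):
    import torch
    if not has_old_rnns():
        torch.nn.modules.rnn._VF = VariableFunctionsShim()
    whitelist_rnn_cells(handle, verbose)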
================================================
FILE: KoSentenceT5/apex/amp/scaler.py
================================================
import torch
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state, master_params, maybe_print
from itertools import product
def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
# Exception handling for 18.04 compatibility
if check_overflow:
cpu_sum = float(model_grad.float().sum())
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
if master_grad is not model_grad: # copy_ probably internally short-circuits this
master_grad.copy_(model_grad)
if scale != 1.0:
master_grad.mul_(scale)
return False
def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
# Exception handling for 18.04 compatibility
if check_overflow:
cpu_sum = float(model_grad.float().sum())
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
# if master_grad is not model_grad: # copy_ probably internally short-circuits this
# master_grad.copy_(model_grad)
assert stashed_grad.dtype == master_grad.dtype
converted_model_grad = model_grad.data.to(master_grad.dtype)
master_grad.data = a*converted_model_grad.data + b*stashed_grad.data
return False
class LossScaler(object):
warned_no_fused_kernel = False
warned_unscaling_non_fp32_grad = False
has_fused_kernel = False
def __init__(self,
loss_scale,
init_scale=2.**16,
scale_factor=2.,
scale_window=2000,
min_loss_scale=None,
max_loss_scale=2.**24):
if loss_scale == "dynamic":
self.dynamic = True
self._loss_scale = min(max_loss_scale, init_scale)
else:
self.dynamic = False
self._loss_scale = loss_scale
self._max_loss_scale = max_loss_scale
self._min_loss_scale = min_loss_scale
self._scale_seq_len = scale_window
self._unskipped = 0
self._has_overflow = False
self._overflow_buf = torch.cuda.IntTensor([0])
if multi_tensor_applier.available:
import amp_C
LossScaler.has_fused_kernel = multi_tensor_applier.available
LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby
else:
if not LossScaler.warned_no_fused_kernel:
maybe_print(
"Warning: multi_tensor_applier fused unscale kernel is unavailable, "
"possibly because apex was installed without --cuda_ext --cpp_ext. "
"Using Python fallback. Original ImportError was: " +
repr(multi_tensor_applier.import_err),
True)
LossScaler.has_fused_kernel = False
LossScaler.warned_no_fused_kernel = True
def loss_scale(self):
return self._loss_scale
def unscale_python(self, model_grads, master_grads, scale):
for model, master in zip(model_grads, master_grads):
if model is not None:
if not LossScaler.warned_unscaling_non_fp32_grad:
if master.dtype != torch.float32:
maybe_print(
"Attempting to unscale a grad with type {} ".format(master.type()) +
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
LossScaler.warned_unscaling_non_fp32_grad = True
self._has_overflow = scale_check_overflow_python(model,
master,
1./scale,
self.dynamic)
if self._has_overflow and self.dynamic:
break
# unused_scale keeps some of the old API alive for hopefully a short time.
def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None):
if self._has_overflow:
return
scale = self._loss_scale
if scale_override is not None:
scale = scale_override
if scale == 1.0 and models_are_masters and not self.dynamic:
return
if LossScaler.has_fused_kernel:
# if (not LossScaler.warned_unscaling_non_fp32_grad
# and master_grads[0].dtype == torch.float16):
# print("Warning: unscaling grads that are not FP32. "
# "Unscaling non-fp32 grads may indicate an error. "
# "When using Amp, you don't need to call .half() on your model.")
# # Setting this to True unconditionally allows the possibility of an escape
# # if never-before-seen non-fp32 grads are created in some later iteration.
# LossScaler.warned_unscaling_non_fp32_grad = True
multi_tensor_applier(LossScaler.multi_tensor_scale_cuda,
self._overflow_buf,
[model_grads, master_grads],
1./scale)
else:
self.unscale_python(model_grads, master_grads, scale)
# Defer to update_scale
# If the fused kernel is available, we only need one D2H memcopy and sync.
# if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
# self._has_overflow = self._overflow_buf.item()
def unscale_with_stashed_python(self,
model_grads,
stashed_master_grads,
master_grads,
a,
b):
for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
if model is None and stashed is None:
continue
else:
if not LossScaler.warned_unscaling_non_fp32_grad:
if master.dtype != torch.float32:
maybe_print(
"Attempting to unscale a grad with type {} ".format(master.type()) +
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
LossScaler.warned_unscaling_non_fp32_grad = True
self._has_overflow = axpby_check_overflow_python(model,
stashed,
master,
a,
b,
self.dynamic)
if self._has_overflow and self.dynamic:
break
def unscale_with_stashed(self,
model_grads,
stashed_master_grads,
master_grads,
scale_override=None):
if self._has_overflow:
return
grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0
if scale_override is not None:
grads_have_scale, stashed_have_scale, out_scale = scale_override
if LossScaler.has_fused_kernel:
if (not LossScaler.warned_unscaling_non_fp32_grad
and master_grads[0].dtype == torch.float16):
print("Warning: unscaling grads that are not FP32. "
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
# Setting this to True unconditionally allows the possibility of an escape
# if never-before-seen non-fp32 grads are created in some later iteration.
LossScaler.warned_unscaling_non_fp32_grad = True
multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
self._overflow_buf,
[model_grads, stashed_master_grads, master_grads],
out_scale/grads_have_scale, # 1./scale,
out_scale/stashed_have_scale, # 1.0,
0) # check only arg 0, aka the incoming model grads, for infs
else:
self.unscale_with_stashed_python(model_grads,
stashed_master_grads,
master_grads,
out_scale/grads_have_scale,
out_scale/stashed_have_scale)
# Defer to update_scale
# If the fused kernel is available, we only need one D2H memcopy and sync.
# if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
# self._has_overflow = self._overflow_buf.item()
def clear_overflow_state(self):
self._has_overflow = False
if self.has_fused_kernel:
self._overflow_buf.zero_()
# Separate so unscale() can be called more than once before updating.
def update_scale(self):
# If the fused kernel is available, we only need one D2H memcopy and sync.
if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
self._has_overflow = self._overflow_buf.item()
if self._has_overflow and self.dynamic:
should_skip = True
if self._min_loss_scale:
self._loss_scale = max(self._min_loss_scale, self._loss_scale/2.)
else:
self._loss_scale = self._loss_scale/2.
self._unskipped = 0
else:
should_skip = False
self._unskipped += 1
if self._unskipped == self._scale_seq_len and self.dynamic:
self._loss_scale = min(self._max_loss_scale, self._loss_scale*2.)
self._unskipped = 0
return should_skip
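# Illustrative sketch, not part of the original file: the dynamic loss-scaling
# loop LossScaler implements, i.e. scale the loss, unscale the grads, then
# halve the scale and skip the step on overflow, or double it after
# `scale_window` clean steps. Assumes a CUDA build (LossScaler allocates a CUDA
# overflow buffer); `scaler` should be constructed once, e.g.
# LossScaler("dynamic"), and reused across iterations. Other names are placeholders.
def _example_dynamic_scaling_step(scaler, model, optimizer, loss_fn, batch):
    optimizer.zero_grad()
    loss = loss_fn(model, batch)
    (loss * scaler.loss_scale()).backward()
    scaler.clear_overflow_state()
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    # In this fp32-only sketch the model grads double as the master grads.
    scaler.unscale(grads, grads, scaler.loss_scale(), models_are_masters=True)
    if not scaler.update_scale():  # update_scale() returns should_skip
        optimizer.step()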
================================================
FILE: KoSentenceT5/apex/amp/utils.py
================================================
from . import compat
import functools
import itertools
import torch
def is_cuda_enabled():
return torch.version.cuda is not None
def get_cuda_version():
return tuple(int(x) for x in torch.version.cuda.split('.'))
def is_fp_tensor(x):
if is_nested(x):
# Fast-fail version of all(is_fp_tensor)
for y in x:
if not is_fp_tensor(y):
return False
return True
return compat.is_tensor_like(x) and compat.is_floating_point(x)
def is_nested(x):
return isinstance(x, tuple) or isinstance(x, list)
def should_cache(x):
if is_nested(x):
# Fast-fail version of all(should_cache)
for y in x:
if not should_cache(y):
return False
return True
return isinstance(x, torch.nn.parameter.Parameter) and \
type_string(x) == 'FloatTensor'
def collect_fp_tensor_types(args, kwargs):
def collect_types(x, types):
if is_nested(x):
for y in x:
collect_types(y, types)
else:
types.add(type_string(x))
all_args = itertools.chain(args, kwargs.values())
types = set()
for x in all_args:
if is_fp_tensor(x):
collect_types(x, types)
return types
def type_string(x):
return x.type().split('.')[-1]
def maybe_half(x, name='', verbose=False):
if is_nested(x):
return type(x)([maybe_half(y) for y in x])
if not x.is_cuda or type_string(x) == 'HalfTensor':
return x
else:
if verbose:
print('Float->Half ({})'.format(name))
return x.half()
def maybe_float(x, name='', verbose=False):
if is_nested(x):
return type(x)([maybe_float(y) for y in x])
if not x.is_cuda or type_string(x) == 'FloatTensor':
return x
else:
if verbose:
print('Half->Float ({})'.format(name))
return x.float()
# NB: returns casted `args`, mutates `kwargs` in-place
def casted_args(cast_fn, args, kwargs):
new_args = []
for x in args:
if is_fp_tensor(x):
new_args.append(cast_fn(x))
else:
new_args.append(x)
for k in kwargs:
val = kwargs[k]
if is_fp_tensor(val):
kwargs[k] = cast_fn(val)
return new_args
def cached_cast(cast_fn, x, cache):
if is_nested(x):
return type(x)([cached_cast(cast_fn, y, cache) for y in x])
if x in cache:
cached_x = cache[x]
if x.requires_grad and cached_x.requires_grad:
# Make sure x is actually cached_x's autograd parent.
if cached_x.grad_fn.next_functions[1][0].variable is not x:
raise RuntimeError("x and cache[x] both require grad, but x is not "
"cache[x]'s parent. This is likely an error.")
# During eval, it's possible to end up caching casted weights with
# requires_grad=False. On the next training iter, if cached_x is found
# and reused from the cache, it will not actually have x as its parent.
# Therefore, we choose to invalidate the cache (and force refreshing the cast)
# if x.requires_grad and cached_x.requires_grad do not match.
#
# During eval (i.e. running under with torch.no_grad()) the invalidation
# check would cause the cached value to be dropped every time, because
# cached_x would always be created with requires_grad=False, while x would
# still have requires_grad=True. This would render the cache effectively
# useless during eval. Therefore, if we are running under the no_grad()
# context manager (torch.is_grad_enabled=False) we elide the invalidation
# check, and use the cached value even though its requires_grad flag doesn't
# match. During eval, we don't care that there's no autograd-graph
# connection between x and cached_x.
if torch.is_grad_enabled() and x.requires_grad != cached_x.requires_grad:
del cache[x]
else:
return cached_x
casted_x = cast_fn(x)
cache[x] = casted_x
return casted_x
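# Illustrative sketch, not part of the original file: the caching contract of
# cached_cast above, exercised under torch.no_grad() (the "eval" case discussed
# in the comment). Casting the same fp32 Parameter twice returns the same half
# tensor instead of re-converting it.
def _example_cached_cast():
    import torch
    cache = {}
    w = torch.nn.Parameter(torch.randn(4, 4))
    with torch.no_grad():
        first = cached_cast(torch.Tensor.half, w, cache)
        second = cached_cast(torch.Tensor.half, w, cache)
    assert first is second  # the second call is served from the cache
    return first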
def verbosify(cast_fn, fn_name, verbose):
if verbose:
return functools.partial(cast_fn, name=fn_name, verbose=verbose)
else:
return cast_fn
def as_inplace(fns):
for x in fns:
yield x + '_'
def has_func(mod, fn):
if isinstance(mod, dict):
return fn in mod
else:
return hasattr(mod, fn)
def get_func(mod, fn):
if isinstance(mod, dict):
return mod[fn]
else:
return getattr(mod, fn)
def set_func(mod, fn, new_fn):
if isinstance(mod, dict):
mod[fn] = new_fn
else:
setattr(mod, fn, new_fn)
def set_func_save(handle, mod, fn, new_fn):
cur_fn = get_func(mod, fn)
handle._save_func(mod, fn, cur_fn)
set_func(mod, fn, new_fn)
# A couple problems get solved here:
# - The flat_weight buffer is disconnected from autograd graph,
# so the fp16 weights need to be derived from the input weights
# to this forward call, not the flat buffer.
# - The ordering of weights in the flat buffer is...idiosyncratic.
# The first problem is solved with a combination of set_ (to set up
# the correct storage) and copy_ (so the fp16 weight derives from the
# fp32 one in autograd).
# The second is solved by doing pointer arithmetic on the fp32 weights
# to derive the correct offset.
#
# TODO: maybe this should actually use
# `torch._cudnn_rnn_flatten_weight`? But then I need to call
# on first iter and cache the right offsets. Ugh.
def synthesize_flattened_rnn_weights(fp32_weights,
fp16_flat_tensor,
rnn_fn='',
verbose=False):
fp16_weights = []
fp32_base_ptr = fp32_weights[0][0].data_ptr()
for layer_weights in fp32_weights:
fp16_layer_weights = []
for w_fp32 in layer_weights:
w_fp16 = w_fp32.new().half()
offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
w_fp16.set_(fp16_flat_tensor.storage(),
offset,
w_fp32.shape)
w_fp16.copy_(w_fp32)
if verbose:
print('Float->Half ({})'.format(rnn_fn))
fp16_layer_weights.append(w_fp16)
fp16_weights.append(fp16_layer_weights)
return fp16_weights
# Roughly the same as above, except `fp32_weights` isn't nested.
# Code kept separate for readability.
def new_synthesize_flattened_rnn_weights(fp32_weights,
fp16_flat_tensor,
rnn_fn='',
verbose=False):
fp16_weights = []
fp32_base_ptr = fp32_weights[0].data_ptr()
for w_fp32 in fp32_weights:
w_fp16 = w_fp32.new().half()
offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
w_fp16.set_(fp16_flat_tensor.storage(),
offset,
w_fp32.shape)
w_fp16.copy_(w_fp32)
if verbose:
print('Float->Half ({})'.format(rnn_fn))
fp16_weights.append(w_fp16)
return fp16_weights
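# Illustrative sketch, not part of the original file: a small self-contained
# check of the pointer arithmetic used above. When per-layer weights are views
# into one flat buffer, the byte offset divided by the element size recovers
# each weight's element offset inside that buffer.
def _example_flat_weight_offsets():
    import torch
    flat = torch.randn(10)
    views = [flat[0:4].view(2, 2), flat[4:10].view(2, 3)]
    base_ptr = flat.data_ptr()
    offsets = [(v.data_ptr() - base_ptr) // v.element_size() for v in views]
    assert offsets == [0, 4]
    return offsets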
================================================
FILE: KoSentenceT5/apex/amp/wrap.py
================================================
from . import compat
from . import utils
from ._amp_state import _amp_state
from . import rnn_compat
import functools
import torch
def make_cast_wrapper(orig_fn, cast_fn, handle,
try_caching=False):
@functools.wraps(orig_fn)
def wrapper(*args, **kwargs):
if not handle.is_active():
return orig_fn(*args, **kwargs)
if try_caching and handle.has_cache:
args = list(args)
for i in range(len(args)):
if utils.should_cache(args[i]):
args[i] = utils.cached_cast(cast_fn, args[i], handle.cache)
for k in kwargs:
if utils.should_cache(kwargs[k]):
kwargs[k] = utils.cached_cast(cast_fn, kwargs[k], handle.cache)
new_args = utils.casted_args(cast_fn,
args,
kwargs)
return orig_fn(*new_args, **kwargs)
return wrapper
def cached_cast(mod, fn, cast_fn, handle,
try_caching=False, verbose=False):
if not utils.has_func(mod, fn):
return
orig_fn = utils.get_func(mod, fn)
cast_fn = utils.verbosify(cast_fn, fn, verbose)
wrapper = make_cast_wrapper(orig_fn, cast_fn, handle, try_caching)
utils.set_func_save(handle, mod, fn, wrapper)
# The `handle` arg is unused here, but keeping it makes the API consistent with `make_cast_wrapper`.
# Annoyingly, make_promote_wrapper still uses the global handle. Once everyone
# is on the new API and I am free to get rid of handle, I can clean this up.
def make_promote_wrapper(orig_fn, cast_fn, handle=None):
@functools.wraps(orig_fn)
def wrapper(*args, **kwargs):
if not _amp_state.handle.is_active():
return orig_fn(*args, **kwargs)
types = utils.collect_fp_tensor_types(args, kwargs)
if len(types) <= 1:
return orig_fn(*args, **kwargs)
elif len(types) == 2 and types == set(['HalfTensor', 'FloatTensor']):
new_args = utils.casted_args(cast_fn,
args,
kwargs)
return orig_fn(*new_args, **kwargs)
else:
raise NotImplementedError('Do not know how to handle ' +
'these types to promote: {}'
.format(types))
return wrapper
def promote(mod, fn, handle, verbose=False):
orig_fn = utils.get_func(mod, fn)
maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)
wrapper = make_promote_wrapper(orig_fn, maybe_float)
utils.set_func_save(handle, mod, fn, wrapper)
def sequence_promote(mod, fn, handle, verbose=False):
orig_fn = utils.get_func(mod, fn)
maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)
@functools.wraps(orig_fn)
def wrapper(seq, *args, **kwargs):
if not _amp_state.handle.is_active():
return orig_fn(seq, *args, **kwargs)
types = set([utils.type_string(x) for x in seq])
if len(types) <= 1:
return orig_fn(seq, *args, **kwargs)
elif types == set(['HalfTensor', 'FloatTensor']):
cast_seq = utils.casted_args(maybe_float,
seq, {})
return orig_fn(cast_seq, *args, **kwargs)
else:
# TODO: other mixed-type cases aren't due to amp.
# Just pass through?
return orig_fn(seq, *args, **kwargs)
utils.set_func_save(handle, mod, fn, wrapper)
def promote_match_arg0(mod, fn, handle, verbose=False):
if not utils.has_func(mod, fn):
return
orig_fn = utils.get_func(mod, fn)
@functools.wraps(orig_fn)
def wrapper(arg0, *args, **kwargs):
assert compat.is_tensor_like(arg0)
if not _amp_state.handle.is_active():
return orig_fn(arg0, *args, **kwargs)
if utils.type_string(arg0) == 'HalfTensor':
cast_fn = utils.maybe_half
elif utils.type_string(arg0) == 'FloatTensor':
cast_fn = utils.maybe_float
else:
return orig_fn(arg0, *args, **kwargs)
cast_fn = utils.verbosify(cast_fn, fn, verbose)
new_args = utils.casted_args(cast_fn, args, kwargs)
return orig_fn(arg0, *new_args, **kwargs)
utils.set_func_save(handle, mod, fn, wrapper)
def err_if_any_half(mod, fn, handle, custom_err_msg=None):
if not utils.has_func(mod, fn):
return
orig_fn = utils.get_func(mod, fn)
@functools.wraps(orig_fn)
def wrapper(*args, **kwargs):
types = utils.collect_fp_tensor_types(args, kwargs)
if 'HalfTensor' in types:
if custom_err_msg:
raise NotImplementedError(custom_err_msg)
else:
raise NotImplementedError('Cannot call in-place function ' +
'{} with fp16 arguments.'.format(fn))
else:
return orig_fn(*args, **kwargs)
utils.set_func_save(handle, mod, fn, wrapper)
def err_if_arg0_half(mod, fn, handle, verbose=False):
if not utils.has_func(mod, fn):
return
orig_fn = utils.get_func(mod, fn)
@functools.wraps(orig_fn)
def wrapper(arg0, *args, **kwargs):
assert compat.is_tensor_like(arg0)
if utils.type_string(arg0) == 'HalfTensor':
raise NotImplementedError('Cannot call in-place method ' +
'{} on fp16 Tensors.'.format(fn))
else:
cast_fn = utils.verbosify(utils.maybe_float, fn, verbose)
new_args = utils.casted_args(cast_fn, args, kwargs)
return orig_fn(arg0, *new_args, **kwargs)
utils.set_func_save(handle, mod, fn, wrapper)
# Current RNN approach:
# - Wrap top-level `RNN` function in thnn backend
# - Will call into either CudnnRNN or AutogradRNN
# - Each of these is a factory function that returns a per-iteration
# `forward` function
# - We interpose on the factory function to:
# 1) Interpose on the actual forward function and put in casts
# 2) Insert an fp16 `flat_weight` if necessary
def rnn_cast(backend, fn, handle, verbose=False):
orig_rnn = utils.get_func(backend, fn)
@functools.wraps(orig_rnn)
def rnn_wrapper(*args, **kwargs):
flat_weight = kwargs.get('flat_weight')
if flat_weight is not None:
# We replace `flat_weight` with an uninitialized fp16
# Tensor. The "actual" weight tensors (provided in `forward`),
# will then be set up as ptrs into the buffer and have the
# corresponding fp32 values copied in.
# We need to call `copy` on the "actual" weights so that the
# autograd graph correctly backprops from the wgrads computed
# inside cuDNN (on fp16 weights) into the fp32 weights.
assert utils.type_string(flat_weight) == 'FloatTensor'
if compat.tensor_is_float_tensor() or compat.tensor_is_variable():
# Pre-0.4. A little slower, since it zeros out memory.
flat_weight_fp16 = flat_weight.new().half().resize_(flat_weight.shape)
else:
flat_weight_fp16 = torch.empty_like(flat_weight,
dtype=torch.float16)
kwargs['flat_weight'] = flat_weight_fp16
else:
flat_weight_fp16 = None
forward = orig_rnn(*args, **kwargs)
@functools.wraps(forward)
def fwd_wrapper(*fargs, **fkwargs):
assert len(fargs) == 3 or len(fargs) == 4
inputs, weights, hiddens = fargs[:3]
assert utils.is_fp_tensor(inputs)
assert isinstance(weights, list)
cast_fn = utils.verbosify(utils.maybe_half,
fn,
verbose)
new_args = []
# 0) Inputs
new_args.append(cast_fn(inputs))
# 1) Weights
if flat_weight_fp16 is not None:
fp16_weights = utils.synthesize_flattened_rnn_weights(
weights, flat_weight_fp16, fn, verbose)
else:
fp16_weights = [[cast_fn(w) for w in layer]
for layer in weights]
new_args.append(fp16_weights)
# 2) Hiddens: either a tuple (for LSTM) or a single tensor
if isinstance(hiddens, tuple):
new_args.append(tuple(cast_fn(x) for x in hiddens))
elif utils.is_fp_tensor(hiddens):
new_args.append(cast_fn(hiddens))
else:
# Hiddens can, in principle, be `None` -- pass through
new_args.append(hiddens)
# 3) Batch sizes (0.4 or later only)
if len(fargs) == 4:
new_args.append(fargs[3])
return forward(*new_args, **fkwargs)
return fwd_wrapper
utils.set_func_save(handle, backend, fn, rnn_wrapper)
def new_rnn_cast(fn, handle, verbose=False):
# Forward+backward compatibility around https://github.com/pytorch/pytorch/pull/15744
# For rnn backend calls that route through _rnn_impls, we must patch the ref
# that _rnn_impls stashed. For rnn backend calls that directly invoke
# functions on _VF (e.g. _VF.lstm), we can patch onto VariableFunctionsShim,
# which in turn has patched the ref named "_VF" in torch.nn.modules.rnn.
if utils.has_func(torch.nn.modules.rnn._rnn_impls, fn):
mod = torch.nn.modules.rnn._rnn_impls
else:
mod = torch.nn.modules.rnn._VF
assert isinstance(mod, rnn_compat.VariableFunctionsShim)
fn = fn.lower()
orig_fn = utils.get_func(mod, fn)
cast_fn = utils.verbosify(utils.maybe_half, fn, verbose)
@functools.wraps(orig_fn)
def wrapper(*args, **kwargs):
# Exact call signature from modules/rnn.py
assert len(args) == 9
assert len(kwargs) == 0
if not _amp_state.handle.is_active():
return orig_fn(*args, **kwargs)
if isinstance(args[6], bool):
params_idx = 2 # Not PackedSequence case
else:
params_idx = 3 # PackedSequence case
new_args = []
for i, arg in enumerate(args):
if i == params_idx:
num_params = sum([x.numel() for x in arg])
fp16_weight_buf = args[0].new_empty((num_params,),
dtype=torch.half)
casted_weights = utils.new_synthesize_flattened_rnn_weights(
arg, fp16_weight_buf, fn, verbose)
new_args.append(casted_weights)
elif utils.is_fp_tensor(arg):
new_args.append(cast_fn(arg))
else:
new_args.append(arg)
return orig_fn(*new_args)
utils.set_func_save(handle, mod, fn, wrapper)
def disable_casts(mod, fn, handle):
if not utils.has_func(mod, fn):
return
orig_fn = utils.get_func(mod, fn)
@functools.wraps(orig_fn)
def wrapper(*args, **kwargs):
with handle._disable_casts():
return orig_fn(*args, **kwargs)
utils.set_func_save(handle, mod, fn, wrapper)
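# Illustrative usage sketch, not part of the original file: how the helpers
# above are applied. Patch a whitelisted function so that fp32 inputs are cast
# to fp16 (with parameter caching) whenever the handle is active. `handle` is
# assumed to be an AmpHandle with caching enabled.
def _example_patch_linear(handle, verbose=False):
    cached_cast(torch.nn.functional, 'linear', utils.maybe_half, handle,
                try_caching=True, verbose=verbose)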
================================================
FILE: KoSentenceT5/apex/contrib/__init__.py
================================================
================================================
FILE: KoSentenceT5/apex/contrib/bottleneck/__init__.py
================================================
from .bottleneck import Bottleneck
================================================
FILE: KoSentenceT5/apex/contrib/bottleneck/bottleneck.py
================================================
import torch
from torch import nn
import fast_bottleneck
def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
weight_tensor_nchw = tensor
nn.init.kaiming_uniform_(weight_tensor_nchw, a=a, mode=mode, nonlinearity=nonlinearity)
class FrozenBatchNorm2d(torch.nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed
"""
def __init__(self, n):
super(FrozenBatchNorm2d, self).__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def get_scale_bias(self, nhwc=False):
scale = self.weight * self.running_var.rsqrt()
bias = self.bias - self.running_mean * scale
if nhwc:
scale = scale.reshape(1, 1, 1, -1)
bias = bias.reshape(1, 1, 1, -1)
else:
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
return scale, bias
def forward(self, x):
scale, bias = self.get_scale_bias()
return x * scale + bias
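# Illustrative check, not part of the original file: the folded scale/bias
# above is equivalent (with eps == 0, which matches these buffers) to eval-mode
# batch norm, y = (x - mean) / sqrt(var) * weight + bias.
def _example_frozen_bn_equivalence():
    import torch
    n = 4
    bn = FrozenBatchNorm2d(n)
    bn.running_mean.uniform_(-1, 1)
    bn.running_var.uniform_(0.5, 1.5)
    x = torch.randn(2, n, 3, 3)
    scale, bias = bn.get_scale_bias()
    reference = ((x - bn.running_mean.view(1, -1, 1, 1))
                 * bn.running_var.view(1, -1, 1, 1).rsqrt()
                 * bn.weight.view(1, -1, 1, 1)
                 + bn.bias.view(1, -1, 1, 1))
    assert torch.allclose(x * scale + bias, reference, atol=1e-6)
    return scale, bias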
@torch.jit.script
def drelu_dscale1(grad_o, output, scale1):
relu_mask = (output>0).half()
dx_relu = relu_mask * grad_o
g1 = dx_relu * scale1
return g1, dx_relu
@torch.jit.script
def drelu_dscale2(grad_o, output, scale1, scale2):
relu_mask = (output>0).half()
dx_relu = relu_mask * grad_o
g1 = dx_relu * scale1
g2 = dx_relu * scale2
return g1, g2
class BottleneckFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, nhwc, stride_1x1, scale, bias, x, *conv):
# TODO: clean up order of tensors
args = [x, *conv[0:3], *scale[0:3], *bias[0:3]]
ctx.downsample = len(conv) > 3
if ctx.downsample:
args.append(conv[3])
args.append(scale[3])
args.append(bias[3])
# weight buffers are always in nhwc while shape can be nhwc or channels_last
# here we pass in flag and let c++ handle it
# alternatively, we can put all sizes into a fixed format and pass it in
outputs = fast_bottleneck.forward(nhwc, stride_1x1, args)
ctx.save_for_backward(*(args+outputs))
# save relu outputs for drelu
ctx.nhwc = nhwc
ctx.stride_1x1 = stride_1x1
return outputs[2]
# backward relu is not exposed; a multiply with the relu mask is used instead
# only dgrad is supported
@staticmethod
def backward(ctx, grad_o):
outputs = ctx.saved_tensors[-3:]
if ctx.downsample:
grad_conv3, grad_conv4 = drelu_dscale2(grad_o, outputs[2], ctx.saved_tensors[6], ctx.saved_tensors[11])
else:
grad_conv3, grad_conv4 = drelu_dscale1(grad_o, outputs[2], ctx.saved_tensors[6])
# create input vector for backward
t_list = [*ctx.saved_tensors[0:10]]
t_list.append(grad_conv3)
t_list.append(grad_conv4)
# outputs used for wgrad and generating drelu mask
t_list.append(outputs[0])
t_list.append(outputs[1])
# in case there is downsample
if ctx.downsample:
t_list.append(ctx.saved_tensors[10])
grads = fast_bottleneck.backward(ctx.nhwc, ctx.stride_1x1, t_list)
return (None, None, None, None, *grads)
bottleneck_function = BottleneckFunction.apply
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class Bottleneck(torch.nn.Module):
# Bottleneck in torchvision places the stride for downsampling at the 3x3 convolution (self.conv2),
# while the original implementation places the stride at the first 1x1 convolution (self.conv1)
# according to "Deep residual learning for image recognition" (https://arxiv.org/abs/1512.03385).
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
# here we put it at 1x1
def __init__(self, in_channels, bottleneck_channels, out_channels, stride=1, groups=1,
dilation=1, norm_func=None, use_cudnn=False, explicit_nhwc=False):
super(Bottleneck, self).__init__()
if groups != 1:
raise RuntimeError('Only support groups == 1')
if dilation != 1:
raise RuntimeError('Only support dilation == 1')
if norm_func is None:
norm_func = FrozenBatchNorm2d
else:
raise RuntimeError('Only support frozen BN now.')
if stride != 1 or in_channels != out_channels:
self.downsample = nn.Sequential(
conv1x1(in_channels, out_channels, stride),
norm_func(out_channels),
)
else:
self.downsample = None
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
self.conv3 = conv1x1(bottleneck_channels, out_channels)
self.relu = nn.ReLU(inplace=True)
self.stride = stride
self.bn1 = norm_func(bottleneck_channels)
self.bn2 = norm_func(bottleneck_channels)
self.bn3 = norm_func(out_channels)
self.use_cudnn = use_cudnn
# setup conv weights
self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
if self.downsample is not None:
self.w_conv.append(self.downsample[0].weight)
# init weight in nchw format before possible transpose
for w in self.w_conv:
kaiming_uniform_(w, a=1)
# TODO: prevent usage of unsupported cases
# Supported cases:
#                   native   cudnn
#   normal           yes      no
#   channels_last    yes      yes
#   explicit_nhwc    no       yes
self.explicit_nhwc = explicit_nhwc
if self.explicit_nhwc:
for p in self.parameters():
with torch.no_grad():
p.data = p.data.permute(0,2,3,1).contiguous()
return
def forward(self, x):
if self.use_cudnn:
# calculate scale/bias from registered buffers
# TODO: make this better
s1, b1 = self.bn1.get_scale_bias(self.explicit_nhwc)
s2, b2 = self.bn2.get_scale_bias(self.explicit_nhwc)
s3, b3 = self.bn3.get_scale_bias(self.explicit_nhwc)
w_scale = [s1, s2, s3]
w_bias = [b1, b2, b3]
if self.downsample is not None:
s4, b4 = self.downsample[1].get_scale_bias(self.explicit_nhwc)
w_scale.append(s4)
w_bias.append(b4)
out = bottleneck_function(self.explicit_nhwc, self.stride, w_scale, w_bias, x, *self.w_conv)
return out
if self.explicit_nhwc:
raise RuntimeError('explicit nhwc with native ops is not supported.')
# fallback to native ops
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
================================================
FILE: KoSentenceT5/apex/contrib/bottleneck/test.py
================================================
import torch
from bottleneck import Bottleneck
torch.manual_seed(23337)
# use True to print layerwise sum for all outputs in reference code path
DEBUG = False  # True
for stride, o_channel in [(1,32), (1,128), (2,32)]:
print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel)
a_ = torch.randn(17,32,28,28)
a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_()
model = Bottleneck(32,8,o_channel,stride=stride).cuda().half().to(memory_format=torch.channels_last)
# test model
b = model(a)
b.mean().backward()
d_grad = a.grad.float()
a.grad = None
torch.cuda.synchronize()
if DEBUG:
print("[DEBUG] ref dx :", d_grad.sum().item())
# print wgrads. We don't need to reset them, since the C++ path later prints before accumulation.
for i, w in enumerate(model.w_conv):
print("[DEBUG] ref wgrad{} :".format(i+1), w.grad.sum().item())
wgrads = []
for w in model.w_conv:
wgrads.append(w.grad.float())
model.use_cudnn = True
model.zero_grad()
c = model(a)
c.mean().backward()
torch.cuda.synchronize()
print("comparing native and channels_last:")
print("max error fprop:", (b-c).abs().max().item(), "max elem:", b.abs().max().item())
print("max error dgrad:", (d_grad-a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)):
print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
nhwc_a = a_.permute(0,2,3,1).contiguous().cuda().half().requires_grad_()
nhwc_model = Bottleneck(32,8,o_channel,stride=stride,explicit_nhwc=True, use_cudnn=True).cuda().half()
for p,q in zip(model.parameters(), nhwc_model.parameters()):
# model's storage is already in nhwc, we clone and assign to explicit nhwc model
q.data.copy_(p.data.permute(0,2,3,1).contiguous())
for p,q in zip(model.buffers(), nhwc_model.buffers()):
q.data.copy_(p.data)
d = nhwc_model(nhwc_a)
d.mean().backward()
torch.cuda.synchronize()
# reset reference to cudnn channels_last permute
#c_s = c.storage().tolist()
#d_s = d.storage().tolist()
#print(max([x-y for x,y in zip(c_s,d_s)]))
c = c.contiguous(memory_format=torch.contiguous_format).permute(0,2,3,1).contiguous()
d_grad = a.grad.float().permute(0,2,3,1).contiguous()
wgrads = []
for w in model.w_conv:
wgrads.append(w.grad.float().permute(0,2,3,1).contiguous())
torch.cuda.synchronize()
print("comparing nhwc and channels_last:")
print("max error fprop:", (d-c).abs().max().item(), "max elem:", c.abs().max().item())
print("max error dgrad:", (d_grad-nhwc_a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)):
print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
================================================
FILE: KoSentenceT5/apex/contrib/csrc/bottleneck/bottleneck.cpp
================================================
#include
#include // for getcudnnhandle
#include
#include
#include
#include
#include
#ifdef DEBUG
#define DEBUG_MSG(str) do { std::cout << str << std::endl; } while( false )
#else
#define DEBUG_MSG(str) do { } while ( false )
#endif
#ifdef DEBUG_CUDNN
#define DEBUG_CUDNN_MSG(buf, str) do { buf << str << std::endl; } while( false )
#else
#define DEBUG_CUDNN_MSG(buf, str) do { } while ( false )
#endif
#define checkCudnnErr(...) \
do { \
int err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
if (err) { \
return; \
} \
} while (0)
int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
if (code) {
printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
return 1;
}
return 0;
}
void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort = true);
#define checkCUDAError(val) { checkError((val), #val, __FILE__, __LINE__); } // in-line regular function
void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort)
{
if (code != cudaSuccess)
{
const char * errorMessage = cudaGetErrorString(code);
fprintf(stderr, "CUDA error returned from \"%s\" at %s:%d, Error code: %d (%s)\n", func, file, line, code, errorMessage);
if (abort){
cudaDeviceReset();
exit(code);
}
}
}
void generateStrides(const int64_t* dimA, int64_t* strideA, int nbDims, cudnnTensorFormat_t filterFormat) {
// For INT8x4 and INT8x32 we still compute standard strides here to input
// into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
if (filterFormat == CUDNN_TENSOR_NCHW) {
strideA[nbDims - 1] = 1;
for (int64_t d = nbDims - 2; d >= 0; d--) {
strideA[d] = strideA[d + 1] * dimA[d + 1];
}
} else {
// Here we assume that the format is CUDNN_TENSOR_NHWC
strideA[1] = 1;
strideA[nbDims - 1] = strideA[1] * dimA[1];
for (int64_t d = nbDims - 2; d >= 2; d--) {
strideA[d] = strideA[d + 1] * dimA[d + 1];
}
strideA[0] = strideA[2] * dimA[2];
}
}
int getFwdConvDilatedFilterDim(int filterDim, int dilation) {
return ((filterDim - 1) * dilation) + 1;
}
int getFwdConvPaddedImageDim(int tensorDim, int pad) {
return tensorDim + (2 * pad);
}
int getFwdConvOutputDim(
int tensorDim,
int pad,
int filterDim,
int stride,
int dilation)
{
int p = (getFwdConvPaddedImageDim(tensorDim, pad) - getFwdConvDilatedFilterDim(filterDim, dilation)) / stride + 1;
return (p);
}
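// Worked example (illustrative): a 28x28 input with pad = 1, a 3x3 filter,
// stride = 2 and dilation = 1 gives
//   ((28 + 2*1) - ((3 - 1)*1 + 1)) / 2 + 1 = (30 - 3) / 2 + 1 = 13 + 1 = 14
// output elements per spatial dimension (integer division).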
enum {
X_TENSOR,
Y_TENSOR,
W_TENSOR,
Z_TENSOR,
B_TENSOR,
AFTERADD_TENSOR,
AFTERBIAS_TENSOR,
AFTERCONV_TENSOR,
OPTIONAL,
AFTEROPT_TENSOR,
};
using common_conv_descriptors =
std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::ConvDesc>;
common_conv_descriptors
create_common_descriptors(int64_t* x_dim_padded,
int64_t* padA,
int64_t* convstrideA,
int64_t* dilationA,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
cudnnConvolutionMode_t mode) {
const int convDim = 2;
int64_t strideA_padded[4];
int64_t outstrideA_padded[4];
int64_t filterstrideA_padded[4];
generateStrides(w_dim_padded, filterstrideA_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(x_dim_padded, strideA_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(y_dim_padded, outstrideA_padded, 4, CUDNN_TENSOR_NHWC);
return common_conv_descriptors(cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, strideA_padded)
.setId('x')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, outstrideA_padded)
.setId('y')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, w_dim_padded)
.setStrides(4, filterstrideA_padded)
.setId('w')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::ConvDescBuilder()
.setDataType(CUDNN_DATA_FLOAT)
.setMathMode(mode)
.setNDims(convDim)
.setStrides(convDim, convstrideA)
.setPrePadding(convDim, padA)
.setPostPadding(convDim, padA)
.setDilation(convDim, dilationA)
.build());
}
using common_convbias_descriptors = std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
cudnn_frontend::Tensor>;
common_convbias_descriptors
create_conv_bias_add_act_descriptors(int64_t* x_dim_padded,
int64_t* padA,
int64_t* convstrideA,
int64_t* dilationA,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType) {
const int convDim = 2;
int64_t b_dim_padded[4];
b_dim_padded[0] = 1;
b_dim_padded[1] = y_dim_padded[1];
b_dim_padded[2] = 1;
b_dim_padded[3] = 1;
int64_t x_stride_padded[4];
int64_t y_stride_padded[4];
int64_t w_stride_padded[4];
int64_t b_stride_padded[4];
generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
return common_convbias_descriptors(cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setId('x')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('y')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, w_dim_padded)
.setStrides(4, w_stride_padded)
.setId('w')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, b_dim_padded)
.setStrides(4, b_stride_padded)
.setId('z')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, b_dim_padded)
.setStrides(4, b_stride_padded)
.setId('b')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setVirtual()
.setId('A') // after add
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setVirtual()
.setId('B') // after bias
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('C') // after conv
.setAlignment(16)
.setVirtual()
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('i')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('D') // after optional add
.setAlignment(16)
.setVirtual()
.setDataType(dataType)
.build());
}
// tensor descriptors used for dgrad
enum {
X_OR_DX_TENSOR,
DY_TENSOR,
W_OR_DW_TENSOR,
SCALE_TENSOR,
RELU_TENSOR,
AFTER_DCONV_TENSOR,
AFTER_DRELU_TENSOR,
};
using dconv_descriptors = std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor,
cudnn_frontend::Tensor>;
dconv_descriptors
create_dconv_descriptors(int64_t* x_dim_padded,
int64_t* padA,
int64_t* convstrideA,
int64_t* dilationA,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType) {
const int convDim = 2;
int64_t b_dim_padded[4];
b_dim_padded[0] = 1;
b_dim_padded[1] = x_dim_padded[1];
b_dim_padded[2] = 1;
b_dim_padded[3] = 1;
int64_t x_stride_padded[4];
int64_t y_stride_padded[4];
int64_t w_stride_padded[4];
int64_t b_stride_padded[4];
generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
return dconv_descriptors(cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setId('x')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('y')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, w_dim_padded)
.setStrides(4, w_stride_padded)
.setId('w')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, b_dim_padded)
.setStrides(4, b_stride_padded)
.setId('s')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setId('r')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setVirtual()
.setId('A') // after dconv
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setVirtual()
.setId('B') // after drelu
.setAlignment(16)
.setDataType(dataType)
.build());
}
// create a cache for plan
std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache;
// TODO: better name
std::string getConvFusionString(int64_t* x_dim_padded,
int64_t* padA,
int64_t* convstrideA,
int64_t* dilationA,
int64_t* w_dim_padded,
cudnnDataType_t dataType,
std::string fusion_string) {
for(int i=0;i<4;i++) {
fusion_string += 'X';
fusion_string += std::to_string(x_dim_padded[i]);
}
for(int i=0;i<4;i++) {
fusion_string += 'W';
fusion_string += std::to_string(w_dim_padded[i]);
}
for(int i=0;i<2;i++) {
fusion_string += 'P';
fusion_string += std::to_string(padA[i]);
}
for(int i=0;i<2;i++) {
fusion_string += 'S';
fusion_string += std::to_string(convstrideA[i]);
}
for(int i=0;i<2;i++) {
fusion_string += 'D';
fusion_string += std::to_string(dilationA[i]);
}
fusion_string += 'T';
fusion_string += std::to_string(dataType);
return fusion_string;
}
cudnn_frontend::ExecutionPlan& getOrCreatePlan(cudnnHandle_t handle_,
std::stringstream& log_buf,
cudnn_frontend::OperationGraph& opGraph,
std::string cache_string,
bool use_heuristic = true){
auto it = plan_cache.find(cache_string);
if (it != plan_cache.end()) {
DEBUG_CUDNN_MSG(log_buf, "Found plan in cache");
return it->second;
} else {
if (use_heuristic){
// TODO: confirm which mode to use
auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
.setOperationGraph(opGraph)
.setHeurMode(CUDNN_HEUR_MODE_INSTANT)
.build();
// try 3 times for now as WAR for no heuristic training
int max_tries = 3, count = 0;
auto& engine_configs = heuristics.getEngineConfig(max_tries);
while(true) {
try {
plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder()
.setHandle(handle_)
.setEngineConfig(engine_configs[count], opGraph.getTag())
.build()));
break;
} catch (cudnn_frontend::cudnnException e) {
if (++count == max_tries) throw e;
}
}
}else{
DEBUG_CUDNN_MSG(log_buf, "No plan in cache");
// How many engines support this operation graph ?
auto total_engines = opGraph.getEngineCount();
DEBUG_CUDNN_MSG(log_buf, opGraph.describe() << " has " << total_engines << " engines.");
// We have to randomly pick one engine from [0, total_engines)
// Selecting "0" by default
auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build();
DEBUG_CUDNN_MSG(log_buf, engine.describe());
auto& knobs = engine.getSupportedKnobs();
for (auto it = std::begin(knobs); it != std::end(knobs); ++it) {
DEBUG_CUDNN_MSG(log_buf, it->describe());
}
if (knobs.begin() != knobs.end()) {
DEBUG_CUDNN_MSG(log_buf, "Updated knob choice");
knobs.begin()->setChoice(knobs.begin()->getMinValue() + 1);
DEBUG_CUDNN_MSG(log_buf, knobs.begin()->describe());
}
// Create the requisite engine config
auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build();
DEBUG_CUDNN_MSG(log_buf, engine_config.describe());
plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build()));
}
return plan_cache.find(cache_string)->second;
}
}
void
run_conv_scale_bias_add_activation(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
at::Half* devPtrZ,
at::Half* devPtrB,
at::Half* devPtrI) {
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
std::stringstream log_buf;
try {
int convDim = 2;
// Creates the necessary tensor descriptors
common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
// Define the add operation
auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_MUL)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
// Define the bias operation
auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_ADD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
// optional add
auto addDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_ADD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
// Define the activation operation
auto actDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_RELU_FWD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
// Define the convolution problem
auto convDesc = cudnn_frontend::ConvDescBuilder()
.setDataType(CUDNN_DATA_FLOAT)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(convDim)
.setStrides(convDim, convstride)
.setPrePadding(convDim, pad)
.setPostPadding(convDim, pad)
.setDilation(convDim, dilation)
.build();
DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
float alpha = 1.0f;
float beta = 0.0f;
// Create a convolution Node
auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
.setxDesc(std::get(tensors))
.setwDesc(std::get(tensors))
.setyDesc(std::get(tensors))
.setcDesc(convDesc)
.setAlpha(alpha)
.setBeta(beta)
.build();
DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
// Create a Add Node with scaling parameters.
auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(conv_op.getOutputTensor())
.setbDesc(std::get(tensors))
.setyDesc(std::get(tensors))
.setpwDesc(scaleDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
// Create a Bias Node.
auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(scale_op.getOutputTensor())
.setbDesc(std::get(tensors))
.setyDesc(std::get(tensors))
.setpwDesc(biasDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
// Create an optional add Node.
auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(bias_op.getOutputTensor())
.setbDesc(std::get(tensors))
.setyDesc(std::get(tensors))
.setpwDesc(addDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, add_op.describe());
// Create an Activation Node.
auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
.setyDesc(std::get(tensors))
.setpwDesc(actDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, act_op.describe());
// Create an Operation Graph. In this case it is convolution add bias activation
std::array ops = {&conv_op, &scale_op, &bias_op, devPtrI ? &add_op : &act_op, &act_op};
auto opGraph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle_)
.setOperationGraph(devPtrI ? ops.size() : 4, ops.data())
.build();
// Create string encoding for plan caching
auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
auto workspace_size = plan.getWorkspaceSize();
DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
void* workspace_ptr = nullptr;
auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
if (workspace_size > 0) {
workspace_ptr = workspace_tensor.data_ptr();
}
void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(devPtrI ? 6 : 5, data_ptrs)
.setUids(devPtrI ? 6 : 5, uids)
.build();
DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
checkCudnnErr(status);
cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
} catch (cudnn_frontend::cudnnException e) {
std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
}
}
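// Illustrative call sketch for the fused conv+scale+bias(+add)+ReLU above (not part of the
// original file; the dimensions and the device pointers x, w, y, z, b are made-up
// placeholders). Passing nullptr for the residual input devPtrI drops the optional add node,
// exactly as the graph construction above does:
//
//   int64_t x_dim[]  = {1, 64, 56, 56};   // NCHW
//   int64_t w_dim[]  = {64, 64, 1, 1};    // KCRS, 1x1 conv
//   int64_t y_dim[]  = {1, 64, 56, 56};
//   int64_t pad[]    = {0, 0};
//   int64_t stride[] = {1, 1};
//   int64_t dil[]    = {1, 1};
//   run_conv_scale_bias_add_activation(x_dim, pad, stride, dil, w_dim, y_dim,
//                                      CUDNN_DATA_HALF, x, w, y, z, b, /*devPtrI=*/nullptr);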
void
run_conv_scale_bias(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
at::Half* devPtrZ,
at::Half* devPtrB) {
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
std::stringstream log_buf;
try {
int convDim = 2;
// Creates the necessary tensor descriptors
common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
// Define the scale operation
auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_MUL)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
// Define the bias operation
auto addDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_ADD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
// Define the convolution problem
auto convDesc = cudnn_frontend::ConvDescBuilder()
.setDataType(CUDNN_DATA_FLOAT)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(convDim)
.setStrides(convDim, convstride)
.setPrePadding(convDim, pad)
.setPostPadding(convDim, pad)
.setDilation(convDim, dilation)
.build();
DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
float alpha = 1.0f;
float beta = 0.0f;
// Create a convolution Node
auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
.setxDesc(std::get(tensors))
.setwDesc(std::get(tensors))
.setyDesc(std::get(tensors))
.setcDesc(convDesc)
.setAlpha(alpha)
.setBeta(beta)
.build();
DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
// Create a Add Node with scaling parameters.
auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(conv_op.getOutputTensor())
.setbDesc(std::get(tensors))
.setyDesc(std::get(tensors)) // TODO: change enum to aftermul
.setpwDesc(scaleDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
// Create a Bias Node.
auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(scale_op.getOutputTensor())
.setbDesc(std::get(tensors))
.setyDesc(std::get(tensors))
.setpwDesc(addDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, add_op.describe());
// Create an Operation Graph. In this case it is convolution add bias activation
std::array ops = {&conv_op, &scale_op, &add_op};
auto opGraph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle_)
.setOperationGraph(ops.size(), ops.data())
.build();
// Create string encoding for plan caching
auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
auto workspace_size = plan.getWorkspaceSize();
DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
void* workspace_ptr = nullptr;
auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
if (workspace_size > 0) {
workspace_ptr = workspace_tensor.data_ptr();
}
void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB};
int64_t uids[] = {'x', 'y', 'w', 'z', 'b'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(5, data_ptrs)
.setUids(5, uids)
.build();
DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
checkCudnnErr(status);
cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
} catch (cudnn_frontend::cudnnException e) {
std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
}
}
void
run_dconv_drelu_dscale(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
at::Half* devPtrZ,
at::Half* devPtrR) {
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
std::stringstream log_buf;
try {
int convDim = 2;
// Creates the necessary tensor descriptors
dconv_descriptors tensors = create_dconv_descriptors(
x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
// Define the convolution problem
auto convDesc = cudnn_frontend::ConvDescBuilder()
.setDataType(CUDNN_DATA_FLOAT)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(convDim)
.setStrides(convDim, convstride)
.setPrePadding(convDim, pad)
.setPostPadding(convDim, pad)
.setDilation(convDim, dilation)
.build();
DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
// Define the activation backward operation
auto actDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_RELU_BWD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
// Define the scale backward operation
auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_MUL)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
float alpha = 1.0f;
float beta = 0.0f;
// Create a convolution Node
auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
.setdxDesc(std::get(tensors))
.setwDesc(std::get(tensors))
.setdyDesc(std::get(tensors))
.setcDesc(convDesc)
.setAlpha(alpha)
.setBeta(beta)
.build();
DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
// TODO: do we need getOutputTensor(), and what it returns in backward case?
// Create an relu backward Node.
auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setdyDesc(std::get(tensors))
.setxDesc(std::get(tensors))
.setdxDesc(std::get(tensors))
.setpwDesc(actDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, act_op.describe());
// Create a Scale Node.
auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(std::get(tensors))
.setbDesc(std::get(tensors))
.setyDesc(std::get(tensors))
.setpwDesc(scaleDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
// Create an Operation Graph. In this case it is convolution add bias activation
std::array ops = {&conv_op, &act_op, &scale_op};
auto opGraph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle_)
.setOperationGraph(ops.size(), ops.data())
.build();
// Create string encoding for plan caching
auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
auto workspace_size = plan.getWorkspaceSize();
DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
void* workspace_ptr = nullptr;
auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
if (workspace_size > 0) {
workspace_ptr = workspace_tensor.data_ptr();
}
void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR};
int64_t uids[] = {'x', 'y', 'w', 's', 'r'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(5, data_ptrs)
.setUids(5, uids)
.build();
DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
checkCudnnErr(status);
cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
} catch (cudnn_frontend::cudnnException e) {
std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
}
}
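// Per-element reference of what this fusion typically computes (sketch, kept deliberately
// loose since the exact tensor wiring lives in the descriptor tuple): given the upstream
// gradient g, the saved forward ReLU output r (devPtrR) and the per-channel scale s (devPtrZ),
//   dgrad  = conv_dgrad(g, w)        // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA
//   drelu  = (r > 0) ? dgrad : 0     // CUDNN_POINTWISE_RELU_BWD
//   dx     = drelu * s               // CUDNN_POINTWISE_MUL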
void
run_dconv(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
cudnnBackendDescriptorType_t mode) {
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
std::stringstream log_buf;
try {
int convDim = 2;
// Creates the necessary tensor descriptors
dconv_descriptors tensors = create_dconv_descriptors(
x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
// Define the convolution problem
auto convDesc = cudnn_frontend::ConvDescBuilder()
.setDataType(CUDNN_DATA_FLOAT)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(convDim)
.setStrides(convDim, convstride)
.setPrePadding(convDim, pad)
.setPostPadding(convDim, pad)
.setDilation(convDim, dilation)
.build();
DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
float alpha = 1.0f;
float beta = 0.0f;
// Create a convolution Node
// mode should be one of following
// CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
// CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
auto conv_op_builder = cudnn_frontend::OperationBuilder(mode);
if (mode == CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
conv_op_builder.setdxDesc(std::get(tensors))
.setwDesc(std::get(tensors))
.setdyDesc(std::get(tensors))
.setcDesc(convDesc)
.setAlpha(alpha)
.setBeta(beta);
}
else {
conv_op_builder.setxDesc(std::get(tensors))
.setdwDesc(std::get(tensors))
.setdyDesc(std::get(tensors))
.setcDesc(convDesc)
.setAlpha(alpha)
.setBeta(beta);
}
auto conv_op = conv_op_builder.build();
DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
// Create an Operation Graph. In this case it is convolution add bias activation
std::array ops = {&conv_op};
auto opGraph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle_)
.setOperationGraph(ops.size(), ops.data())
.build();
// Create string encoding for plan caching
auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
auto workspace_size = plan.getWorkspaceSize();
DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
void* workspace_ptr = nullptr;
auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
if (workspace_size > 0) {
workspace_ptr = workspace_tensor.data_ptr();
}
void* data_ptrs[] = {devPtrX, devPtrY, devPtrW};
int64_t uids[] = {'x', 'y', 'w'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(3, data_ptrs)
.setUids(3, uids)
.build();
DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
checkCudnnErr(status);
cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
} catch (cudnn_frontend::cudnnException e) {
std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
}
}
void
run_dconv_add(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
at::Half* devPtrR) {
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
std::stringstream log_buf;
try {
int convDim = 2;
// Creates the necessary tensor descriptors
dconv_descriptors tensors = create_dconv_descriptors(
x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get(tensors).describe());
// Define the convolution problem
auto convDesc = cudnn_frontend::ConvDescBuilder()
.setDataType(CUDNN_DATA_FLOAT)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(convDim)
.setStrides(convDim, convstride)
.setPrePadding(convDim, pad)
.setPostPadding(convDim, pad)
.setDilation(convDim, dilation)
.build();
DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
// Define the add backward operation
auto addDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_ADD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
float alpha = 1.0f;
float beta = 0.0f;
// Create a convolution Node
auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
.setdxDesc(std::get(tensors))
.setwDesc(std::get(tensors))
.setdyDesc(std::get(tensors))
.setcDesc(convDesc)
.setAlpha(alpha)
.setBeta(beta)
.build();
DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
// TODO: do we need getOutputTensor(), and what it returns in backward case?
// Create add Node.
auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(std::get(tensors))
.setbDesc(std::get(tensors))
.setyDesc(std::get(tensors))
.setpwDesc(addDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, add_op.describe());
// Create an Operation Graph. In this case it is convolution add bias activation
std::array ops = {&conv_op, &add_op};
auto opGraph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle_)
.setOperationGraph(ops.size(), ops.data())
.build();
// Create string encoding for plan caching
auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
auto workspace_size = plan.getWorkspaceSize();
DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
void* workspace_ptr = nullptr;
auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
if (workspace_size > 0) {
workspace_ptr = workspace_tensor.data_ptr();
}
void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrR};
int64_t uids[] = {'x', 'y', 'w', 'r'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(4, data_ptrs)
.setUids(4, uids)
.build();
DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
checkCudnnErr(status);
cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
} catch (cudnn_frontend::cudnnException e) {
std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
}
}
// inputs contains x,w,z,b,(i)
std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
std::cout << std::fixed;
// create output vector
std::vector<at::Tensor> outputs;
auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
// setup dimensions
int64_t dimA[] = {0, 0, 0, 0};
int64_t filterdimA1[] = {0, 0, 0, 0};
int64_t filterdimA2[] = {0, 0, 0, 0};
int64_t filterdimA3[] = {0, 0, 0, 0};
int64_t filterdimA4[] = {0, 0, 0, 0};
// All dim calculation after this order of n,c,h,w
int axis[] {0,1,2,3};
if (explicit_nhwc) {
axis[0] = 0;
axis[1] = 3;
axis[2] = 1;
axis[3] = 2;
}
for (int dim=0;dim<4;dim++) {
dimA[dim] = inputs[0].size(axis[dim]);
filterdimA1[dim] = inputs[1].size(axis[dim]);
filterdimA2[dim] = inputs[2].size(axis[dim]);
filterdimA3[dim] = inputs[3].size(axis[dim]);
}
if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
for (int dim=0;dim<4;dim++) {
filterdimA4[dim] = inputs[10].size(axis[dim]);
}
}
// output dim in n,c,h,w used by backend
int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below
// use these fixed value for test run
int64_t padA[] = {0, 0};
int64_t padA1[] = {1, 1};
int64_t dilationA[] = {1, 1};
int64_t convstrideA[] = {1, 1};
int64_t convstride1X1[] = {stride_1X1, stride_1X1};
// compute output from pad/stride/dilation
outdimA1[0] = dimA[0];
outdimA1[1] = filterdimA1[0];
for (int dim = 0; dim < 2; dim++) {
outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
}
outdimA2[0] = outdimA1[0];
outdimA2[1] = filterdimA2[0];
for (int dim = 0; dim < 2; dim++) {
outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
}
outdimA3[0] = outdimA2[0];
outdimA3[1] = filterdimA3[0];
for (int dim = 0; dim < 2; dim++) {
outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
}
// Create output tensor in the correct shape in pytorch's view
int64_t outdim1[] = {0, 0, 0, 0};
int64_t outdim2[] = {0, 0, 0, 0};
int64_t outdim3[] = {0, 0, 0, 0};
if (explicit_nhwc) {
axis[0] = 0;
axis[1] = 2;
axis[2] = 3;
axis[3] = 1;
}
for (int dim=0;dim<4;dim++) {
outdim1[dim] = outdimA1[axis[dim]];
outdim2[dim] = outdimA2[axis[dim]];
outdim3[dim] = outdimA3[axis[dim]];
}
// run
at::Half* x = inputs[0].data_ptr<at::Half>();
at::Half* w = inputs[1].data_ptr<at::Half>();
at::Half* z = inputs[4].data_ptr<at::Half>();
at::Half* b = inputs[7].data_ptr<at::Half>();
auto out1 = at::empty(outdim1, inputs[0].type(), output_format);
at::Half* y1 = out1.data_ptr<at::Half>();
run_conv_scale_bias_add_activation(dimA,
padA,
convstride1X1,
dilationA,
filterdimA1,
outdimA1,
CUDNN_DATA_HALF,
x,
w,
y1,
z,
b,
nullptr);
DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item());
w = inputs[2].data_ptr<at::Half>();
z = inputs[5].data_ptr<at::Half>();
b = inputs[8].data_ptr<at::Half>();
auto out2 = at::empty(outdim2, inputs[0].type(), output_format);
at::Half* y2 = out2.data_ptr<at::Half>();
run_conv_scale_bias_add_activation(outdimA1,
padA1,
convstrideA,
dilationA,
filterdimA2,
outdimA2,
CUDNN_DATA_HALF,
y1,
w,
y2,
z,
b,
nullptr);
DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item());
// create output of conv3
auto out3 = at::empty(outdim3, inputs[0].type(), output_format);
at::Half* y3 = out3.data_ptr<at::Half>();
// create output of conv4 that may exist
auto identity = at::empty_like(out3);
at::Half* yi = identity.data_ptr<at::Half>();
if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]){
w = inputs[10].data_ptr<at::Half>();
z = inputs[11].data_ptr<at::Half>();
b = inputs[12].data_ptr<at::Half>();
run_conv_scale_bias(dimA,
padA,
convstride1X1,
dilationA,
filterdimA4,
outdimA3,
CUDNN_DATA_HALF,
x,
w,
yi,
z,
b);
DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item());
}
else {
yi = x;
}
w = inputs[3].data_ptr<at::Half>();
z = inputs[6].data_ptr<at::Half>();
b = inputs[9].data_ptr<at::Half>();
run_conv_scale_bias_add_activation(outdimA2,
padA,
convstrideA,
dilationA,
filterdimA3,
outdimA3,
CUDNN_DATA_HALF,
y2,
w,
y3,
z,
b,
yi);
DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item());
outputs.push_back(out1);
outputs.push_back(out2);
outputs.push_back(out3);
return outputs;
}
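// Layout of the inputs vector as consumed by bottleneck_forward above (derived from the
// indexing in the code, recorded here for readability):
//   inputs[0]        x (fp16, NHWC when explicit_nhwc, otherwise channels_last NCHW)
//   inputs[1..3]     conv weights w1, w2, w3
//   inputs[4..6]     per-channel scales z1, z2, z3
//   inputs[7..9]     per-channel biases b1, b2, b3
//   inputs[10..12]   downsample conv weight/scale/bias (only read when stride_1X1 != 1
//                    or the channel count changes)
// The returned outputs are the three post-ReLU activations out1, out2, out3.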
std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
bool requires_grad = inputs[0].requires_grad();
std::cout << std::fixed;
// create output vector
std::vector<at::Tensor> outputs;
auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
// setup dimensions
int64_t dimA[] = {0, 0, 0, 0};
int64_t filterdimA1[] = {0, 0, 0, 0};
int64_t filterdimA2[] = {0, 0, 0, 0};
int64_t filterdimA3[] = {0, 0, 0, 0};
int64_t filterdimA4[] = {0, 0, 0, 0};
// All dim calculation after this order of n,c,h,w
int axis[] {0,1,2,3};
if (explicit_nhwc) {
axis[0] = 0;
axis[1] = 3;
axis[2] = 1;
axis[3] = 2;
}
for (int dim=0;dim<4;dim++) {
dimA[dim] = inputs[0].size(axis[dim]);
filterdimA1[dim] = inputs[1].size(axis[dim]);
filterdimA2[dim] = inputs[2].size(axis[dim]);
filterdimA3[dim] = inputs[3].size(axis[dim]);
}
if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
for (int dim=0;dim<4;dim++) {
filterdimA4[dim] = inputs[14].size(axis[dim]);
}
}
// output dim in n,c,h,w used by backend
int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below
// use these fixed value for test run
int64_t padA[] = {0, 0};
int64_t padA1[] = {1, 1};
int64_t dilationA[] = {1, 1};
int64_t convstrideA[] = {1, 1};
int64_t convstride1X1[] = {stride_1X1, stride_1X1};
// compute output from pad/stride/dilation
outdimA1[0] = dimA[0];
outdimA1[1] = filterdimA1[0];
for (int dim = 0; dim < 2; dim++) {
outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
}
outdimA2[0] = outdimA1[0];
outdimA2[1] = filterdimA2[0];
for (int dim = 0; dim < 2; dim++) {
outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
}
outdimA3[0] = outdimA2[0];
outdimA3[1] = filterdimA3[0];
for (int dim = 0; dim < 2; dim++) {
outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
}
// Create output tensor in the correct shape in pytorch's view
int64_t outdim1[] = {0, 0, 0, 0};
int64_t outdim2[] = {0, 0, 0, 0};
int64_t outdim3[] = {0, 0, 0, 0};
if (explicit_nhwc) {
axis[0] = 0;
axis[1] = 2;
axis[2] = 3;
axis[3] = 1;
}
for (int dim=0;dim<4;dim++) {
outdim1[dim] = outdimA1[axis[dim]];
outdim2[dim] = outdimA2[axis[dim]];
outdim3[dim] = outdimA3[axis[dim]];
}
// dconv3+drelu2+dscale2
at::Half* conv_in = inputs[13].data_ptr<at::Half>();
at::Half* dy3 = inputs[10].data_ptr<at::Half>();
DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item());
// wgrad
auto wgrad3 = at::empty_like(inputs[3]);
at::Half* dw3 = wgrad3.data_ptr<at::Half>();
run_dconv(outdimA2,
padA,
convstrideA,
dilationA,
filterdimA3,
outdimA3,
CUDNN_DATA_HALF,
conv_in,
dw3,
dy3,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
// dgrad
auto grad_out2 = at::empty(outdim2, inputs[0].type(), output_format);
at::Half* dy2 = grad_out2.data_ptr<at::Half>();
at::Half* w = inputs[3].data_ptr<at::Half>();
at::Half* z = inputs[5].data_ptr<at::Half>();
at::Half* relu2 = inputs[13].data_ptr<at::Half>();
run_dconv_drelu_dscale(outdimA2,
padA,
convstrideA,
dilationA,
filterdimA3,
outdimA3,
CUDNN_DATA_HALF,
dy2,
w,
dy3,
z,
relu2);
DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item());
// dconv2+drelu1+dscale1
conv_in = inputs[12].data_ptr<at::Half>();
// wgrad
auto wgrad2 = at::empty_like(inputs[2]);
at::Half* dw2 = wgrad2.data_ptr<at::Half>();
run_dconv(outdimA1,
padA1,
convstrideA,
dilationA,
filterdimA2,
outdimA2,
CUDNN_DATA_HALF,
conv_in,
dw2,
dy2,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
// dgrad
auto grad_out1 = at::empty(outdim1, inputs[0].type(), output_format);
at::Half* dy1 = grad_out1.data_ptr<at::Half>();
w = inputs[2].data_ptr<at::Half>();
z = inputs[4].data_ptr<at::Half>();
at::Half* relu1 = inputs[12].data_ptr<at::Half>();
// fused dgrad
run_dconv_drelu_dscale(outdimA1,
padA1,
convstrideA,
dilationA,
filterdimA2,
outdimA2,
CUDNN_DATA_HALF,
dy1,
w,
dy2,
z,
relu1);
/*
// backward strided conv cannot be fused
// if stride == 1 but channel changes, we can fuse here
if (stride_1X1 != 1){
// dgrad
run_dconv(outdimA1,
padA1,
convstride1X1,
dilationA,
filterdimA2,
outdimA2,
CUDNN_DATA_HALF,
dy1,
w,
dy2,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
// mul fused mask
grad_out1.mul_(inputs[15]);
}
else {
at::Half* relu1 = inputs[12].data_ptr();
// fused dgrad
run_dconv_drelu_dscale(outdimA1,
padA1,
convstride1X1,
dilationA,
filterdimA2,
outdimA2,
CUDNN_DATA_HALF,
dy1,
w,
dy2,
z,
relu1);
}
*/
DEBUG_MSG("[DEBUG] new dconv1 : " << grad_out1.to(at::kFloat).sum().item());
// create grads of conv4 that may exist
auto grad_x_conv4 = at::empty_like(inputs[0]);
at::Half* dx_conv4 = grad_x_conv4.data_ptr<at::Half>();
at::Tensor wgrad4;
// x used for dconv1 and dconv4 wgrad
at::Half* x = inputs[0].data_ptr<at::Half>();
if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]){
w = inputs[14].data_ptr<at::Half>();
at::Half* dy_conv4 = inputs[11].data_ptr<at::Half>();
if (requires_grad) {
run_dconv(dimA,
padA,
convstride1X1,
dilationA,
filterdimA4,
outdimA3,
CUDNN_DATA_HALF,
dx_conv4,
w,
dy_conv4,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
// we don't print here since we can't hook out this grad in pytorch alone to compare, due to addition with dx
// DEBUG_MSG("[DEBUG] new dx_identity : " << grad_x_conv4.to(at::kFloat).sum().item());
}
// wgrad
wgrad4 = at::empty_like(inputs[14]);
at::Half* dw4 = wgrad4.data_ptr<at::Half>();
run_dconv(dimA,
padA,
convstride1X1,
dilationA,
filterdimA4,
outdimA3,
CUDNN_DATA_HALF,
x,
dw4,
dy_conv4,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
}
else {
// if there is no downsample, dx_conv4 is fork of drelu3
dx_conv4 = inputs[11].data_ptr<at::Half>();
}
// dconv1+add
// wgrad
auto wgrad1 = at::empty_like(inputs[1]);
at::Half* dw1 = wgrad1.data_ptr<at::Half>();
run_dconv(dimA,
padA,
convstride1X1,
dilationA,
filterdimA1,
outdimA1,
CUDNN_DATA_HALF,
x,
dw1,
dy1,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
// dgrad
w = inputs[1].data_ptr<at::Half>();
auto grad_x = at::empty_like(inputs[0]);
at::Half* dx = grad_x.data_ptr<at::Half>();
// backward strided conv cannot be fused
// if stride == 1 but channel changes, we can fuse here
if (requires_grad){
if (stride_1X1 != 1){
run_dconv(dimA,
padA,
convstride1X1,
dilationA,
filterdimA1,
outdimA1,
CUDNN_DATA_HALF,
dx,
w,
dy1,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
// add 2 together
grad_x.add_(grad_x_conv4);
}
else {
run_dconv_add(dimA,
padA,
convstride1X1,
dilationA,
filterdimA1,
outdimA1,
CUDNN_DATA_HALF,
dx,
w,
dy1,
dx_conv4);
}
}
DEBUG_MSG("[DEBUG] new dx : " << grad_x.to(at::kFloat).sum().item());
DEBUG_MSG("[DEBUG] new wgrad1 : " << wgrad1.to(at::kFloat).sum().item());
DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item());
DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item());
outputs.push_back(grad_x);
outputs.push_back(wgrad1);
outputs.push_back(wgrad2);
outputs.push_back(wgrad3);
if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
DEBUG_MSG("[DEBUG] new wgrad4 : " << wgrad4.to(at::kFloat).sum().item());
outputs.push_back(wgrad4);
}
return outputs;
}
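// Layout of the inputs vector as consumed by bottleneck_backward above (derived from the
// indexing in the code):
//   inputs[0]       x from the forward pass
//   inputs[1..3]    conv weights w1, w2, w3
//   inputs[4..5]    scales z1, z2 (re-used by the fused drelu/dscale kernels)
//   inputs[10]      upstream gradient w.r.t. the block output (dy of relu3)
//   inputs[11]      gradient routed to the residual path (downsample dgrad input, or a
//                   fork of drelu3 when there is no downsample)
//   inputs[12..13]  saved relu1 and relu2 activations
//   inputs[14]      downsample conv weight (only when the downsample path exists)
// Outputs are grad_x and wgrad1..wgrad3, plus wgrad4 when the downsample path exists.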
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &bottleneck_forward, "Bottleneck block forward");
m.def("backward", &bottleneck_backward, "Bottleneck block backward");
}
================================================
FILE: KoSentenceT5/apex/contrib/csrc/fmha/fmha_api.cpp
================================================
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "fmha.h"
void set_params(Fused_multihead_attention_fprop_params &params,
// sizes
const size_t b,
const size_t s,
const size_t h,
const size_t d,
// device pointers
void *qkv_packed_d,
void *cu_seqlens_d,
void *o_packed_d,
void *s_d,
float p_dropout) {
Data_type acc_type = DATA_TYPE_FP32;
Data_type data_type = DATA_TYPE_FP16;
// Reset the parameters
memset(¶ms, 0, sizeof(params));
// Set the pointers and strides.
params.qkv_ptr = qkv_packed_d;
params.qkv_stride_in_bytes = get_size_in_bytes(h * 3 * d, data_type);
params.o_ptr = o_packed_d;
params.o_stride_in_bytes = get_size_in_bytes(h * d, data_type);
params.cu_seqlens = static_cast<int *>(cu_seqlens_d);
// S = softmax(P)
params.s_ptr = s_d;
params.s_stride_in_bytes = get_size_in_bytes(b * h * s, data_type);
// Set the dimensions.
params.b = b;
params.h = h;
params.s = s;
params.d = d;
// Set the different scale values.
const float scale_bmm1 = 1.f / sqrtf(d);
constexpr float scale_softmax = 1.f;
constexpr float scale_bmm2 = 1.f;
set_alpha(params.scale_bmm1, scale_bmm1, acc_type);
set_alpha(params.scale_softmax, scale_softmax, acc_type);
set_alpha(params.scale_bmm2, scale_bmm2, data_type);
// Set this to probability of keeping an element to simplify things.
params.p_dropout = 1.f - p_dropout;
params.rp_dropout = 1.f / params.p_dropout;
TORCH_CHECK(p_dropout < 1.f);
set_alpha(params.scale_dropout, params.rp_dropout, data_type);
}
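// Worked example of the strides set above (sketch): for fp16 BERT-large style sizes
// h = 16 heads, d = 64, s = 512, b = 32,
//   qkv_stride_in_bytes = h * 3 * d * sizeof(half) = 16 * 3 * 64 * 2   = 6144
//   o_stride_in_bytes   = h * d * sizeof(half)     = 16 * 64 * 2       = 2048
//   s_stride_in_bytes   = b * h * s * sizeof(half) = 32 * 16 * 512 * 2 = 524288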
std::vector<at::Tensor>
mha_fwd(const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
const at::Tensor &cu_seqlens, // b+1
const float p_dropout,
const int max_seq_len,
const bool is_training,
c10::optional<at::Generator> gen_) {
auto dprops = at::cuda::getCurrentDeviceProperties();
TORCH_CHECK(dprops->major == 8 && dprops->minor == 0);
int seq_len = 512;
auto launch = &run_fmha_fp16_512_64_sm80;
if( max_seq_len <= 128 ) {
seq_len = 128;
launch = &run_fmha_fp16_128_64_sm80;
} else if( max_seq_len <= 256 ) {
seq_len = 256;
launch = &run_fmha_fp16_256_64_sm80;
} else if( max_seq_len <= 384 ) {
seq_len = 384;
launch = &run_fmha_fp16_384_64_sm80;
} else if( max_seq_len <= 512 ) {
seq_len = 512;
launch = &run_fmha_fp16_512_64_sm80;
} else {
TORCH_CHECK(false);
}
constexpr int warps_m = 1;
constexpr int warps_n = 4; // this leads to an upper bound
const int mmas_m = seq_len / 16 / warps_m;
const int mmas_n = seq_len / 16 / warps_n;
const int elts_per_thread = 8 * mmas_m * mmas_n;
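// For the warps_m = 1, warps_n = 4 layout above, e.g. seq_len = 512 gives
// mmas_m = 512/16/1 = 32 and mmas_n = 512/16/4 = 8, so elts_per_thread = 8 * 32 * 8 = 2048
// random values per thread; this is the philox counter offset reserved further below.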
auto stream = at::cuda::getCurrentCUDAStream().stream();
TORCH_CHECK(qkv.dtype() == torch::kFloat16);
TORCH_CHECK(cu_seqlens.dtype() == torch::kInt32);
TORCH_CHECK(qkv.is_cuda())
TORCH_CHECK(cu_seqlens.is_cuda())
TORCH_CHECK(qkv.is_contiguous())
TORCH_CHECK(cu_seqlens.is_contiguous())
TORCH_CHECK(cu_seqlens.dim() == 1);
TORCH_CHECK(qkv.dim() == 4);
const auto sizes = qkv.sizes();
TORCH_CHECK(sizes[THREE_DIM] == 3);
const int batch_size = cu_seqlens.numel() - 1;
const int total = sizes[TOTAL_DIM];
const int num_heads = sizes[H_DIM];
const int head_size = sizes[D_DIM];
TORCH_CHECK(batch_size > 0);
TORCH_CHECK(head_size == 64);
auto opts = qkv.options();
auto ctx = torch::empty({ total, num_heads, head_size }, opts);
auto s = torch::empty({ batch_size, num_heads, seq_len, seq_len }, opts);
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
Fused_multihead_attention_fprop_params params;
set_params(params,
batch_size,
seq_len,
num_heads,
head_size,
qkv.data_ptr(),
cu_seqlens.data_ptr(),
ctx.data_ptr(),
s.data_ptr(),
p_dropout);
// number of times random will be generated per thread, to offset philox counter in thc random
// state
int64_t counter_offset = elts_per_thread;
at::PhiloxCudaState rng_engine_inputs;
if( is_training ) {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
}
launch(params, is_training, stream);
return { ctx, s };
}
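// Shape summary for mha_fwd above (derived from the checks and allocations in the code):
//   qkv        : [total, num_heads, 3, head_size]  fp16, packed by cu_seqlens
//   cu_seqlens : [batch_size + 1]                  int32 prefix sums of sequence lengths
//   returns ctx: [total, num_heads, head_size] and s: [batch, heads, seq_len, seq_len]
// head_size must be 64, the device must be SM 8.0, and the kernels are specialized for
// seq_len in {128, 256, 384, 512}.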
std::vector<at::Tensor>
mha_bwd(const at::Tensor &dout, // total x num_heads, x head_size
const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
at::Tensor &softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
const at::Tensor &cu_seqlens, // b+1
const float p_dropout, // probability to drop
const int max_seq_len // max sequence length to choose the kernel
) {
auto dprops = at::cuda::getCurrentDeviceProperties();
TORCH_CHECK(dprops->major == 8 && dprops->minor == 0);
int seq_len = 512;
auto launch = &run_fmha_dgrad_fp16_512_64_sm80;
if( max_seq_len <= 128 ) {
seq_len = 128;
launch = &run_fmha_dgrad_fp16_128_64_sm80;
} else if( max_seq_len <= 256 ) {
seq_len = 256;
launch = &run_fmha_dgrad_fp16_256_64_sm80;
} else if( max_seq_len <= 384 ) {
seq_len = 384;
launch = &run_fmha_dgrad_fp16_384_64_sm80;
} else if( max_seq_len <= 512 ) {
seq_len = 512;
launch = &run_fmha_dgrad_fp16_512_64_sm80;
} else {
TORCH_CHECK(false);
}
auto stream = at::cuda::getCurrentCUDAStream().stream();
TORCH_CHECK(qkv.dtype() == torch::kFloat16);
TORCH_CHECK(dout.dtype() == torch::kFloat16);
TORCH_CHECK(softmax.dtype() == torch::kFloat16);
TORCH_CHECK(cu_seqlens.dtype() == torch::kInt32);
TORCH_CHECK(qkv.is_cuda());
TORCH_CHECK(cu_seqlens.is_cuda());
TORCH_CHECK(qkv.is_contiguous());
TORCH_CHECK(cu_seqlens.is_contiguous());
TORCH_CHECK(cu_seqlens.dim() == 1);
TORCH_CHECK(qkv.dim() == 4);
const auto sizes = qkv.sizes();
TORCH_CHECK(sizes[THREE_DIM] == 3);
const int batch_size = cu_seqlens.numel() - 1;
const int num_heads = sizes[H_DIM];
const int head_size = sizes[D_DIM];
TORCH_CHECK(batch_size > 0);
TORCH_CHECK(head_size == 64);
auto dqkv = torch::empty_like(qkv);
Fused_multihead_attention_fprop_params params;
set_params(params,
batch_size,
seq_len,
num_heads,
head_size,
qkv.data_ptr(),
cu_seqlens.data_ptr(),
dout.data_ptr(), // we set o_ptr to dout
softmax.data_ptr(), // softmax gets overwritten by dP!
p_dropout);
// we're re-using these scales
Data_type acc_type = DATA_TYPE_FP32;
set_alpha(params.scale_bmm1, 1.f, acc_type);
set_alpha(params.scale_softmax, 1.f / sqrtf(head_size), acc_type);
set_alpha(params.scale_bmm2, 1.f, DATA_TYPE_FP16);
params.dqkv_ptr = dqkv.data_ptr();
launch(params, stream);
return { dqkv, softmax };
}
std::vector<at::Tensor> mha_fwd_nl(const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
const at::Tensor &cu_seqlens, // b+1
const float p_dropout,
const int max_seq_len,
const bool is_training,
c10::optional<at::Generator> gen_) {
int seq_len = 512;
auto launch = &run_fmha_fp16_512_64_sm80_nl;
TORCH_CHECK(max_seq_len == seq_len);
constexpr int warps_m = 1;
constexpr int warps_n = 4; // this leads to an upper bound
const int mmas_m = seq_len / 16 / warps_m;
const int mmas_n = seq_len / 16 / warps_n;
// static_assert( mmas_m == 32 );
// static_assert( mmas_n == 4 );
const int elts_per_thread = 8 * mmas_m * mmas_n;
auto stream = at::cuda::getCurrentCUDAStream().stream();
TORCH_CHECK(qkv.is_cuda())
TORCH_CHECK(cu_seqlens.is_cuda())
TORCH_CHECK(qkv.is_contiguous())
TORCH_CHECK(cu_seqlens.is_contiguous())
TORCH_CHECK(cu_seqlens.dim() == 1);
TORCH_CHECK(qkv.dim() == 4);
const auto sizes = qkv.sizes();
TORCH_CHECK(sizes[THREE_DIM] == 3);
const int batch_size = cu_seqlens.numel() - 1;
const int total = sizes[TOTAL_DIM];
const int num_heads = sizes[H_DIM];
const int head_size = sizes[D_DIM];
TORCH_CHECK(batch_size > 0);
TORCH_CHECK(head_size == 64);
auto opts = qkv.options();
auto ctx = torch::empty({ total, num_heads, head_size }, opts);
auto s = torch::empty({ batch_size, num_heads, seq_len, seq_len }, opts);
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(gen_, at::cuda::detail::getDefaultCUDAGenerator());
Fused_multihead_attention_fprop_params params;
set_params(params,
batch_size,
seq_len,
num_heads,
head_size,
qkv.data_ptr(),
cu_seqlens.data_ptr(),
ctx.data_ptr(),
s.data_ptr(),
p_dropout);
// number of times random will be generated per thread, to offset philox counter in thc random
// state
int64_t counter_offset = elts_per_thread;
at::PhiloxCudaState rng_engine_inputs;
if( is_training ) {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
}
int num_chunks = 3;
if(batch_size == 3) {
num_chunks = 2;
}
launch(params, is_training, num_chunks, stream);
return { ctx, s };
}
std::vector<at::Tensor> mha_bwd_nl(const at::Tensor &dout, // total x num_heads x head_size
const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
at::Tensor &softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
const at::Tensor &cu_seqlens, // b+1
const float p_dropout, // probability to drop
const int max_seq_len // max sequence length to choose the kernel
) {
auto stream = at::cuda::getCurrentCUDAStream().stream();
TORCH_CHECK(qkv.is_cuda())
TORCH_CHECK(cu_seqlens.is_cuda())
TORCH_CHECK(qkv.is_contiguous())
TORCH_CHECK(cu_seqlens.is_contiguous())
TORCH_CHECK(cu_seqlens.dim() == 1);
TORCH_CHECK(qkv.dim() == 4);
const auto sizes = qkv.sizes();
TORCH_CHECK(sizes[THREE_DIM] == 3);
const int batch_size = cu_seqlens.numel() - 1;
const int total = sizes[TOTAL_DIM];
const int num_heads = sizes[H_DIM];
const int head_size = sizes[D_DIM];
TORCH_CHECK(batch_size > 0);
TORCH_CHECK(head_size == 64);
int seq_len = 512;
auto launch = &run_fmha_dgrad_fp16_512_64_sm80_nl;
auto opts = qkv.options();
auto dqkv = torch::empty_like(qkv);
int num_chunks = 2;
if( batch_size == 1 ) {
num_chunks = 4;
}else if( batch_size == 2 ) {
num_chunks = 3;
}
auto dkv = torch::empty({total, num_chunks, 2, num_heads, head_size}, opts);
Fused_multihead_attention_fprop_params params;
set_params(params,
batch_size,
seq_len,
num_heads,
head_size,
qkv.data_ptr(),
cu_seqlens.data_ptr(),
dout.data_ptr(), // o_ptr = dout
softmax.data_ptr(), // softmax gets overwritten by dP!
p_dropout);
params.dkv_ptr = dkv.data_ptr();
Data_type acc_type = DATA_TYPE_FP32;
set_alpha(params.scale_bmm1, 1.f, acc_type);
set_alpha(params.scale_softmax, 1.f / sqrtf(head_size), acc_type);
set_alpha(params.scale_bmm2, 1.f, DATA_TYPE_FP16);
params.dqkv_ptr = dqkv.data_ptr();
launch(params, num_chunks, stream);
//SPLIT-K reduction of num_chunks dK, dV parts
// The equivalent of the following Pytorch code:
// using namespace torch::indexing;
// at::Tensor view_out = dqkv.index({Slice(), Slice(1, None, None)});
// torch::sum_out(view_out, dkv, 1);
const int hidden_size = num_heads * head_size;
fmha_run_noloop_reduce(
dqkv.data_ptr(), dkv.data_ptr(), cu_seqlens.data_ptr(), hidden_size, batch_size, total, num_chunks, stream);
return { dqkv, softmax, dkv };
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.doc() = "Fused Multi-head Self-attention for BERT";
m.def("fwd", &mha_fwd, "Forward pass");
m.def("bwd", &mha_bwd, "Backward pass");
m.def("fwd_nl", &mha_fwd_nl, "Forward pass (small-batch)");
m.def("bwd_nl", &mha_bwd_nl, "Backward pass (small-batch)");
}
================================================
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/gemm.h
================================================
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <fmha/utils.h> // for Div_up / Min helpers (header name assumed)
#define FMHA_DIV_UP(m, n) (((m) + (n)-1) / (n))
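// Small sanity check of the round-up division helper (illustrative, added for clarity):
// e.g. 384 rows split into chunks of 256 need 2 chunks.
static_assert(FMHA_DIV_UP(384, 256) == 2, "FMHA_DIV_UP rounds up");
static_assert(FMHA_DIV_UP(512, 256) == 2, "exact multiples are unchanged");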
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Data_type_, int NUM_ELTS_, int BITS_PER_ELT_, int ALIGNMENT_ >
struct Fragment_base_ {
// The data type.
using Data_type = Data_type_;
// default input type
using Input_type_ = Data_type_;
// Does it store the array of elements.
enum { HAS_ELTS = BITS_PER_ELT_ >= 8 };
// The number of elements.
enum { NUM_ELTS = NUM_ELTS_ };
// The size of element in bits.
enum { BITS_PER_ELT = BITS_PER_ELT_ };
// The size of byte of a single register.
enum { BYTES_PER_REG = 4 };
// The size in bits.
enum { BITS_PER_REG = BYTES_PER_REG * 8 };
// The number of registers needed to store the fragment.
enum { NUM_REGS = Div_up<NUM_ELTS * BITS_PER_ELT, BITS_PER_REG>::VALUE };
// The size in bytes (as returned by sizeof(Fragment_base<>).
enum { SIZE_IN_BYTES = NUM_REGS * BYTES_PER_REG };
// The alignment.
enum { ALIGNMENT = ALIGNMENT_ > 0 ? ALIGNMENT_ : Min<NUM_REGS * BYTES_PER_REG, 16>::VALUE };
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The type of the elements.
typename Data_type_,
// The number of elements.
int NUM_ELTS_,
// The alignment if you want to force a value -- use 0 otherwise.
int ALIGNMENT_ = 0,
// The base class.
typename Base_ = Fragment_base_<Data_type_, NUM_ELTS_, 8 * sizeof(Data_type_), ALIGNMENT_>
>
struct alignas(static_cast<int>(Base_::ALIGNMENT)) Fragment : public Base_ {
// The size of a load/store.
enum { BYTES_PER_LOAD_STORE = Base_::NUM_REGS * sizeof(uint32_t) };
// Clear the fragment. Using PTX in that code seems to produce better SASS...
inline __device__ void clear() {
#pragma unroll
for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
asm volatile("mov.u32 %0, 0; \n" : "=r"(this->reg(ii)) : );
}
}
// Immutable access to a register.
inline __device__ const uint32_t& reg(int ii) const {
return this->regs_[ii];
}
// Mutable access to a register.
inline __device__ uint32_t& reg(int ii) {
return this->regs_[ii];
}
uint32_t regs_[Base_::NUM_REGS];
// Immutable access to the elements.
inline __device__ const Data_type_& elt(int ii) const {
return reinterpret_cast<const Data_type_*>(&this->regs_[0])[ii];
}
// Mutable access to the elements.
inline __device__ Data_type_& elt(int ii) {
return reinterpret_cast<Data_type_*>(&this->regs_[0])[ii];
}
// Immutable access to the elements with a cast.
template< typename Cast_type >
inline __device__ const Cast_type& elt_as(int ii) const {
return reinterpret_cast<const Cast_type*>(&this->regs_[0])[ii];
}
// Mutable access to the elements.
template< typename Cast_type >
inline __device__ Cast_type& elt_as(int ii) {
return reinterpret_cast<Cast_type*>(&this->regs_[0])[ii];
}
// Add another fragment.
inline __device__ void add(const Fragment &other) {
#pragma unroll
for( int ii = 0; ii < NUM_ELTS_; ++ii ) {
this->elt(ii) += other.elt(ii);
}
}
};
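// Worked example (illustrative): a Fragment<uint16_t, 8> holds 8 fp16 values, i.e.
// NUM_ELTS = 8 and BITS_PER_ELT = 16, so NUM_REGS = 8 * 16 / 32 = 4 registers and
// SIZE_IN_BYTES = 16; add() then performs 8 element-wise additions.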
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Layout >
struct Fragment_a : public Fragment<uint16_t, 8> {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Layout >
struct Fragment_b : public Fragment<uint16_t, 8> {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Fragment_accumulator : public Fragment<float, 8> {
// The base class.
using Base = Fragment<float, 8>;
// Add two fragments.
template< typename Other_fragment_ >
inline __device__ void add(const Other_fragment_ &other) {
for( int ii = 0; ii < Base::NUM_ELTS; ++ii ) {
this->elt(ii) = this->elt(ii) + other.elt(ii);
}
}
// Do the HMMA.
template< typename Layout_a, typename Layout_b >
inline __device__ void mma(const Fragment_a<Layout_a> &a,
                           const Fragment_b<Layout_b> &b) {
asm volatile( \
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
" {%0, %1, %2, %3}, \n" \
" {%4, %5, %6, %7}, \n" \
" {%8, %9}, \n" \
" {%0, %1, %2, %3}; \n" \
: "+f"( elt(0)), "+f"( elt(1)), "+f"( elt(2)), "+f"( elt(3))
: "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3))
, "r"(b.reg(0)), "r"(b.reg(1)));
asm volatile( \
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
" {%0, %1, %2, %3}, \n" \
" {%4, %5, %6, %7}, \n" \
" {%8, %9}, \n" \
" {%0, %1, %2, %3}; \n" \
: "+f"( elt(4)), "+f"( elt(5)), "+f"( elt(6)), "+f"( elt(7))
: "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3))
, "r"(b.reg(2)), "r"(b.reg(3)));
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Fragment, int M, int N >
inline __device__ void clear(Fragment (&frag)[M][N]) {
#pragma unroll
for( int mi = 0; mi < M; ++mi ) {
#pragma unroll
for( int ni = 0; ni < N; ++ni ) {
frag[mi][ni].clear();
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Accumulator_type, int WARPS_K >
struct Clear_accumulator {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int WARPS_K >
struct Clear_accumulator<float, WARPS_K> {
template< typename Acc, int M, int N >
static inline __device__ void apply(Acc (&acc)[M][N], bool = false) {
fmha::clear(acc);
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Acc, typename A, typename B, int M, int N >
inline __device__ void gemm(Acc (&acc)[M][N], const A (&a)[M], const B (&b)[N]) {
#pragma unroll
for( int mi = 0; mi < M; ++mi ) {
#pragma unroll
for( int ni = 0; ni < N; ++ni ) {
acc[mi][ni].mma(a[mi], b[ni]);
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The number of rows in the CTA tile.
int M_,
// The number of cols in the CTA tile.
int N_,
// The number of elements in the the K dimension of the GEMM loop.
int K_,
// The number of rows of warps.
int WARPS_M_,
// The number of cols of warps.
int WARPS_N_,
// The number of warps in the K dimension of the GEMM loop.
int WARPS_K_>
struct Cta_tile_ {
enum { M = M_, N = N_, K = K_ };
// The number of warps.
enum { WARPS_M = WARPS_M_, WARPS_N = WARPS_N_, WARPS_K = WARPS_K_ };
// The number of warps per CTA.
enum { WARPS_PER_CTA = WARPS_M * WARPS_N * WARPS_K };
// The number of threads per warp.
enum { THREADS_PER_WARP = 32 };
// The number of threads per CTA.
enum { THREADS_PER_CTA = WARPS_PER_CTA * THREADS_PER_WARP };
};
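// Sanity-check sketch (illustrative): the fp16 kernels above use a 1 x 4 x 1 warp layout,
// i.e. 4 warps and 128 threads per CTA.
static_assert(Cta_tile_<128, 128, 64, 1, 4, 1>::WARPS_PER_CTA == 4, "4 warps per CTA");
static_assert(Cta_tile_<128, 128, 64, 1, 4, 1>::THREADS_PER_CTA == 128, "128 threads per CTA");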
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile >
struct Hmma_tile {
// The number of elements computed with a single warp-MMA.
enum { M_PER_MMA = 16, N_PER_MMA = 16, K_PER_MMA = 16 };
// The number of elements computed with a single CTA-MMA.
enum {
M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARPS_M,
N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARPS_N,
K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARPS_K
};
// The number of MMAs needed to compute the GEMM.
enum {
MMAS_M = Div_up<Cta_tile::M, M_PER_MMA_PER_CTA>::VALUE,
MMAS_N = Div_up<Cta_tile::N, N_PER_MMA_PER_CTA>::VALUE,
MMAS_K = Div_up<Cta_tile::K, K_PER_MMA_PER_CTA>::VALUE,
};
// The number of elements computed per warp.
enum {
M_PER_WARP = MMAS_M * M_PER_MMA,
N_PER_WARP = MMAS_N * N_PER_MMA,
K_PER_WARP = MMAS_K * K_PER_MMA,
};
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using A_type = uint16_t;
using B_type = uint16_t;
using C_type = uint16_t;
using Accumulator_type = float;
using Epilogue_type = float;
constexpr int BITS_PER_ELEMENT_A = sizeof(A_type) * 8;
constexpr int BITS_PER_ELEMENT_B = sizeof(B_type) * 8;
constexpr int BITS_PER_ELEMENT_C = sizeof(C_type) * 8;
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int M, int N, int K, int WARPS_M, int WARPS_N, int WARPS_K >
using Cta_tile_extd = Cta_tile_<M, N, K, WARPS_M, WARPS_N, WARPS_K>;
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile_, int K_ >
using Cta_tile_with_k_with_padding = Cta_tile_extd<Cta_tile_::M,
                                                   Cta_tile_::N,
                                                   Next_power_of_two<K_>::VALUE,
                                                   Cta_tile_::WARPS_M,
                                                   Cta_tile_::WARPS_N,
                                                   Cta_tile_::WARPS_K>;
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
================================================
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/gmem_tile.h
================================================
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The dimensions of the tile computed by the CTA.
typename Cta_tile,
// The number of bits per element.
int BITS_PER_ELEMENT,
// The number of rows of Q, K or V loaded by this tile.
int ROWS,
// The number of columns.
int COLS,
// The number of matrices.
int NUM_MATS = 3
>
struct Gmem_tile_qkv {
// The size of each LDG.
enum { BYTES_PER_LDG = 16 };
// The size of a row in bytes.
enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 };
// The number of threads to load a "row" of the matrix.
enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG };
// The number of "rows" loaded per LDG.
enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
// The number of LDGs needed to load a chunk of the Q matrix.
    enum { LDGS = fmha::Div_up< ROWS, ROWS_PER_LDG >::VALUE };
// Ctor.
template< typename Params, typename BInfo >
    inline __device__ Gmem_tile_qkv(const Params &params, int qkv_offset, const BInfo &binfo, int tidx)
: params_qkv_stride_in_bytes_(params.qkv_stride_in_bytes)
, actual_seqlen(binfo.actual_seqlen)
        , qkv_ptr_(reinterpret_cast<char *>(params.qkv_ptr)) {
// Compute the position in the sequence (within the CTA for the moment).
int row = tidx / THREADS_PER_ROW;
// Compute the position of the thread in the row.
int col = tidx % THREADS_PER_ROW;
// Store the row as we need it to disable the loads.
row_ = row;
// The row offset in the batched GEMM. For each seq element, we store QKV in that order.
int64_t row_offset = (int64_t)row * params.qkv_stride_in_bytes;
// Add the block index.
row_offset += (int64_t)((binfo.sum_s * NUM_MATS + qkv_offset) * binfo.h + binfo.bidh) * BYTES_PER_ROW;
// Assemble the final pointer.
qkv_ptr_ += row_offset + col * BYTES_PER_LDG;
}
// Store data to shared memory.
template< typename Smem_tile >
inline __device__ void commit(Smem_tile &smem_tile) {
smem_tile.store(fetch_);
}
// Load data from memory.
template< typename Smem_tile >
inline __device__ void load(Smem_tile &smem_tile) {
const void *ptrs[LDGS];
uint32_t preds[LDGS];
#pragma unroll
for( int ii = 0; ii < LDGS; ++ii ) {
ptrs[ii] = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_;
preds[ii] = ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen));
fetch_[ii] = make_uint4(0, 0, 0, 0);
}
// not packing predicates removes restrictions (e.g. FP16 384, 4 warps)
        Ldg_functor< uint4, LDGS > fct(fetch_, ptrs);
#pragma unroll
for( int ii = 0; ii < LDGS; ++ii ) {
fct.load(ii, preds[ii]);
}
}
// Store data to memory.
inline __device__ void store(const uint4 (&data)[LDGS]) {
#pragma unroll
for( int ii = 0; ii < LDGS; ++ii ) {
char *ptr = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_;
if( (row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen) ) {
fmha::stg(ptr, data[ii]);
}
}
}
// Move the pointer to the next location.
inline __device__ void move() {
qkv_ptr_ += (int64_t)ROWS * params_qkv_stride_in_bytes_;
actual_seqlen -= ROWS;
}
    // The stride between rows for the QKV matrix.
int64_t params_qkv_stride_in_bytes_;
// The pointer.
char *qkv_ptr_;
// The fetch registers.
uint4 fetch_[LDGS];
// Keep track of the row the thread is processing as we move the tile.
int row_;
// The length of the sequence loaded by that memory tile.
int actual_seqlen;
};
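// A minimal host-side sketch (hypothetical helper, not used by the kernels) of the packed QKV
// addressing used by the constructor above: the global tensor is laid out as
// [total_s, NUM_MATS, h, d] in fp16, so one "row" of a single head is d * 2 == BYTES_PER_ROW
// bytes and qkv_stride_in_bytes covers all NUM_MATS * h head-rows of one token.
inline int64_t sketch_qkv_byte_offset(int64_t row,                  // token index inside the tile
                                      int64_t qkv_stride_in_bytes,  // bytes per token, all matrices/heads
                                      int64_t sum_s,                // tokens of all previous sequences
                                      int64_t num_mats,             // 3: Q, K and V
                                      int64_t qkv_offset,           // 0 = Q, 1 = K, 2 = V
                                      int64_t h,                    // number of heads
                                      int64_t bidh,                 // head index
                                      int64_t bytes_per_row) {      // d * sizeof(half)
    // Same arithmetic as the Gmem_tile_qkv constructor: advance to the token's row, then select
    // the (sequence, matrix, head) slice in units of one head-row.
    return row * qkv_stride_in_bytes
           + ((sum_s * num_mats + qkv_offset) * h + bidh) * bytes_per_row;
}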
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile >
struct Gmem_tile_o {
// The mma tile.
    using Mma_tile = fmha::Hmma_tile< Cta_tile >;
// The size of each element.
enum { BYTES_PER_ELEMENT = 2 };
// The size of a row in bytes.
enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT };
// The number of threads to store a "row" of the matrix.
enum { THREADS_PER_ROW = 16 };
// The size of each STG.
enum { BYTES_PER_STG = BYTES_PER_ROW / THREADS_PER_ROW };
// The number of "rows" stored per iteration of the loop. The output of 1 MMA.
enum { ROWS = Cta_tile::M };
// The number of "rows" stored per iteration of the loop. The output of 1 MMA.
enum { ROWS_PER_LOOP = ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA };
    // The number of outer loops for the stores.
enum { LOOPS = ROWS / ROWS_PER_LOOP };
// The number of "rows" stored per STG.
enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
// Do we have to guard against partial writes/reads.
enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 };
// The number of STGs needed to store a chunk of the Q matrix.
    enum { STGS_PER_LOOP = fmha::Div_up< ROWS_PER_LOOP, ROWS_PER_STG >::VALUE };
// The number of STGs needed to store a chunk of the Q matrix in total.
enum { STGS = STGS_PER_LOOP * LOOPS };
// Ctor.
    template< typename Params, typename BInfo >
    inline __device__ Gmem_tile_o(const Params &params, const BInfo &binfo, int tidx)
: params_o_stride_in_bytes_(params.o_stride_in_bytes)
, actual_seqlen_(binfo.actual_seqlen)
        , o_ptr_(reinterpret_cast<char *>(params.o_ptr)) {
// Compute the position in the sequence (within the CTA for the moment).
int row = tidx / THREADS_PER_ROW;
// Compute the position of the thread in the row.
int col = tidx % THREADS_PER_ROW;
        // Store the row as we need it to disable the stores past the end of the sequence.
row_ = row;
// The row offset in the batched GEMM.
int64_t row_offset = (int64_t)row * params.o_stride_in_bytes + binfo.bidx * BYTES_PER_ROW;
// Assemble the final pointer.
o_ptr_ += row_offset + col * BYTES_PER_STG;
// Is that thread active on the last STG?
if( HAS_INCOMPLETE_STG ) {
is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M;
}
}
// Store data to global memory.
inline __device__ void store(const uint4 (&src)[STGS_PER_LOOP], int mi) {
#pragma unroll
for( int ii = 0; ii < STGS_PER_LOOP; ++ii ) {
int jj = mi * STGS_PER_LOOP + ii;
if( this->row_ + jj * ROWS_PER_STG >= this->actual_seqlen_ ) {
break;
}
            float x = reinterpret_cast<const float &>(src[ii].x);
            float y = reinterpret_cast<const float &>(src[ii].y);
            float z = reinterpret_cast<const float &>(src[ii].z);
            float w = reinterpret_cast<const float &>(src[ii].w);
uint2 out = float4_to_half4(x, y, z, w);
if( !HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_) ) {
fmha::stg(this->o_ptr_ + jj * ROWS_PER_STG * this->params_o_stride_in_bytes_, out);
}
}
}
// Move the pointer to the next location.
inline __device__ void move() {
row_ += ROWS;
o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_;
}
    // The stride between rows of the O matrix.
int64_t params_o_stride_in_bytes_;
// The pointer.
char *o_ptr_;
// Is the thread active for the last STG?
int is_active_for_last_stg_;
    // Keep track of the row to disable stores.
int row_;
// The length of the sequence loaded by that memory tile.
int actual_seqlen_;
};
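// A minimal sketch (hypothetical helper, not used by the kernels) of the predicate applied by
// Gmem_tile_o::store above: on iteration jj a thread writes output row row + jj * rows_per_stg,
// but only while that row lies inside the actual sequence, and the very last STG is additionally
// gated when Cta_tile::M is not a multiple of ROWS_PER_STG.
inline bool sketch_o_store_is_active(int row, int jj, int rows_per_stg, int actual_seqlen,
                                     int stgs, bool has_incomplete_stg,
                                     bool active_for_last_stg) {
    if( row + jj * rows_per_stg >= actual_seqlen ) {
        return false;                     // past the end of this sequence
    }
    if( has_incomplete_stg && jj == stgs - 1 ) {
        return active_for_last_stg;       // partial tail of the CTA tile
    }
    return true;
}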
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile, int BYTES_PER_ELEMENT >
struct Gmem_tile_mma_sd {
// The mma tile.
    using Mma_tile = fmha::Hmma_tile< Cta_tile >;
// Each STG stores 8 elements.
enum { BYTES_PER_STG = BYTES_PER_ELEMENT * 8 };
// The number of MMAs in the M dimension.
enum { MMAS_M = Mma_tile::MMAS_M };
// The number of MMAs in the N dimension.
enum { MMAS_N = Mma_tile::MMAS_N };
// The number of rows computed per MMA per thread block.
enum { M_PER_MMA_PER_CTA = Mma_tile::M_PER_MMA_PER_CTA };
// The number of cols computed per MMA per thread block.
enum { N_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA };
// The number of threads per block.
enum { THREADS_PER_CTA = Cta_tile::THREADS_PER_CTA };
// The size of each row in bytes. I.e. how many bytes are stored per STG.
enum { BYTES_PER_ROW = THREADS_PER_CTA * BYTES_PER_STG };
// The fixed sequence length.
enum { SEQLEN = Cta_tile::N };
// The distance between two blocks (in bytes).
enum { BLOCK_STRIDE_BYTES = SEQLEN * SEQLEN * BYTES_PER_ELEMENT };
// The distance between elements stored per loop (in bytes).
enum { LOOP_STRIDE_BYTES = MMAS_M * MMAS_N * BYTES_PER_ROW };
// The type of elements stored per STG.
    using Type = typename fmha::Uint_from_size_in_bytes< BYTES_PER_STG >::Type;
// Ctor.
    template< typename Params >
    inline __device__ Gmem_tile_mma_sd(void *ptr, const Params &params, const int tidx)
        : ptr_(static_cast<char *>(ptr)) {
// The block index for the batch.
const int bidb = blockIdx.y;
// The block index for the head.
const int bidh = blockIdx.x;
// The block index.
size_t bidx = bidb * params.h + bidh;
// Set store location for each thread at the beginning of the loop
ptr_ += bidx * BLOCK_STRIDE_BYTES + tidx * BYTES_PER_STG;
}
// Store to global memory.
inline __device__ void store(const Type &data, const int mi, const int ni) {
size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW;
fmha::stg(ptr_ + offset, data);
}
// Load from global memory.
inline __device__ void load(Type &data, const int mi, const int ni) {
size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW;
fmha::ldg(data, ptr_ + offset);
}
// Move to the next tile.
inline __device__ void move() {
ptr_ += LOOP_STRIDE_BYTES;
}
// The pointer in global memory.
char *ptr_;
};
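// A minimal sketch (hypothetical helper, not used by the kernels) of the addressing used by
// Gmem_tile_mma_sd above: each (batch, head) pair owns a SEQLEN x SEQLEN matrix, every thread
// owns BYTES_PER_STG contiguous bytes per (mi, ni) MMA, and consecutive (mi, ni) pairs are
// spaced one "row" of THREADS_PER_CTA * BYTES_PER_STG bytes apart.
inline size_t sketch_mma_sd_byte_offset(size_t bidb, size_t bidh, size_t h, size_t tidx,
                                        size_t mi, size_t ni, size_t mmas_n,
                                        size_t seqlen, size_t bytes_per_element,
                                        size_t threads_per_cta, size_t bytes_per_stg) {
    const size_t block_stride_bytes = seqlen * seqlen * bytes_per_element;  // BLOCK_STRIDE_BYTES
    const size_t bytes_per_row = threads_per_cta * bytes_per_stg;           // BYTES_PER_ROW
    return (bidb * h + bidh) * block_stride_bytes   // select the (batch, head) block
           + tidx * bytes_per_stg                   // this thread's slot within a row
           + (mi * mmas_n + ni) * bytes_per_row;    // the row for this MMA
}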
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile, typename Base = Gmem_tile_mma_sd< Cta_tile, sizeof(uint16_t) > >
struct Gmem_tile_mma_s : public Base {
// The number of mmas in the vertical dimension.
enum { M = Base::MMAS_M };
// The number of mmas in the horizontal dimension.
enum { N = Base::MMAS_N };
// The type of the vectors stored by each STG.
using Type = typename Base::Type;
// Ctor.
template< typename Params >
    inline __device__ Gmem_tile_mma_s(void *ptr, const Params &params, const int tidx)
: Base(ptr, params, tidx) {
}
// Store to global memory.
    template< typename Mask >
inline __device__ void store(const float (&softmax)[2 * M][4 * N], const Mask &mask) {
#pragma unroll
for( int mi = 0; mi < M; mi++ ) {
#pragma unroll
for( int ni = 0; ni < N; ni++ ) {
float tmp00 = softmax[2 * mi + 0][4 * ni + 0];
float tmp01 = softmax[2 * mi + 0][4 * ni + 1];
float tmp02 = softmax[2 * mi + 0][4 * ni + 2];
float tmp03 = softmax[2 * mi + 0][4 * ni + 3];
float tmp10 = softmax[2 * mi + 1][4 * ni + 0];
float tmp11 = softmax[2 * mi + 1][4 * ni + 1];
float tmp12 = softmax[2 * mi + 1][4 * ni + 2];
float tmp13 = softmax[2 * mi + 1][4 * ni + 3];
uint4 dst;
dst.x = fmha::float2_to_half2(tmp00, tmp01);
dst.y = fmha::float2_to_half2(tmp02, tmp03);
dst.z = fmha::float2_to_half2(tmp10, tmp11);
dst.w = fmha::float2_to_half2(tmp12, tmp13);
if( mask.is_valid(mi, ni, 0, 0) ) {
Base::store(dst, mi, ni);
}
}
}
}
// Load from global memory.
    template< typename Mask >
inline __device__ void load(uint4 (®s)[M][N], const Mask &mask) {
#pragma unroll
for( int mi = 0; mi < M; mi++ ) {
#pragma unroll
for( int ni = 0; ni < N; ni++ ) {
regs[mi][ni] = make_uint4(0, 0, 0, 0);
if( mask.is_valid(mi, ni, 0, 0) ) {
Base::load(regs[mi][ni], mi, ni);
}
}
}
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The dimensions of the tile computed by the CTA.
typename Cta_tile,
// The base class.
typename Base = fmha::Gmem_tile_qkv
>
struct Gmem_tile_dout : public Base {
// Ctor.
    template< typename Params, typename BInfo >
    inline __device__ Gmem_tile_dout(const Params &params, const BInfo &binfo, int tidx)
: Base(params, 0, binfo, tidx) {
        this->qkv_ptr_ = reinterpret_cast<char *>(params.o_ptr);
this->params_qkv_stride_in_bytes_ = params.o_stride_in_bytes; // needed for move
// Compute the position of the thread in the row.
int col = tidx % Base::THREADS_PER_ROW;
        // The row offset in the batched GEMM. The dO tensor shares the layout of O.
int64_t row_offset = (int64_t)this->row_ * params.o_stride_in_bytes + binfo.bidx * Base::BYTES_PER_ROW;
// Assemble the final pointer.
this->qkv_ptr_ += row_offset + col * Base::BYTES_PER_LDG;
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< typename Cta_tile, typename Base = fmha::Gmem_tile_o< Cta_tile > >
struct Gmem_tile_dq : public Base {
// Ctor.
    template< typename Params, typename BInfo >
    inline __device__ Gmem_tile_dq(const Params &params, const BInfo &binfo, int tidx)
: Base(params, binfo, tidx) {
        this->o_ptr_ = reinterpret_cast<char *>(params.dqkv_ptr);
this->params_o_stride_in_bytes_ = params.qkv_stride_in_bytes; // needed for move
// Compute the position of the thread in the row.
int col = tidx % Base::THREADS_PER_ROW;
        // The row offset in the batched GEMM. dQ is written into the dQKV tensor, which shares the QKV layout.
int64_t row_offset = (int64_t)this->row_ * params.qkv_stride_in_bytes +
(binfo.sum_s * 3 * binfo.h + binfo.bidh) * Base::BYTES_PER_ROW;
// Assemble the final pointer.
this->o_ptr_ += row_offset + col * Base::BYTES_PER_STG;
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace fmha
================================================
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/kernel_traits.h
================================================
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int S, int D, int STEP, int WARPS_M, int WARPS_N, uint32_t FLAGS = 0x08u >
struct FMHA_kernel_traits {
// The CTA description for the 1st GEMM.
    using Cta_tile_p = fmha::Cta_tile_extd< STEP, S, D, WARPS_M, WARPS_N, 1 >;
// The CTA description for the 2nd GEMM.
    using Cta_tile_o = fmha::Cta_tile_extd< STEP, D, S, WARPS_M, 1, WARPS_N >;
// Do we use one buffer for K and V.
enum { SHARE_SMEM_FOR_K_AND_V = (FLAGS & 0x8u) != 0u };
// The global memory tile to load Q.
    using Gmem_tile_q = fmha::Gmem_tile_qkv< Cta_tile_p, fmha::BITS_PER_ELEMENT_A, STEP, D >;
// The shared memory tile to swizzle Q.
using Smem_tile_q = fmha::Smem_tile_a;
// The global memory tile to load K.
    using Gmem_tile_k = fmha::Gmem_tile_qkv< Cta_tile_p, fmha::BITS_PER_ELEMENT_B, S, D >;
// The shared memory tile to swizzle K.
using Smem_tile_k = fmha::Smem_tile_b;
// The global memory tile to load V.
    using Gmem_tile_v = fmha::Gmem_tile_qkv< Cta_tile_o, fmha::BITS_PER_ELEMENT_B, S, D >;
// The shared memory tile to swizzle V.
    using Smem_tile_v = fmha::Smem_tile_v< Cta_tile_o >;
// The global memory tile to store O.
    using Gmem_tile_o = fmha::Gmem_tile_o< Cta_tile_o >;
// The shared memory tile for O.
    using Smem_tile_o = fmha::Smem_tile_o< Cta_tile_o >;
// The global memory tile to load/store S.
    using Gmem_tile_s = fmha::Gmem_tile_mma_s< Cta_tile_p >;
// The shared memory tile to transpose S.
    using Smem_tile_st = fmha::Smem_tile_mma_transposed< Cta_tile_p >;
    using Gmem_tile_do = fmha::Gmem_tile_dout< Cta_tile_p >;
// Make sure the number of threads match.
static_assert((int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, "");
// The number of threads.
enum { THREADS = Cta_tile_p::THREADS_PER_CTA };
// Make sure the number of threads matches both CTAs.
static_assert((int)THREADS == (int)Cta_tile_o::THREADS_PER_CTA, "");
// The amount of shared memory needed to load Q and K.
enum { BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE };
// The extra amount of shared memory needed to load V.
enum { BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE };
    // The amount of shared memory needed for Q, K and V.
enum { BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V };
// The amount of shared memory needed to load Q and store O.
enum { BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE };
// The amount of shared memory needed for Q, K, V and O.
    enum { BYTES_PER_SMEM = fmha::Max< BYTES_PER_SMEM_QKV, BYTES_PER_SMEM_QO >::VALUE };
// Make sure we have enough shared memory.
static_assert(Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE <= BYTES_PER_SMEM, "");
};
////////////////////////////////////////////////////////////////////////////////////////////////////
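// A minimal sketch (hypothetical helper, not used by the kernels) of the shared-memory budget
// chosen by BYTES_PER_SMEM above: the GEMMs need Q + K (+ V unless K and V share one buffer),
// the epilogue needs Q + O, and the kernel reserves the larger of the two.
inline int sketch_smem_bytes(int bytes_q, int bytes_k, int bytes_v, int bytes_o,
                             bool share_smem_for_k_and_v) {
    const int qkv = bytes_q + bytes_k + (share_smem_for_k_and_v ? 0 : bytes_v);
    const int qo = bytes_q + bytes_o;
    return qkv > qo ? qkv : qo;
}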
================================================
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/mask.h
================================================
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
namespace fmha {
template< typename Cta_tile >
struct Mask {
    using Mma_tile = fmha::Hmma_tile< Cta_tile >;
    template< typename Params, typename BInfo >
    __device__ Mask(const Params &params, const BInfo &blockInfo, int tidx) {
actual_seqlen = blockInfo.actual_seqlen;
const int warp = tidx / Cta_tile::THREADS_PER_WARP;
const int lane = tidx % Cta_tile::THREADS_PER_WARP;
static_assert(Cta_tile::WARPS_K == 1, "");
// find the warp in the Cta tile
const int warp_n = (warp / Cta_tile::WARPS_M);
const int warp_m = (warp % Cta_tile::WARPS_M);
// decompose warp into 8x4 tile
const int quad = lane / 4;
const int tid = (lane % 4) * 2;
row = warp_m * 16 + quad;
col = warp_n * 16 + tid;
}
inline __device__ bool is_valid(const int mi, const int ni, const int ii, const int jj) const {
// ii and jj iterate over the 2x4 fragment
const bool col_valid = (ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1)) < actual_seqlen;
//&& (row + mi * Mma_tile::M_PER_MMA_PER_CTA + ii * 8) < actual_seqlen;
return col_valid;
// return row_valid && col_valid;
}
inline __device__ void load(int it) {
row_offset = it * Cta_tile::M + row;
}
int row_offset;
int row;
int col;
int actual_seqlen;
};
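// A minimal sketch (hypothetical helper, not used by the kernels) of the lane decomposition in
// the Mask constructor above: the 32 lanes of a warp form an 8x4 grid, lane / 4 picks one of 8
// rows and (lane % 4) * 2 picks the first of the two adjacent columns a thread owns, so a warp
// at (warp_m, warp_n) covers a 16x16 footprint of the P = Q * K^T tile.
inline void sketch_mask_thread_position(int tidx, int threads_per_warp, int warps_m,
                                        int *row, int *col) {
    const int warp = tidx / threads_per_warp;
    const int lane = tidx % threads_per_warp;
    const int warp_m = warp % warps_m;      // position of the warp along M
    const int warp_n = warp / warps_m;      // position of the warp along N
    *row = warp_m * 16 + lane / 4;          // the "quad"
    *col = warp_n * 16 + (lane % 4) * 2;    // the pair of adjacent columns
}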
} // namespace fmha
================================================
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/smem_tile.h
================================================
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include
#include
namespace fmha {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The description of the tile computed by this CTA.
typename Cta_tile,
// The number of rows in the 2D shared memory buffer.
int M_,
// The number of cols.
int N_,
// The size in bits of each element.
int BITS_PER_ELEMENT_,
// The number of bytes per STS.
int BYTES_PER_STS_ = 16,
// The number of buffers. (Used in multistage and double buffer cases.)
int BUFFERS_PER_TILE_ = 1,
// Do we enable the fast path for LDS.128 and friends.
int ENABLE_LDS_FAST_PATH_ = 0,
// The number of rows that are used for the XOR swizzling to allow fast STS/LDS.
int ROWS_PER_XOR_PATTERN_ = 8,
// The number of cols that are used for the XOR swizzling to allow fast STS/LDS.
int COLS_PER_XOR_PATTERN_ = 1,
// Use or not predicates
bool USE_PREDICATES_ = true
>
struct Smem_tile_without_skews {
// The size in bits of each element.
enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ };
// The size in bytes of a single STS.
enum { BYTES_PER_STS = BYTES_PER_STS_ };
// The number of elements per STS.
enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT };
// To support arbitrary N, we pad some values to a power-of-2.
    enum { N_WITH_PADDING = Next_power_of_two< N_ >::VALUE };
// The number of bytes per row without packing of rows.
enum { BYTES_PER_ROW_BEFORE_PACKING = N_WITH_PADDING * BITS_PER_ELEMENT / 8 };
// The number of bytes per row -- we want at least 128B per row.
    enum { BYTES_PER_ROW = Max< BYTES_PER_ROW_BEFORE_PACKING, 128 >::VALUE };
// The number of rows in shared memory (two rows may be packed into a single one).
enum { ROWS = M_ * BYTES_PER_ROW_BEFORE_PACKING / BYTES_PER_ROW };
// The number of threads per row.
enum { THREADS_PER_ROW_UNBOUNDED = BYTES_PER_ROW / BYTES_PER_STS };
// The number of threads per row.
    enum { THREADS_PER_ROW = Min< Cta_tile::THREADS_PER_CTA, THREADS_PER_ROW_UNBOUNDED >::VALUE };
// The number of STS per row.
enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS };
// It must be at least one.
static_assert(STS_PER_ROW >= 1, "");
// The number of rows written with a single STS.
enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
// Make sure we write to at least one row per STS. Thanks Dr. Obvious ;)
static_assert(ROWS_PER_STS >= 1, "");
// The number of STS needed to store all rows.
    enum { STS_PER_COL = Div_up< ROWS, ROWS_PER_STS >::VALUE };
// The number of STS in total.
enum { STS = STS_PER_COL * STS_PER_ROW };
// The size of one buffer in bytes in shared memory.
enum { BYTES_PER_BUFFER = STS * BYTES_PER_STS * Cta_tile::THREADS_PER_CTA };
// The number of buffers.
enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ };
// The size in bytes of total buffers.
enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE };
// The boundary for smem_read_offset and smem_write_offset increment.
enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER };
// Do we enable the LDS.128 fast path?
enum { ENABLE_LDS_FAST_PATH = ENABLE_LDS_FAST_PATH_ };
static_assert(ENABLE_LDS_FAST_PATH == 0);
// The number of rows that are used for the XOR swizzling to allow fast STS/LDS.
enum { ROWS_PER_XOR_PATTERN = ROWS_PER_XOR_PATTERN_ };
// The number of cols that are used for the XOR swizzling to allow fast STS/LDS.
enum { COLS_PER_XOR_PATTERN = COLS_PER_XOR_PATTERN_ * 16 / BYTES_PER_STS };
// Use or not predicates
enum { USE_PREDICATES = USE_PREDICATES_ };
// The type of elements that are stored in shared memory by each thread.
    using Store_type = typename Uint_from_size_in_bytes< BYTES_PER_STS >::Type;
// Ctor.
inline __device__ Smem_tile_without_skews(void *smem, int tidx)
: smem_(__nvvm_get_smem_pointer(smem)) {
// The row written by a thread. See doc/mma_smem_layout.xlsx.
int smem_write_row = tidx / THREADS_PER_ROW;
// The XOR pattern.
int smem_write_xor = smem_write_row % ROWS_PER_XOR_PATTERN * COLS_PER_XOR_PATTERN;
// Compute the column and apply the XOR pattern.
int smem_write_col = (tidx % THREADS_PER_ROW) ^ smem_write_xor;
// The offset.
this->smem_write_offset_ = smem_write_row*BYTES_PER_ROW + smem_write_col*BYTES_PER_STS;
// TODO: Why not merge it with the read offset?
this->smem_read_buffer_ = __shfl_sync(0xffffffff, 0, 0);
this->smem_write_buffer_ = __shfl_sync(0xffffffff, 0, 0);
}
// Compute the store pointers.
template< int N >
inline __device__ void compute_store_pointers(uint32_t (&ptrs)[N]) {
#pragma unroll
for( int ii = 0; ii < N; ++ii ) {
// Decompose the STS into row/col.
int row = ii / STS_PER_ROW;
int col = ii % STS_PER_ROW;
// Assemble the offset.
int offset = smem_write_offset_ + row*ROWS_PER_STS*BYTES_PER_ROW;
// Take the column into account.
if( STS_PER_ROW > 1 ) {
offset += col*THREADS_PER_ROW*BYTES_PER_STS;
}
// Apply the XOR pattern if needed.
if( ROWS_PER_STS < ROWS_PER_XOR_PATTERN ) {
const int m = row * ROWS_PER_STS % ROWS_PER_XOR_PATTERN;
offset ^= m * COLS_PER_XOR_PATTERN * BYTES_PER_STS;
}
// Assemble the final pointer :)
ptrs[ii] = smem_ + offset + smem_write_buffer_;
}
}
inline __device__ void debug_reset() {
for( int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) {
for( int row = 0; row < ROWS; ++row ) {
for( int col = 0; col < BYTES_PER_ROW; col += 4 ) {
if( threadIdx.x == 0 ) {
uint32_t val = 0x0;
sts(val, smem_ + row*BYTES_PER_ROW + col + buffer);
}
}
}
}
}
// Print the content of the tile (only for debug ;)).
inline __device__ void debug_print() const {
for( int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) {
for( int row = 0; row < ROWS; ++row ) {
for( int col = 0; col < BYTES_PER_ROW; col += 4 ) {
if( threadIdx.x == 0 ) {
uint32_t val;
lds(val, smem_ + row*BYTES_PER_ROW + col + buffer);
printf("block=(x=%2d, y=%2d, z=%2d) (smem_=%2d, buffer=%2d, row=%2d, byte=%4d)=0x%08x\n",
blockIdx.x,
blockIdx.y,
blockIdx.z,
smem_,
buffer,
row,
col,
val);
}
}
}
}
}
// Move the read offset to next buffer.
inline __device__ void move_to_next_read_buffer() {
if( BUFFERS_PER_TILE > 1 && smem_read_buffer_ >= BYTES_PER_TILE_INC_BOUNDARY ) {
this->smem_read_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY;
} else if( BUFFERS_PER_TILE > 1 ) {
this->smem_read_buffer_ += BYTES_PER_BUFFER;
}
}
// Move the read offset to next buffer. TODO: Remove this member function!!!
inline __device__ void move_next_read_buffer() {
this->move_to_next_read_buffer();
}
// Move the read offset to next N buffer (circular-buffer).
inline __device__ void move_to_next_read_buffer(int N) {
if( BUFFERS_PER_TILE > 1 ) {
this->smem_read_buffer_ += N * BYTES_PER_BUFFER;
this->smem_read_buffer_ -= smem_read_buffer_ >= BYTES_PER_TILE ? BYTES_PER_TILE : 0;
}
}
// Move the read offset to next N buffer (circular-buffer). TODO: Remove this member function!!!
inline __device__ void move_next_read_buffer(int N) {
this->move_to_next_read_buffer(N);
}
// Move the write offset to next buffer.
inline __device__ void move_to_next_write_buffer() {
if( BUFFERS_PER_TILE > 1 && smem_write_buffer_ >= BYTES_PER_TILE_INC_BOUNDARY ) {
this->smem_write_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY;
} else if( BUFFERS_PER_TILE > 1 ) {
this->smem_write_buffer_ += BYTES_PER_BUFFER;
}
}
// Move the write offset to next buffer. TODO: Remove that member function!
inline __device__ void move_next_write_buffer() {
this->move_to_next_write_buffer();
}
// Move the read offset.
inline __device__ void move_read_offset(int delta) {
this->smem_read_offset_ += delta;
}
// Move the write offset.
inline __device__ void move_write_offset(int delta) {
this->smem_write_offset_ += delta;
}
// Store to the tile in shared memory.
template< int N >
inline __device__ void store(const Store_type (&data)[N], uint64_t = 0) {
uint32_t smem_ptrs[N];
this->compute_store_pointers(smem_ptrs);
sts(smem_ptrs, data);
}
// Store to the tile in shared memory.
template< int N, int M >
inline __device__ void store(const Store_type (&data)[N], uint32_t (&preds)[M], uint64_t = 0) {
uint32_t smem_ptrs[N];
this->compute_store_pointers(smem_ptrs);
sts(smem_ptrs, data, preds);
}
// Store to the tile in shared memory.
template< int N >
inline __device__ void store(const Store_type (&data)[N], uint32_t preds, uint64_t = 0) {
this->store(data, preds);
}
// Store to the tile in shared memory.
template< int N >
inline __device__ void store(const void* (&gmem_ptrs)[N], uint32_t preds, uint64_t = 0) {
uint32_t tmp[1] = { preds };
this->store(gmem_ptrs, tmp);
}
// The shared memory pointer.
uint32_t smem_;
// The read offset. Reserve 4 offsets if needed.
int smem_read_offset_;
// The write offset.
int smem_write_offset_;
// The buffer base offset for read.
int smem_read_buffer_;
// The buffer base offset for write.
int smem_write_buffer_;
};
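// A minimal sketch (hypothetical helper, not used by the kernels) of the XOR swizzle applied in
// the constructor above: thread tidx writes row tidx / THREADS_PER_ROW, and its column within
// that row is XOR-ed with a pattern derived from the row index so that the 16B stores of
// consecutive rows land in different shared-memory banks.
inline int sketch_swizzled_write_offset(int tidx, int threads_per_row, int rows_per_xor_pattern,
                                        int cols_per_xor_pattern, int bytes_per_row,
                                        int bytes_per_sts) {
    const int row = tidx / threads_per_row;
    const int xor_pattern = (row % rows_per_xor_pattern) * cols_per_xor_pattern;
    const int col = (tidx % threads_per_row) ^ xor_pattern;
    return row * bytes_per_row + col * bytes_per_sts;
}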
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The dimensions of the tile computed by the CTA.
typename Cta_tile,
// The layout of the tile.
typename Layout,
// The size of the STS.
int BYTES_PER_STS = 16,
// The number of buffers per tile.
int BUFFERS_PER_TILE = 1,
// Use or not predicates
bool USE_PREDICATES = true
>
struct Smem_tile_a {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int MMAS_K, int MMAS_K_WITH_PADDING >
struct Compute_reset_mask {
// The potential mask.
enum { HALF = MMAS_K_WITH_PADDING / 2 };
// The remainder.
enum { MOD = MMAS_K % HALF };
// The final value.
    enum { VALUE = (MMAS_K == MOD ? 0 : HALF) | Compute_reset_mask< MOD, HALF >::VALUE };
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int MMAS_K_WITH_PADDING >
struct Compute_reset_mask<0, MMAS_K_WITH_PADDING> {
enum { VALUE = 0 };
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int MMAS_K >
struct Compute_reset_mask< MMAS_K, MMAS_K > {
enum { VALUE = MMAS_K - 1 };
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int N >
struct Rows_per_xor_pattern_a {
// The size in bits.
enum { N_IN_BITS = N * fmha::BITS_PER_ELEMENT_A };
// The number of rows.
enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8) };
};
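// Two illustrative checks of the table above (assuming the 16-bit A elements defined in gemm.h):
// a 64-element row is 1024 bits and needs the 8-row XOR pattern, while a 16-element row stays
// within 256 bits and only needs 2 rows.
static_assert(Rows_per_xor_pattern_a< 64 >::VALUE == 8, "64 fp16 elements -> 8-row XOR pattern");
static_assert(Rows_per_xor_pattern_a< 16 >::VALUE == 2, "16 fp16 elements -> 2-row XOR pattern");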
////////////////////////////////////////////////////////////////////////////////////////////////////
template< int N >
struct Rows_per_xor_pattern_row_a : public Rows_per_xor_pattern_a< N > {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<
// The dimensions of the tile computed by the CTA.
typename Cta_tile,
// The size of the STS.
int BYTES_PER_STS,
// The number of buffers per tile.
int BUFFERS_PER_TILE,
// How many rows to use for the XOR pattern to avoid bank conflicts?
    int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_row_a< Cta_tile::K >::VALUE
>
struct Smem_tile_row_a : public Smem_tile_without_skews {
// The MMA tile.
    using Mma_tile = fmha::Hmma_tile< Cta_tile >;
// The base class.
using Base = Smem_tile_without_skews;
// The fragment.
using Fragment = Fragment_a;
// When we use padding to reach a power of two, special care has to be taken.
using Cta_tile_with_padding = Cta_tile_with_k_with_padding