Showing preview only (4,282K chars total). Download the full file or copy to clipboard to get everything.
Repository: BM-K/Sentence-Embedding-Is-All-You-Need
Branch: main
Commit: 32cec8ea887f
Files: 522
Total size: 4.0 MB
Directory structure:
gitextract_y9jvowoy/
├── KoSBERT/
│ ├── Clustering.py
│ ├── README.md
│ ├── SemanticSearch.py
│ ├── con_training_sts.py
│ ├── output/
│ │ └── empty.txt
│ ├── run_example.sh
│ └── training_nli.py
├── KoSentenceT5/
│ ├── README.md
│ ├── apex/
│ │ ├── RNN/
│ │ │ ├── README.md
│ │ │ ├── RNNBackend.py
│ │ │ ├── __init__.py
│ │ │ ├── cells.py
│ │ │ └── models.py
│ │ ├── __init__.py
│ │ ├── amp/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── __version__.py
│ │ │ ├── _amp_state.py
│ │ │ ├── _initialize.py
│ │ │ ├── _process_optimizer.py
│ │ │ ├── amp.py
│ │ │ ├── compat.py
│ │ │ ├── frontend.py
│ │ │ ├── handle.py
│ │ │ ├── lists/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── functional_overrides.py
│ │ │ │ ├── tensor_overrides.py
│ │ │ │ └── torch_overrides.py
│ │ │ ├── opt.py
│ │ │ ├── rnn_compat.py
│ │ │ ├── scaler.py
│ │ │ ├── utils.py
│ │ │ └── wrap.py
│ │ ├── contrib/
│ │ │ ├── __init__.py
│ │ │ ├── bottleneck/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bottleneck.py
│ │ │ │ └── test.py
│ │ │ ├── csrc/
│ │ │ │ ├── bottleneck/
│ │ │ │ │ └── bottleneck.cpp
│ │ │ │ ├── fmha/
│ │ │ │ │ ├── fmha_api.cpp
│ │ │ │ │ └── src/
│ │ │ │ │ ├── fmha/
│ │ │ │ │ │ ├── gemm.h
│ │ │ │ │ │ ├── gmem_tile.h
│ │ │ │ │ │ ├── kernel_traits.h
│ │ │ │ │ │ ├── mask.h
│ │ │ │ │ │ ├── smem_tile.h
│ │ │ │ │ │ ├── softmax.h
│ │ │ │ │ │ └── utils.h
│ │ │ │ │ ├── fmha.h
│ │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h
│ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h
│ │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_kernel_1xN.h
│ │ │ │ │ ├── fmha_fprop_kernel_1xN_nl.h
│ │ │ │ │ ├── fmha_fprop_kernel_1xN_reload_v.h
│ │ │ │ │ ├── fmha_kernel.h
│ │ │ │ │ ├── fmha_noloop_reduce.cu
│ │ │ │ │ └── fmha_utils.h
│ │ │ │ ├── groupbn/
│ │ │ │ │ ├── batch_norm.cu
│ │ │ │ │ ├── batch_norm.h
│ │ │ │ │ ├── batch_norm_add_relu.cu
│ │ │ │ │ ├── batch_norm_add_relu.h
│ │ │ │ │ ├── cuda_utils.h
│ │ │ │ │ ├── interface.cpp
│ │ │ │ │ ├── ipc.cu
│ │ │ │ │ └── nhwc_batch_norm_kernel.h
│ │ │ │ ├── layer_norm/
│ │ │ │ │ ├── ln_api.cpp
│ │ │ │ │ ├── ln_bwd_semi_cuda_kernel.cu
│ │ │ │ │ ├── ln_fwd_cuda_kernel.cu
│ │ │ │ │ ├── ln_kernel_traits.h
│ │ │ │ │ └── utils.cuh
│ │ │ │ ├── multihead_attn/
│ │ │ │ │ ├── additive_masked_softmax_dropout.cpp
│ │ │ │ │ ├── additive_masked_softmax_dropout_cuda.cu
│ │ │ │ │ ├── dropout.h
│ │ │ │ │ ├── encdec_multihead_attn.cpp
│ │ │ │ │ ├── encdec_multihead_attn_cuda.cu
│ │ │ │ │ ├── encdec_multihead_attn_norm_add.cpp
│ │ │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu
│ │ │ │ │ ├── layer_norm.h
│ │ │ │ │ ├── masked_softmax_dropout.cpp
│ │ │ │ │ ├── masked_softmax_dropout_cuda.cu
│ │ │ │ │ ├── philox.h
│ │ │ │ │ ├── self_multihead_attn.cpp
│ │ │ │ │ ├── self_multihead_attn_bias.cpp
│ │ │ │ │ ├── self_multihead_attn_bias_additive_mask.cpp
│ │ │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_bias_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_norm_add.cpp
│ │ │ │ │ ├── self_multihead_attn_norm_add_cuda.cu
│ │ │ │ │ ├── softmax.h
│ │ │ │ │ └── strided_batched_gemm.h
│ │ │ │ ├── optimizers/
│ │ │ │ │ ├── fused_adam_cuda.cpp
│ │ │ │ │ ├── fused_adam_cuda_kernel.cu
│ │ │ │ │ ├── fused_lamb_cuda.cpp
│ │ │ │ │ ├── fused_lamb_cuda_kernel.cu
│ │ │ │ │ ├── multi_tensor_distopt_adam.cpp
│ │ │ │ │ ├── multi_tensor_distopt_adam_kernel.cu
│ │ │ │ │ ├── multi_tensor_distopt_lamb.cpp
│ │ │ │ │ └── multi_tensor_distopt_lamb_kernel.cu
│ │ │ │ ├── transducer/
│ │ │ │ │ ├── transducer_joint.cpp
│ │ │ │ │ ├── transducer_joint_kernel.cu
│ │ │ │ │ ├── transducer_loss.cpp
│ │ │ │ │ └── transducer_loss_kernel.cu
│ │ │ │ └── xentropy/
│ │ │ │ ├── interface.cpp
│ │ │ │ └── xentropy_kernel.cu
│ │ │ ├── examples/
│ │ │ │ └── multihead_attn/
│ │ │ │ ├── func_test_multihead_attn.py
│ │ │ │ └── perf_test_multihead_attn.py
│ │ │ ├── fmha/
│ │ │ │ ├── __init__.py
│ │ │ │ └── fmha.py
│ │ │ ├── groupbn/
│ │ │ │ ├── __init__.py
│ │ │ │ └── batch_norm.py
│ │ │ ├── layer_norm/
│ │ │ │ ├── __init__.py
│ │ │ │ └── layer_norm.py
│ │ │ ├── multihead_attn/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── encdec_multihead_attn.py
│ │ │ │ ├── encdec_multihead_attn_func.py
│ │ │ │ ├── fast_encdec_multihead_attn_func.py
│ │ │ │ ├── fast_encdec_multihead_attn_norm_add_func.py
│ │ │ │ ├── fast_self_multihead_attn_func.py
│ │ │ │ ├── fast_self_multihead_attn_norm_add_func.py
│ │ │ │ ├── mask_softmax_dropout_func.py
│ │ │ │ ├── self_multihead_attn.py
│ │ │ │ └── self_multihead_attn_func.py
│ │ │ ├── optimizers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── distributed_fused_adam.py
│ │ │ │ ├── distributed_fused_adam_v2.py
│ │ │ │ ├── distributed_fused_adam_v3.py
│ │ │ │ ├── distributed_fused_lamb.py
│ │ │ │ ├── fp16_optimizer.py
│ │ │ │ ├── fused_adam.py
│ │ │ │ ├── fused_lamb.py
│ │ │ │ └── fused_sgd.py
│ │ │ ├── sparsity/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── asp.py
│ │ │ │ ├── sparse_masklib.py
│ │ │ │ └── test/
│ │ │ │ ├── checkpointing_test_part1.py
│ │ │ │ ├── checkpointing_test_part2.py
│ │ │ │ ├── checkpointing_test_reference.py
│ │ │ │ └── toy_problem.py
│ │ │ ├── test/
│ │ │ │ ├── fmha/
│ │ │ │ │ └── test_fmha.py
│ │ │ │ ├── layer_norm/
│ │ │ │ │ └── test_fast_layer_norm.py
│ │ │ │ ├── multihead_attn/
│ │ │ │ │ ├── test_encdec_multihead_attn.py
│ │ │ │ │ ├── test_encdec_multihead_attn_norm_add.py
│ │ │ │ │ ├── test_fast_self_multihead_attn_bias.py
│ │ │ │ │ ├── test_mha_fused_softmax.py
│ │ │ │ │ ├── test_self_multihead_attn.py
│ │ │ │ │ └── test_self_multihead_attn_norm_add.py
│ │ │ │ ├── test_label_smoothing.py
│ │ │ │ └── transducer/
│ │ │ │ ├── test_transducer_joint.py
│ │ │ │ ├── test_transducer_loss.py
│ │ │ │ └── transducer_ref.py
│ │ │ ├── transducer/
│ │ │ │ ├── __init__.py
│ │ │ │ └── transducer.py
│ │ │ └── xentropy/
│ │ │ ├── __init__.py
│ │ │ └── softmax_xentropy.py
│ │ ├── fp16_utils/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── fp16_optimizer.py
│ │ │ ├── fp16util.py
│ │ │ └── loss_scaler.py
│ │ ├── mlp/
│ │ │ ├── __init__.py
│ │ │ └── mlp.py
│ │ ├── multi_tensor_apply/
│ │ │ ├── __init__.py
│ │ │ └── multi_tensor_apply.py
│ │ ├── normalization/
│ │ │ ├── __init__.py
│ │ │ └── fused_layer_norm.py
│ │ ├── optimizers/
│ │ │ ├── __init__.py
│ │ │ ├── fused_adagrad.py
│ │ │ ├── fused_adam.py
│ │ │ ├── fused_lamb.py
│ │ │ ├── fused_novograd.py
│ │ │ └── fused_sgd.py
│ │ ├── parallel/
│ │ │ ├── LARC.py
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── distributed.py
│ │ │ ├── multiproc.py
│ │ │ ├── optimized_sync_batchnorm.py
│ │ │ ├── optimized_sync_batchnorm_kernel.py
│ │ │ ├── sync_batchnorm.py
│ │ │ └── sync_batchnorm_kernel.py
│ │ ├── pyprof/
│ │ │ ├── FAQs.md
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── examples/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── apex/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── fused_adam.py
│ │ │ │ │ ├── fused_layer_norm.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── custom_func_module/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── custom_function.py
│ │ │ │ │ ├── custom_module.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── imagenet/
│ │ │ │ │ ├── imagenet.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── jit/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── jit_script_function.py
│ │ │ │ │ ├── jit_script_method.py
│ │ │ │ │ ├── jit_trace_function.py
│ │ │ │ │ ├── jit_trace_method.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── lenet.py
│ │ │ │ ├── operators.py
│ │ │ │ ├── simple.py
│ │ │ │ └── user_annotation/
│ │ │ │ ├── README.md
│ │ │ │ ├── resnet.py
│ │ │ │ └── test.sh
│ │ │ ├── nvtx/
│ │ │ │ ├── __init__.py
│ │ │ │ └── nvmarker.py
│ │ │ ├── parse/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── db.py
│ │ │ │ ├── kernel.py
│ │ │ │ ├── nvvp.py
│ │ │ │ └── parse.py
│ │ │ └── prof/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── activation.py
│ │ │ ├── base.py
│ │ │ ├── blas.py
│ │ │ ├── conv.py
│ │ │ ├── convert.py
│ │ │ ├── data.py
│ │ │ ├── dropout.py
│ │ │ ├── embedding.py
│ │ │ ├── index_slice_join_mutate.py
│ │ │ ├── linear.py
│ │ │ ├── loss.py
│ │ │ ├── misc.py
│ │ │ ├── normalization.py
│ │ │ ├── optim.py
│ │ │ ├── output.py
│ │ │ ├── pointwise.py
│ │ │ ├── pooling.py
│ │ │ ├── prof.py
│ │ │ ├── randomSample.py
│ │ │ ├── recurrentCell.py
│ │ │ ├── reduction.py
│ │ │ ├── softmax.py
│ │ │ ├── usage.py
│ │ │ └── utility.py
│ │ └── reparameterization/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── reparameterization.py
│ │ └── weight_norm.py
│ ├── data/
│ │ └── dataloader.py
│ ├── main.py
│ ├── model/
│ │ ├── loss.py
│ │ ├── setting.py
│ │ ├── simcse/
│ │ │ ├── kost5.py
│ │ │ └── processor.py
│ │ └── utils.py
│ └── run_example.sh
├── KoSimCSE/
│ ├── README.md
│ ├── SemanticSearch.py
│ ├── apex/
│ │ ├── RNN/
│ │ │ ├── README.md
│ │ │ ├── RNNBackend.py
│ │ │ ├── __init__.py
│ │ │ ├── cells.py
│ │ │ └── models.py
│ │ ├── __init__.py
│ │ ├── amp/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── __version__.py
│ │ │ ├── _amp_state.py
│ │ │ ├── _initialize.py
│ │ │ ├── _process_optimizer.py
│ │ │ ├── amp.py
│ │ │ ├── compat.py
│ │ │ ├── frontend.py
│ │ │ ├── handle.py
│ │ │ ├── lists/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── functional_overrides.py
│ │ │ │ ├── tensor_overrides.py
│ │ │ │ └── torch_overrides.py
│ │ │ ├── opt.py
│ │ │ ├── rnn_compat.py
│ │ │ ├── scaler.py
│ │ │ ├── utils.py
│ │ │ └── wrap.py
│ │ ├── contrib/
│ │ │ ├── __init__.py
│ │ │ ├── bottleneck/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bottleneck.py
│ │ │ │ └── test.py
│ │ │ ├── csrc/
│ │ │ │ ├── bottleneck/
│ │ │ │ │ └── bottleneck.cpp
│ │ │ │ ├── fmha/
│ │ │ │ │ ├── fmha_api.cpp
│ │ │ │ │ └── src/
│ │ │ │ │ ├── fmha/
│ │ │ │ │ │ ├── gemm.h
│ │ │ │ │ │ ├── gmem_tile.h
│ │ │ │ │ │ ├── kernel_traits.h
│ │ │ │ │ │ ├── mask.h
│ │ │ │ │ │ ├── smem_tile.h
│ │ │ │ │ │ ├── softmax.h
│ │ │ │ │ │ └── utils.h
│ │ │ │ │ ├── fmha.h
│ │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h
│ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h
│ │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu
│ │ │ │ │ ├── fmha_fprop_kernel_1xN.h
│ │ │ │ │ ├── fmha_fprop_kernel_1xN_nl.h
│ │ │ │ │ ├── fmha_fprop_kernel_1xN_reload_v.h
│ │ │ │ │ ├── fmha_kernel.h
│ │ │ │ │ ├── fmha_noloop_reduce.cu
│ │ │ │ │ └── fmha_utils.h
│ │ │ │ ├── groupbn/
│ │ │ │ │ ├── batch_norm.cu
│ │ │ │ │ ├── batch_norm.h
│ │ │ │ │ ├── batch_norm_add_relu.cu
│ │ │ │ │ ├── batch_norm_add_relu.h
│ │ │ │ │ ├── cuda_utils.h
│ │ │ │ │ ├── interface.cpp
│ │ │ │ │ ├── ipc.cu
│ │ │ │ │ └── nhwc_batch_norm_kernel.h
│ │ │ │ ├── layer_norm/
│ │ │ │ │ ├── ln_api.cpp
│ │ │ │ │ ├── ln_bwd_semi_cuda_kernel.cu
│ │ │ │ │ ├── ln_fwd_cuda_kernel.cu
│ │ │ │ │ ├── ln_kernel_traits.h
│ │ │ │ │ └── utils.cuh
│ │ │ │ ├── multihead_attn/
│ │ │ │ │ ├── additive_masked_softmax_dropout.cpp
│ │ │ │ │ ├── additive_masked_softmax_dropout_cuda.cu
│ │ │ │ │ ├── dropout.h
│ │ │ │ │ ├── encdec_multihead_attn.cpp
│ │ │ │ │ ├── encdec_multihead_attn_cuda.cu
│ │ │ │ │ ├── encdec_multihead_attn_norm_add.cpp
│ │ │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu
│ │ │ │ │ ├── layer_norm.h
│ │ │ │ │ ├── masked_softmax_dropout.cpp
│ │ │ │ │ ├── masked_softmax_dropout_cuda.cu
│ │ │ │ │ ├── philox.h
│ │ │ │ │ ├── self_multihead_attn.cpp
│ │ │ │ │ ├── self_multihead_attn_bias.cpp
│ │ │ │ │ ├── self_multihead_attn_bias_additive_mask.cpp
│ │ │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_bias_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_cuda.cu
│ │ │ │ │ ├── self_multihead_attn_norm_add.cpp
│ │ │ │ │ ├── self_multihead_attn_norm_add_cuda.cu
│ │ │ │ │ ├── softmax.h
│ │ │ │ │ └── strided_batched_gemm.h
│ │ │ │ ├── optimizers/
│ │ │ │ │ ├── fused_adam_cuda.cpp
│ │ │ │ │ ├── fused_adam_cuda_kernel.cu
│ │ │ │ │ ├── fused_lamb_cuda.cpp
│ │ │ │ │ ├── fused_lamb_cuda_kernel.cu
│ │ │ │ │ ├── multi_tensor_distopt_adam.cpp
│ │ │ │ │ ├── multi_tensor_distopt_adam_kernel.cu
│ │ │ │ │ ├── multi_tensor_distopt_lamb.cpp
│ │ │ │ │ └── multi_tensor_distopt_lamb_kernel.cu
│ │ │ │ ├── transducer/
│ │ │ │ │ ├── transducer_joint.cpp
│ │ │ │ │ ├── transducer_joint_kernel.cu
│ │ │ │ │ ├── transducer_loss.cpp
│ │ │ │ │ └── transducer_loss_kernel.cu
│ │ │ │ └── xentropy/
│ │ │ │ ├── interface.cpp
│ │ │ │ └── xentropy_kernel.cu
│ │ │ ├── examples/
│ │ │ │ └── multihead_attn/
│ │ │ │ ├── func_test_multihead_attn.py
│ │ │ │ └── perf_test_multihead_attn.py
│ │ │ ├── fmha/
│ │ │ │ ├── __init__.py
│ │ │ │ └── fmha.py
│ │ │ ├── groupbn/
│ │ │ │ ├── __init__.py
│ │ │ │ └── batch_norm.py
│ │ │ ├── layer_norm/
│ │ │ │ ├── __init__.py
│ │ │ │ └── layer_norm.py
│ │ │ ├── multihead_attn/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── encdec_multihead_attn.py
│ │ │ │ ├── encdec_multihead_attn_func.py
│ │ │ │ ├── fast_encdec_multihead_attn_func.py
│ │ │ │ ├── fast_encdec_multihead_attn_norm_add_func.py
│ │ │ │ ├── fast_self_multihead_attn_func.py
│ │ │ │ ├── fast_self_multihead_attn_norm_add_func.py
│ │ │ │ ├── mask_softmax_dropout_func.py
│ │ │ │ ├── self_multihead_attn.py
│ │ │ │ └── self_multihead_attn_func.py
│ │ │ ├── optimizers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── distributed_fused_adam.py
│ │ │ │ ├── distributed_fused_adam_v2.py
│ │ │ │ ├── distributed_fused_adam_v3.py
│ │ │ │ ├── distributed_fused_lamb.py
│ │ │ │ ├── fp16_optimizer.py
│ │ │ │ ├── fused_adam.py
│ │ │ │ ├── fused_lamb.py
│ │ │ │ └── fused_sgd.py
│ │ │ ├── sparsity/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── asp.py
│ │ │ │ ├── sparse_masklib.py
│ │ │ │ └── test/
│ │ │ │ ├── checkpointing_test_part1.py
│ │ │ │ ├── checkpointing_test_part2.py
│ │ │ │ ├── checkpointing_test_reference.py
│ │ │ │ └── toy_problem.py
│ │ │ ├── test/
│ │ │ │ ├── fmha/
│ │ │ │ │ └── test_fmha.py
│ │ │ │ ├── layer_norm/
│ │ │ │ │ └── test_fast_layer_norm.py
│ │ │ │ ├── multihead_attn/
│ │ │ │ │ ├── test_encdec_multihead_attn.py
│ │ │ │ │ ├── test_encdec_multihead_attn_norm_add.py
│ │ │ │ │ ├── test_fast_self_multihead_attn_bias.py
│ │ │ │ │ ├── test_mha_fused_softmax.py
│ │ │ │ │ ├── test_self_multihead_attn.py
│ │ │ │ │ └── test_self_multihead_attn_norm_add.py
│ │ │ │ ├── test_label_smoothing.py
│ │ │ │ └── transducer/
│ │ │ │ ├── test_transducer_joint.py
│ │ │ │ ├── test_transducer_loss.py
│ │ │ │ └── transducer_ref.py
│ │ │ ├── transducer/
│ │ │ │ ├── __init__.py
│ │ │ │ └── transducer.py
│ │ │ └── xentropy/
│ │ │ ├── __init__.py
│ │ │ └── softmax_xentropy.py
│ │ ├── fp16_utils/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── fp16_optimizer.py
│ │ │ ├── fp16util.py
│ │ │ └── loss_scaler.py
│ │ ├── mlp/
│ │ │ ├── __init__.py
│ │ │ └── mlp.py
│ │ ├── multi_tensor_apply/
│ │ │ ├── __init__.py
│ │ │ └── multi_tensor_apply.py
│ │ ├── normalization/
│ │ │ ├── __init__.py
│ │ │ └── fused_layer_norm.py
│ │ ├── optimizers/
│ │ │ ├── __init__.py
│ │ │ ├── fused_adagrad.py
│ │ │ ├── fused_adam.py
│ │ │ ├── fused_lamb.py
│ │ │ ├── fused_novograd.py
│ │ │ └── fused_sgd.py
│ │ ├── parallel/
│ │ │ ├── LARC.py
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── distributed.py
│ │ │ ├── multiproc.py
│ │ │ ├── optimized_sync_batchnorm.py
│ │ │ ├── optimized_sync_batchnorm_kernel.py
│ │ │ ├── sync_batchnorm.py
│ │ │ └── sync_batchnorm_kernel.py
│ │ ├── pyprof/
│ │ │ ├── FAQs.md
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── examples/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── apex/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── fused_adam.py
│ │ │ │ │ ├── fused_layer_norm.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── custom_func_module/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── custom_function.py
│ │ │ │ │ ├── custom_module.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── imagenet/
│ │ │ │ │ ├── imagenet.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── jit/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── jit_script_function.py
│ │ │ │ │ ├── jit_script_method.py
│ │ │ │ │ ├── jit_trace_function.py
│ │ │ │ │ ├── jit_trace_method.py
│ │ │ │ │ └── test.sh
│ │ │ │ ├── lenet.py
│ │ │ │ ├── operators.py
│ │ │ │ ├── simple.py
│ │ │ │ └── user_annotation/
│ │ │ │ ├── README.md
│ │ │ │ ├── resnet.py
│ │ │ │ └── test.sh
│ │ │ ├── nvtx/
│ │ │ │ ├── __init__.py
│ │ │ │ └── nvmarker.py
│ │ │ ├── parse/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── db.py
│ │ │ │ ├── kernel.py
│ │ │ │ ├── nvvp.py
│ │ │ │ └── parse.py
│ │ │ └── prof/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── activation.py
│ │ │ ├── base.py
│ │ │ ├── blas.py
│ │ │ ├── conv.py
│ │ │ ├── convert.py
│ │ │ ├── data.py
│ │ │ ├── dropout.py
│ │ │ ├── embedding.py
│ │ │ ├── index_slice_join_mutate.py
│ │ │ ├── linear.py
│ │ │ ├── loss.py
│ │ │ ├── misc.py
│ │ │ ├── normalization.py
│ │ │ ├── optim.py
│ │ │ ├── output.py
│ │ │ ├── pointwise.py
│ │ │ ├── pooling.py
│ │ │ ├── prof.py
│ │ │ ├── randomSample.py
│ │ │ ├── recurrentCell.py
│ │ │ ├── reduction.py
│ │ │ ├── softmax.py
│ │ │ ├── usage.py
│ │ │ └── utility.py
│ │ └── reparameterization/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── reparameterization.py
│ │ └── weight_norm.py
│ ├── data/
│ │ └── dataloader.py
│ ├── main.py
│ ├── model/
│ │ ├── loss.py
│ │ ├── setting.py
│ │ ├── simcse/
│ │ │ ├── bert.py
│ │ │ └── processor.py
│ │ └── utils.py
│ ├── output/
│ │ └── empty.txt
│ ├── requirements.txt
│ └── run_example.sh
├── LICENSE
├── README.md
├── get_model_checkpoint.sh
└── get_model_dataset.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: KoSBERT/Clustering.py
================================================
from sentence_transformers import SentenceTransformer, util
import numpy as np
# Import hoisted to the top of the file (was previously placed mid-script).
from sklearn.cluster import KMeans

# Path to the fine-tuned KoSBERT checkpoint (downloaded via get_model_checkpoint.sh).
model_path = '../Checkpoint/KoSBERT/kosbert-klue-bert-base'
embedder = SentenceTransformer(model_path)

# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
          '한 남자가 빵 한 조각을 먹는다.',
          '그 여자가 아이를 돌본다.',
          '한 남자가 말을 탄다.',
          '한 여자가 바이올린을 연주한다.',
          '두 남자가 수레를 숲 솦으로 밀었다.',
          '한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
          '원숭이 한 마리가 드럼을 연주한다.',
          '치타 한 마리가 먹이 뒤에서 달리고 있다.',
          '한 남자가 파스타를 먹는다.',
          '고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
          '치타가 들판을 가로 질러 먹이를 쫓는다.']

# Encode every corpus sentence into a fixed-size embedding.
corpus_embeddings = embedder.encode(corpus)

# Then, we perform k-means clustering using sklearn:
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Group the original sentences by their assigned cluster id.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")
================================================
FILE: KoSBERT/README.md
================================================
# KoSentenceBERT
[[Github]](https://github.com/UKPLab/sentence-transformers) Official implementation of SBERT. <br>
Korean SentenceBERT : Korean Sentence Embeddings using Siamese BERT-Networks.
## Quick start
- To run inference quickly, download the pre-trained models; you can then run the downstream tasks directly.
```
bash get_model_checkpoint.sh
python SemanticSearch.py
```
## Training
- Before training or evaluation, please download the datasets by running
```
bash get_model_dataset.sh
```
- Two stage training
- First step, training NLI dataset
```
python training_nli.py --model klue/bert-base --batch 32 --evaluation_steps 1000 --epochs 1
```
- Second step, continued training STS dataset
```
python con_training_sts.py --model klue/bert-base --batch 32 --evaluation_steps 1000 --epochs 4
```
- Run Examples
```
bash run_example.sh
```
### Hyperparameters
- Training NLI
1. Pooling Method: MEAN strategy
2. Batch Size: 32
3. Evaluation Steps: 1000
4. Epochs: 1(BERT), 2(RoBERTa)
- Continued Training STS
1. Pooling Method: MEAN strategy
2. Batch Size: 32
3. Evaluation Steps: 1000
4. Epochs: 4
### Semantic Search
```
python SemanticSearch.py
```
```python
from sentence_transformers import SentenceTransformer, util
import numpy as np
model_path = '../Checkpoint/KoSBERT/kosbert-klue-bert-base'
embedder = SentenceTransformer(model_path)
# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
'한 남자가 빵 한 조각을 먹는다.',
'그 여자가 아이를 돌본다.',
'한 남자가 말을 탄다.',
'한 여자가 바이올린을 연주한다.',
'두 남자가 수레를 숲 솦으로 밀었다.',
'한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
'원숭이 한 마리가 드럼을 연주한다.',
'치타 한 마리가 먹이 뒤에서 달리고 있다.']
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
# Query sentences:
queries = ['한 남자가 파스타를 먹는다.',
'고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
'치타가 들판을 가로 질러 먹이를 쫓는다.']
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = 5
for query in queries:
query_embedding = embedder.encode(query, convert_to_tensor=True)
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
cos_scores = cos_scores.cpu()
#We use np.argpartition, to only partially sort the top_k results
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
print("\n\n======================\n\n")
print("Query:", query)
print("\nTop 5 most similar sentences in corpus:")
for idx in top_results[0:top_k]:
print(corpus[idx].strip(), "(Score: %.4f)" % (cos_scores[idx]))
```
- Results are as follows :
```
Query: 한 남자가 파스타를 먹는다.
Top 5 most similar sentences in corpus:
한 남자가 음식을 먹는다. (Score: 0.6141)
한 남자가 빵 한 조각을 먹는다. (Score: 0.5952)
한 남자가 말을 탄다. (Score: 0.1231)
한 남자가 담으로 싸인 땅에서 백마를 타고 있다. (Score: 0.0752)
두 남자가 수레를 숲 솦으로 밀었다. (Score: 0.0486)
======================
Query: 고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.
Top 5 most similar sentences in corpus:
원숭이 한 마리가 드럼을 연주한다. (Score: 0.6656)
치타 한 마리가 먹이 뒤에서 달리고 있다. (Score: 0.2988)
한 여자가 바이올린을 연주한다. (Score: 0.1566)
한 남자가 말을 탄다. (Score: 0.1112)
한 남자가 담으로 싸인 땅에서 백마를 타고 있다. (Score: 0.0262)
======================
Query: 치타가 들판을 가로 질러 먹이를 쫓는다.
Top 5 most similar sentences in corpus:
치타 한 마리가 먹이 뒤에서 달리고 있다. (Score: 0.7570)
두 남자가 수레를 숲 솦으로 밀었다. (Score: 0.3658)
원숭이 한 마리가 드럼을 연주한다. (Score: 0.3583)
한 남자가 말을 탄다. (Score: 0.0505)
그 여자가 아이를 돌본다. (Score: -0.0087)
```
### Clustering
```
python Clustering.py
```
```python
from sentence_transformers import SentenceTransformer, util
import numpy as np
model_path = '../Checkpoint/KoSBERT/kosbert-klue-bert-base'
embedder = SentenceTransformer(model_path)
# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
'한 남자가 빵 한 조각을 먹는다.',
'그 여자가 아이를 돌본다.',
'한 남자가 말을 탄다.',
'한 여자가 바이올린을 연주한다.',
'두 남자가 수레를 숲 솦으로 밀었다.',
'한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
'원숭이 한 마리가 드럼을 연주한다.',
'치타 한 마리가 먹이 뒤에서 달리고 있다.',
'한 남자가 파스타를 먹는다.',
'고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
'치타가 들판을 가로 질러 먹이를 쫓는다.']
corpus_embeddings = embedder.encode(corpus)
# Then, we perform k-means clustering using sklearn:
from sklearn.cluster import KMeans
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
clustered_sentences[cluster_id].append(corpus[sentence_id])
for i, cluster in enumerate(clustered_sentences):
print("Cluster ", i+1)
print(cluster)
print("")
```
- Results are as follows:
```
Cluster 1
['한 남자가 음식을 먹는다.', '한 남자가 빵 한 조각을 먹는다.', '한 남자가 파스타를 먹는다.']
Cluster 2
['원숭이 한 마리가 드럼을 연주한다.', '고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.']
Cluster 3
['한 남자가 말을 탄다.', '두 남자가 수레를 숲 솦으로 밀었다.', '한 남자가 담으로 싸인 땅에서 백마를 타고 있다.']
Cluster 4
['치타 한 마리가 먹이 뒤에서 달리고 있다.', '치타가 들판을 가로 질러 먹이를 쫓는다.']
Cluster 5
['그 여자가 아이를 돌본다.', '한 여자가 바이올린을 연주한다.']
```
================================================
FILE: KoSBERT/SemanticSearch.py
================================================
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Path to the fine-tuned KoSBERT checkpoint (downloaded via get_model_checkpoint.sh).
model_path = '../Checkpoint/KoSBERT/kosbert-klue-bert-base'
embedder = SentenceTransformer(model_path)

# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
          '한 남자가 빵 한 조각을 먹는다.',
          '그 여자가 아이를 돌본다.',
          '한 남자가 말을 탄다.',
          '한 여자가 바이올린을 연주한다.',
          '두 남자가 수레를 숲 솦으로 밀었다.',
          '한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
          '원숭이 한 마리가 드럼을 연주한다.',
          '치타 한 마리가 먹이 뒤에서 달리고 있다.']
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['한 남자가 파스타를 먹는다.',
           '고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
           '치타가 들판을 가로 질러 먹이를 쫓는다.']

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity.
# Clamp top_k to the corpus size: np.argpartition(..., range(top_k)) raises
# if asked for more results than there are corpus sentences.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # We use np.argpartition, to only partially sort the top_k results
    # (cheaper than a full sort of the whole score vector).
    top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")
    for idx in top_results[0:top_k]:
        print(corpus[idx].strip(), "(Score: %.4f)" % (cos_scores[idx]))
================================================
FILE: KoSBERT/con_training_sts.py
================================================
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='klue/bert-base')
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--evaluation_steps', type=int, default=1000)
parser.add_argument('--epochs', type=int, default=4)
args = parser.parse_args()

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# Continue training from the NLI checkpoint produced by training_nli.py.
model_name = './output/training_nli_'+args.model.replace("/", "-")
train_batch_size = args.batch
num_epochs = args.epochs
model_save_path = 'output/kosbert-'+args.model.replace("/", "-")

model = SentenceTransformer(model_name)


def _read_sts_samples(path):
    """Read a tab-separated STS file (sentence1<TAB>sentence2<TAB>score).

    Gold scores are on a 0-5 scale and are normalized to [0, 1], the range
    expected by CosineSimilarityLoss and EmbeddingSimilarityEvaluator.
    """
    samples = []
    with open(path, 'rt', encoding='utf-8') as fIn:
        for line in fIn:
            s1, s2, score = line.split('\t')
            score = float(score.strip()) / 5.0
            samples.append(InputExample(texts=[s1, s2], label=score))
    return samples


logging.info("Read STSbenchmark train dataset")
# The same TSV format is used for train/dev/test, so one helper reads all three
# (previously the parsing loop was copy-pasted three times).
dev_samples = _read_sts_samples('../Dataset/tune_sts_dev.tsv')
test_samples = _read_sts_samples('../Dataset/tune_sts_test.tsv')
train_samples = _read_sts_samples('../Dataset/tune_sts_train.tsv')

train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Development set: Measure correlation between cosine score and gold labels
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=args.evaluation_steps,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)
================================================
FILE: KoSBERT/output/empty.txt
================================================
.
================================================
FILE: KoSBERT/run_example.sh
================================================
#!/bin/bash
# Two-stage KoSBERT training for three backbones:
# stage 1 trains on NLI, stage 2 continues training on STS.

# bert-base
echo "First Step Training NLI Dataset (BERT-BASE)"
CUDA_VISIBLE_DEVICES=0 python training_nli.py --model klue/bert-base --batch 32 --evaluation_steps 1000 --epochs 1
echo "Second Step Continuously Training STS Dataset (BERT-BASE)"
CUDA_VISIBLE_DEVICES=0 python con_training_sts.py --model klue/bert-base --batch 32 --evaluation_steps 1000 --epochs 4

# roberta-base
echo "First Step Training NLI Dataset (ROBERTA-BASE)"
CUDA_VISIBLE_DEVICES=0 python training_nli.py --model klue/roberta-base --batch 32 --evaluation_steps 1000 --epochs 1
echo "Second Step Continuously Training STS Dataset (ROBERTA-BASE)"
CUDA_VISIBLE_DEVICES=0 python con_training_sts.py --model klue/roberta-base --batch 32 --evaluation_steps 1000 --epochs 4

# roberta-large
# Typo fixed: the echoed label previously read "ROBERAT-LARGE".
echo "First Step Training NLI Dataset (ROBERTA-LARGE)"
CUDA_VISIBLE_DEVICES=0 python training_nli.py --model klue/roberta-large --batch 32 --evaluation_steps 1000 --epochs 1
echo "Second Step Continuously Training STS Dataset (ROBERTA-LARGE)"
CUDA_VISIBLE_DEVICES=0 python con_training_sts.py --model klue/roberta-large --batch 32 --evaluation_steps 1000 --epochs 4
================================================
FILE: KoSBERT/training_nli.py
================================================
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='klue/bert-base')
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--evaluation_steps', type=int, default=1000)
parser.add_argument('--epochs', type=int, default=1)
args = parser.parse_args()

model_name = args.model
train_batch_size = args.batch
model_save_path = 'output/training_nli_'+model_name.replace("/", "-")#+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


def _read_sts_samples(path):
    """Read a tab-separated STS file (sentence1<TAB>sentence2<TAB>score).

    Gold scores are on a 0-5 scale and are normalized to [0, 1], the range
    expected by EmbeddingSimilarityEvaluator.
    """
    samples = []
    with open(path, 'rt', encoding='utf-8') as fIn:
        for line in fIn:
            s1, s2, score = line.split('\t')
            score = float(score.strip()) / 5.0
            samples.append(InputExample(texts=[s1, s2], label=score))
    return samples


logging.info("Read AllNLI train dataset")
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = []
with open('../Dataset/snli_1.0_train.ko.tsv', "rt", encoding="utf-8") as fIn:
    for line in fIn:
        s1, s2, label = line.split('\t')
        # Map the textual NLI label to its integer class id.
        label = label2int[label.strip()]
        train_samples.append(InputExample(texts=[s1, s2], label=label))

train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))

# Read STSbenchmark dataset and use it as development set
# (dev/test share one TSV format, so a single helper reads both — previously
# the parsing loop was copy-pasted).
logging.info("Read STSbenchmark dev dataset")
dev_samples = _read_sts_samples('../Dataset/tune_sts_dev.tsv')
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev')

num_epochs = args.epochs
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=args.evaluation_steps,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

test_samples = _read_sts_samples('../Dataset/tune_sts_test.tsv')

print("\n\n\n")
print("======================TEST===================")
print("\n\n\n")
model = SentenceTransformer(model_save_path)
print(f"model save path > {model_save_path}")
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test')
test_evaluator(model, output_path=model_save_path)
================================================
FILE: KoSentenceT5/README.md
================================================
# KoSentenceT5
KoSentenceT5 : Korean Sentence Embeddings using T5. <br>
> **Warning** <br>
> This repository uses ETRI-T5 model and does not provide it. You can download T5 model from [here](https://aiopen.etri.re.kr/service_dataset.php).
## Training
- Before training or evaluation, please download the datasets by running
```
bash get_model_dataset.sh
```
### Train KoSentenceT5
```
python main.py \
--model etri-t5 \
--multi_gpu True \
--test False \
--max_len 110 \
--batch_size 64 \
--epochs 2 \
--eval_steps 125 \
--lr 0.0001 \
--warmup_ratio 0.01 \
--temperature 0.05 \
--path_to_data ../Dataset/ \
--train_data train_nli.tsv \
--valid_data valid_sts.tsv
```
### Evaluation
```
python main.py \
--model etri-t5 \
--train False \
--test True \
--max_len 110 \
--batch_size 64 \
--temperature 0.05 \
--path_to_data ../Dataset/ \
--test_data test_sts.tsv
```
### Run Examples
```
bash run_example.sh
```
================================================
FILE: KoSentenceT5/apex/RNN/README.md
================================================
Under construction...
================================================
FILE: KoSentenceT5/apex/RNN/RNNBackend.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import math
def is_iterable(maybe_iterable):
    """Return True when *maybe_iterable* is a list or a tuple."""
    return isinstance(maybe_iterable, (list, tuple))


def flatten_list(tens_list):
    """Stack a list/tuple of equally-shaped tensors along a new leading dim.

    Non-list/tuple inputs are returned unchanged.
    """
    if not is_iterable(tens_list):
        return tens_list
    stacked = torch.cat(tens_list, dim=0)
    return stacked.view(len(tens_list), *tens_list[0].size())
#These modules always assumes batch_first
class bidirectionalRNN(nn.Module):
    """
    Bidirectional wrapper: runs a forward and a backward stackedRNN over the
    input and concatenates their outputs and hidden states on the last dim.

    Note: *dropout* is stored but not applied here (it is forwarded to the
    stackedRNN children).
    """
    def __init__(self, inputRNN, num_layers=1, dropout=0):
        super(bidirectionalRNN, self).__init__()
        self.dropout = dropout
        self.fwd = stackedRNN(inputRNN, num_layers=num_layers, dropout=dropout)
        self.bckwrd = stackedRNN(inputRNN.new_like(), num_layers=num_layers, dropout=dropout)
        self.rnns = nn.ModuleList([self.fwd, self.bckwrd])

    # collect_hidden=True returns all hidden/cell states from the entire RNN
    def forward(self, input, collect_hidden=False):
        """Run both directions and concatenate results feature-wise."""
        fwd_out, fwd_hiddens = self.fwd(input, collect_hidden=collect_hidden)
        bckwrd_out, bckwrd_hiddens = self.bckwrd(input, reverse=True, collect_hidden=collect_hidden)

        output = torch.cat([fwd_out, bckwrd_out], -1)
        hiddens = tuple(torch.cat(hidden, -1) for hidden in zip(fwd_hiddens, bckwrd_hiddens))
        return output, hiddens

    def reset_parameters(self):
        """Reset parameters of both directions."""
        for rnn in self.rnns:
            rnn.reset_parameters()

    def init_hidden(self, bsz):
        """Initialize hidden state for batch size *bsz* in both directions."""
        for rnn in self.rnns:
            rnn.init_hidden(bsz)

    def detach_hidden(self):
        """Detach hidden states from the autograd graph.

        Fix: the original called ``rnn.detachHidden()``, a method that does not
        exist on stackedRNN (which defines ``detach_hidden``), so this always
        raised AttributeError.
        """
        for rnn in self.rnns:
            rnn.detach_hidden()

    def reset_hidden(self, bsz):
        """Drop and re-create hidden state in both directions."""
        for rnn in self.rnns:
            rnn.reset_hidden(bsz)

    def init_inference(self, bsz):
        """Forward init_inference to both directions."""
        for rnn in self.rnns:
            rnn.init_inference(bsz)
#assumes hidden_state[0] of inputRNN is output hidden state
#constructor either takes an RNNCell or list of RNN layers
class stackedRNN(nn.Module):
    """
    Stack of RNN cells unrolled over time, one timestep at a time.

    The constructor accepts either a single RNNCell (replicated num_layers
    times via new_like) or a list of pre-built cells of length num_layers.
    hidden_state[0] of each cell is treated as that layer's output.
    """
    def __init__(self, inputRNN, num_layers=1, dropout=0):
        # NOTE(review): dropout is stored but never applied in forward().
        super(stackedRNN, self).__init__()
        self.dropout = dropout
        if isinstance(inputRNN, RNNCell):
            self.rnns = [inputRNN]
            # Deeper layers consume the previous layer's output_size.
            for i in range(num_layers-1):
                self.rnns.append(inputRNN.new_like(inputRNN.output_size))
        elif isinstance(inputRNN, list):
            assert len(inputRNN) == num_layers, "RNN list length must be equal to num_layers"
            self.rnns = inputRNN
        else:
            raise RuntimeError()

        self.nLayers = len(self.rnns)

        self.rnns = nn.ModuleList(self.rnns)

    '''
    Returns output as hidden_state[0] Tensor([sequence steps][batch size][features])
    If collect hidden will also return Tuple(
    [n_hidden_states][sequence steps] Tensor([layer][batch size][features])
    )
    If not collect hidden will also return Tuple(
    [n_hidden_states] Tensor([layer][batch size][features])
    '''
    def forward(self, input, collect_hidden=False, reverse=False):
        """Unroll all layers over the sequence (optionally in reverse time order).

        input: Tensor([seq_len][bsz][features]) — NOT batch_first.
        """
        seq_len = input.size(0)
        bsz = input.size(1)

        inp_iter = reversed(range(seq_len)) if reverse else range(seq_len)

        hidden_states = [[] for i in range(self.nLayers)]
        outputs = []

        for seq in inp_iter:
            for layer in range(self.nLayers):
                if layer == 0:
                    # First layer reads the raw input at this timestep.
                    prev_out = input[seq]
                outs = self.rnns[layer](prev_out)

                if collect_hidden:
                    hidden_states[layer].append(outs)
                elif seq == seq_len-1:
                    # Keep only the last processed timestep's states.
                    hidden_states[layer].append(outs)

                # Each layer feeds its output state to the next layer.
                prev_out = outs[0]

            outputs.append(prev_out)

        if reverse:
            outputs = list(reversed(outputs))
        '''
        At this point outputs is in format:
        list( [seq_length] x Tensor([bsz][features]) )
        need to convert it to:
        list( Tensor([seq_length][bsz][features]) )
        '''
        output = flatten_list(outputs)

        '''
        hidden_states at this point is in format:
        list( [layer][seq_length][hidden_states] x Tensor([bsz][features]) )
        need to convert it to:
        For not collect hidden:
        list( [hidden_states] x Tensor([layer][bsz][features]) )
        For collect hidden:
        list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
        '''
        if not collect_hidden:
            seq_len = 1
        n_hid = self.rnns[0].n_hidden_states
        new_hidden = [ [ [ None for k in range(self.nLayers)] for j in range(seq_len) ] for i in range(n_hid) ]

        # Transpose the nesting order from [layer][seq][state] to
        # [state][seq][layer].
        for i in range(n_hid):
            for j in range(seq_len):
                for k in range(self.nLayers):
                    new_hidden[i][j][k] = hidden_states[k][j][i]

        hidden_states = new_hidden
        #Now in format list( [hidden_states][seq_length][layer] x Tensor([bsz][features]) )
        #Reverse seq_length if reverse
        if reverse:
            hidden_states = list( list(reversed(list(entry))) for entry in hidden_states)

        #flatten layer dimension into tensor
        # NOTE(review): `hiddens` is computed but never used below — the
        # un-flattened `hidden_states` is what gets returned. This looks like
        # an upstream oversight, but callers (e.g. bidirectionalRNN) consume
        # the current return format, so it is left unchanged; confirm before
        # fixing.
        hiddens = list( list(
            flatten_list(seq) for seq in hidden )
            for hidden in hidden_states )

        #Now in format list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
        #Remove seq_length dimension if not collect_hidden
        if not collect_hidden:
            hidden_states = list( entry[0] for entry in hidden_states)
        return output, hidden_states

    def reset_parameters(self):
        """Reset parameters of every layer."""
        for rnn in self.rnns:
            rnn.reset_parameters()

    def init_hidden(self, bsz):
        """Initialize hidden state of every layer for batch size *bsz*."""
        for rnn in self.rnns:
            rnn.init_hidden(bsz)

    def detach_hidden(self):
        """Detach every layer's hidden state from the autograd graph."""
        for rnn in self.rnns:
            rnn.detach_hidden()

    def reset_hidden(self, bsz):
        """Drop and re-create every layer's hidden state."""
        for rnn in self.rnns:
            rnn.reset_hidden(bsz)

    def init_inference(self, bsz):
        """Forward init_inference to every layer.

        NOTE(review): RNNCell (in this file) defines no init_inference, so
        this raises AttributeError when the layers are RNNCells — confirm
        intended usage.
        """
        for rnn in self.rnns:
            rnn.init_inference(bsz)
class RNNCell(nn.Module):
    """
    Generic recurrent cell wrapper around a functional cell implementation.

    gate_multiplier is architecture-dependent: 4 for LSTM-like cells, 3 for
    GRU-like cells. Input is always assumed NOT batch_first. When
    output_size differs from hidden_size, an output projection (w_ho) is
    applied to hidden state 0. n_hidden_states is the number of hidden
    tensors the wrapped cell needs: with 1 the state is passed to the cell
    as a bare tensor, otherwise as a list.
    """
    def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_hidden_states=2, bias=False, output_size=None):
        super(RNNCell, self).__init__()

        self.gate_multiplier = gate_multiplier
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.cell = cell
        self.bias = bias
        # Default the output size to the hidden size (no projection).
        self.output_size = hidden_size if output_size is None else output_size

        self.gate_size = gate_multiplier * self.hidden_size
        self.n_hidden_states = n_hidden_states

        self.w_ih = nn.Parameter(torch.Tensor(self.gate_size, self.input_size))
        self.w_hh = nn.Parameter(torch.Tensor(self.gate_size, self.output_size))

        # A recurrent projection is only needed when output != hidden size.
        if self.output_size != self.hidden_size:
            self.w_ho = nn.Parameter(torch.Tensor(self.output_size, self.hidden_size))

        self.b_ih = self.b_hh = None
        if self.bias:
            self.b_ih = nn.Parameter(torch.Tensor(self.gate_size))
            self.b_hh = nn.Parameter(torch.Tensor(self.gate_size))

        # Hidden-state slots used by forward(); allocated lazily.
        self.hidden = [None] * self.n_hidden_states

        self.reset_parameters()

    def new_like(self, new_input_size=None):
        """Build a fresh cell with the same configuration, optionally
        changing only the input size."""
        size = self.input_size if new_input_size is None else new_input_size
        return type(self)(self.gate_multiplier,
                          size,
                          self.hidden_size,
                          self.cell,
                          self.n_hidden_states,
                          self.bias,
                          self.output_size)

    def reset_parameters(self, gain=1):
        """Uniform init in [-1/sqrt(hidden_size), +1/sqrt(hidden_size)].

        ``gain`` is accepted for API compatibility but unused.
        """
        stdev = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdev, stdev)

    def init_hidden(self, bsz):
        """(Re)allocate zeroed hidden states whenever the batch size changes."""
        # Grab any parameter to inherit device/dtype for the new state.
        for anchor in self.parameters():
            if anchor is not None:
                break
        for idx in range(len(self.hidden)):
            state = self.hidden[idx]
            if state is None or state.data.size()[0] != bsz:
                # Slot 0 carries the (possibly projected) output state.
                size = self.output_size if idx == 0 else self.hidden_size
                zeros = anchor.data.new(bsz, size).zero_()
                self.hidden[idx] = Variable(zeros, requires_grad=False)

    def reset_hidden(self, bsz):
        """Drop all hidden state and re-create zeros for batch size *bsz*."""
        for idx in range(len(self.hidden)):
            self.hidden[idx] = None
        self.init_hidden(bsz)

    def detach_hidden(self):
        """Detach stored hidden states from the graph (truncated BPTT)."""
        if any(state is None for state in self.hidden):
            raise RuntimeError("Must initialize hidden state before you can detach it")
        for idx, state in enumerate(self.hidden):
            self.hidden[idx] = state.detach()

    def forward(self, input):
        """Run one timestep of the wrapped cell.

        Hidden state is created lazily if missing or if the batch size
        changed since the previous call.
        """
        self.init_hidden(input.size()[0])

        hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
        new_hidden = self.cell(input, hidden_state, self.w_ih, self.w_hh, b_ih=self.b_ih, b_hh=self.b_hh)
        self.hidden = list(new_hidden) if self.n_hidden_states > 1 else [new_hidden]

        if self.output_size != self.hidden_size:
            self.hidden[0] = F.linear(self.hidden[0], self.w_ho)

        return tuple(self.hidden)
================================================
FILE: KoSentenceT5/apex/RNN/__init__.py
================================================
from .models import LSTM, GRU, ReLU, Tanh, mLSTM
__all__ = ['models']
================================================
FILE: KoSentenceT5/apex/RNN/cells.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from .RNNBackend import RNNCell
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
import math
class mLSTMRNNCell(RNNCell):
    """
    Multiplicative-LSTM cell: an RNNCell with two extra multiplicative
    projection weights (w_mih, w_mhh) that are passed through to mLSTMCell.
    """
    def __init__(self, input_size, hidden_size, bias=False, output_size=None):
        # gate_multiplier=4 and two hidden states, as for a regular LSTM.
        super(mLSTMRNNCell, self).__init__(4, input_size, hidden_size, mLSTMCell,
                                           n_hidden_states=2, bias=bias,
                                           output_size=output_size)

        self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size))
        self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size))

        # Re-run the init so the two new weights are initialized as well.
        self.reset_parameters()

    def forward(self, input):
        """One mLSTM timestep; hidden state is allocated lazily on
        batch-size change."""
        self.init_hidden(input.size()[0])

        hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
        self.hidden = list(
            self.cell(input, hidden_state, self.w_ih, self.w_hh,
                      self.w_mih, self.w_mhh,
                      b_ih=self.b_ih, b_hh=self.b_hh)
        )

        if self.output_size != self.hidden_size:
            self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
        return tuple(self.hidden)

    def new_like(self, new_input_size=None):
        """Clone this cell's configuration, optionally with a new input size."""
        if new_input_size is None:
            new_input_size = self.input_size
        return type(self)(new_input_size,
                          self.hidden_size,
                          self.bias,
                          self.output_size)
def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
    """
    Functional multiplicative-LSTM step.

    The multiplicative state m = (input @ w_mih.T) * (h @ w_mhh.T) replaces
    the raw hidden state in the recurrent gate pre-activation; otherwise
    this is a standard LSTM update.

    Args:
        input:  (batch, input_size) input for this timestep.
        hidden: (h, c) pair of previous hidden and cell states.
        w_ih, w_hh: input->gates and m->gates weight matrices.
        w_mih, w_mhh: multiplicative projection weights.
        b_ih, b_hh: optional gate biases.

    Returns:
        (hy, cy): next hidden and cell states.
    """
    if input.is_cuda:
        # Fused CUDA path: the biases are applied inside the fused kernel.
        igates = F.linear(input, w_ih)
        m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
        hgates = F.linear(m, w_hh)

        state = fusedBackend.LSTMFused.apply
        return state(igates, hgates, hidden[1], b_ih, b_hh)

    hx, cx = hidden
    m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
    gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

    # Fix: F.sigmoid / F.tanh are deprecated (and removed in newer PyTorch);
    # use the numerically identical torch namespace equivalents.
    ingate = torch.sigmoid(ingate)
    forgetgate = torch.sigmoid(forgetgate)
    cellgate = torch.tanh(cellgate)
    outgate = torch.sigmoid(outgate)

    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * torch.tanh(cy)

    return hy, cy
================================================
FILE: KoSentenceT5/apex/RNN/models.py
================================================
import torch
from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell
from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
from .cells import mLSTMRNNCell, mLSTMCell
def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout=0):
    """Wrap *inputRNN* in a stacked (and optionally bidirectional) backend."""
    wrapper = bidirectionalRNN if bidirectional else stackedRNN
    return wrapper(inputRNN, num_layers, dropout=dropout)
def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size=None):
    """LSTM built on the apex RNN backend (4 gates, 2 hidden states).

    ``batch_first`` is accepted for API parity but unused.
    """
    cell = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size)
    return toRNNBackend(cell, num_layers, bidirectional, dropout=dropout)
def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size=None):
    """GRU built on the apex RNN backend (3 gates, 1 hidden state).

    ``batch_first`` is accepted for API parity but unused.
    """
    cell = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size)
    return toRNNBackend(cell, num_layers, bidirectional, dropout=dropout)
def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size=None):
    """ReLU RNN built on the apex RNN backend (1 gate, 1 hidden state).

    ``batch_first`` is accepted for API parity but unused.
    """
    cell = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size)
    return toRNNBackend(cell, num_layers, bidirectional, dropout=dropout)
def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size=None):
    """Tanh RNN built on the apex RNN backend (1 gate, 1 hidden state).

    ``batch_first`` is accepted for API parity but unused.
    """
    cell = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size)
    return toRNNBackend(cell, num_layers, bidirectional, dropout=dropout)
def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size=None):
    """Multiplicative LSTM built on the apex RNN backend.

    ``batch_first`` is accepted for API parity but unused.
    """
    cell = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size)
    return toRNNBackend(cell, num_layers, bidirectional, dropout=dropout)
================================================
FILE: KoSentenceT5/apex/__init__.py
================================================
# May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
import torch
import warnings
if torch.distributed.is_available():
from . import parallel
from . import amp
from . import fp16_utils
# For optimizers and normalization there is no Python fallback.
# Absence of cuda backend is a hard error.
# I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
# to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
# so they expect those backends to be available, but for some reason they actually aren't
# available (for example because they built improperly in a way that isn't revealed until
# load time) the error message is timely and visible.
from . import optimizers
from . import normalization
from . import pyprof
================================================
FILE: KoSentenceT5/apex/amp/README.md
================================================
# amp: Automatic Mixed Precision
## Annotating User Functions
Nearly all PyTorch user code needs nothing more than the two steps
above to use amp. After all, custom layers are built out of simpler
PyTorch components, and amp already can see those.
However, any custom C++ or CUDA code is outside of amp's (default)
view of things. For example, suppose I implemented a new recurrent
cell called a "forgetful recurrent unit" that calls directly into a
CUDA backend:
```python
from backend import FRUBackend
def fru(input, hidden, weight, bias):
# call to CUDA code
FRUBackend(input, hidden, weight, bias)
```
In this case, it is possible to get a runtime type mismatch. For
example, you might have `input` in fp16, and `weight` in fp32, and amp
doesn't have the visibility to insert an appropriate cast.
amp exposes two ways to handle "invisible" backend code: function
annotations and explicit registration.
#### Function annotation
The first way to handle backend code is a set of function annotations:
- `@amp.half_function`
- `@amp.float_function`
- `@amp.promote_function`
These correspond to:
- Cast all arguments to fp16
- Cast all arguments to fp32
- If there are any type mismatches, cast everything to the widest type
In our example, we believe that the FRU unit is fp16-safe and will get
performance gains from casting its arguments to fp16, so we write:
```python
@amp.half_function
def fru(input, hidden, weight, bias):
#...
```
#### Explicit registration
The other way to handle backend code is with explicit function
registration:
- `amp.register_half_function(module, function_name)`
- `amp.register_float_function(module, function_name)`
- `amp.register_promote_function(module, function_name)`
When using this API, `module` is the containing class or module for
the function, and `function_name` is the _string_ name of the
function. Note that the function must be registered before the call to
`amp.initialize()`.
For our FRU unit, we can register the backend function directly:
```python
import backend
amp.register_half_function(backend, 'FRUBackend')
```
================================================
FILE: KoSentenceT5/apex/amp/__init__.py
================================================
from .amp import init, half_function, float_function, promote_function,\
register_half_function, register_float_function, register_promote_function
from .handle import scale_loss, disable_casts
from .frontend import initialize, state_dict, load_state_dict
from ._amp_state import master_params, _amp_state
================================================
FILE: KoSentenceT5/apex/amp/__version__.py
================================================
# Package version as a (major, minor, patch) tuple plus its dotted string form.
VERSION = (0, 1, 0)
__version__ = "{}.{}.{}".format(*VERSION)
================================================
FILE: KoSentenceT5/apex/amp/_amp_state.py
================================================
# This is a "header object" that allows different amp modules to communicate.
# I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like.
# But apparently it's ok:
# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
import os
import torch
# Parse "major.minor" out of the running torch version string.
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

# torch._six was removed in torch 1.8; later versions (including 2.x) use
# the standard collections.abc module instead.
if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
    from torch._six import container_abcs
else:
    import collections.abc as container_abcs
class AmpState(object):
    """Mutable container for amp-wide configuration shared across modules."""
    def __init__(self):
        # When True, warn_or_err() downgrades config errors to printed warnings.
        self.hard_override = False
        # When True, the fp32 parameter check on incoming models is skipped.
        self.allow_incoming_model_not_fp32 = False
        # 0 = silent, 1 = normal output, 2 = verbose (per _initialize).
        self.verbosity = 1


# Attribute stash shared by the amp submodules.
_amp_state = AmpState()
def warn_or_err(msg):
    """Raise RuntimeError(msg) — or merely print a warning when the global
    hard_override flag is set."""
    if not _amp_state.hard_override:
        raise RuntimeError(msg)
    print("Warning: " + msg)
# I'm not sure if allowing hard_override is a good idea.
# + " If you're sure you know what you're doing, supply " +
# "hard_override=True to amp.initialize.")
def maybe_print(msg, rank0=False):
    """Print *msg* subject to verbosity; with rank0=True only rank 0 prints
    when running distributed with world size > 1."""
    if _amp_state.verbosity <= 0:
        return
    distributed = (torch.distributed.is_available()
                   and torch.distributed.is_initialized()
                   and torch.distributed.get_world_size() > 1)
    if rank0 and distributed:
        if torch.distributed.get_rank() == 0:
            print(msg)
    else:
        print(msg)
# def iter_params(param_groups):
# for group in param_groups:
# for p in group['params']:
# yield p
def master_params(optimizer):
    """
    Generator that yields every param owned by ``optimizer``, walking its
    param groups in order.

    Args:
        optimizer: An optimizer previously returned from ``amp.initialize``.
    """
    for group in optimizer.param_groups:
        yield from group['params']
================================================
FILE: KoSentenceT5/apex/amp/_initialize.py
================================================
import torch
from torch._six import string_classes
import functools
import numpy as np
import sys
from types import MethodType
import warnings
from ._amp_state import _amp_state, warn_or_err, container_abcs
from .handle import disable_casts
from .scaler import LossScaler
from ._process_optimizer import _process_optimizer
from apex.fp16_utils import convert_network
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..contrib.optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
if torch.distributed.is_available():
from ..parallel import DistributedDataParallel as apex_DDP
from ..parallel.LARC import LARC
def to_type(dtype, t):
    """Cast floating-point tensors to *dtype*; pass other tensors through.

    Non-tensor values are trusted to implement ``.to(dtype)`` themselves
    (custom batch classes).
    """
    if not isinstance(t, torch.Tensor):
        # Trust the user's custom batch type.
        return t.to(dtype)
    if not t.is_cuda:
        # Not necessarily an error (it may be legitimate), so only warn.
        warnings.warn("An input tensor was not cuda.")
    # Integer/bool tensors keep their dtype; only floats are cast.
    return t.to(dtype) if t.is_floating_point() else t
# Modified from torch.optim.optimizer.py. This is a bit more general than casted_args in utils.py.
def applier(value, fn):
    """Recursively apply *fn* to every tensor (or ``.to()``-capable object)
    inside *value*, rebuilding mappings and iterables around the results.

    Strings, numpy arrays, and unrecognized plain values pass through
    untouched.
    """
    if isinstance(value, torch.Tensor):
        return fn(value)
    if isinstance(value, string_classes):
        return value
    if isinstance(value, np.ndarray):
        return value
    if hasattr(value, "to"):
        # Custom batch classes opt in by providing a .to() method.
        return fn(value)
    if isinstance(value, container_abcs.Mapping):
        return {applier(k, fn): applier(v, fn) for k, v in value.items()}
    if isinstance(value, container_abcs.Iterable):
        return type(value)(applier(v, fn) for v in value)
    # Scalars (int/float/...) and unknown objects are returned as-is.
    return value
def check_models(models):
    """Reject models that are already wrapped in a parallel wrapper.

    ``amp.initialize`` must see the bare model; DDP/DP wrappers are applied
    by the caller afterwards.

    Raises:
        RuntimeError: if any model is a torch DDP/DP or apex DDP instance.
    """
    for model in models:
        parallel_type = None
        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
            parallel_type = "torch.nn.parallel.DistributedDataParallel"
        # Fix: the original tested "'apex_DDP' in sys.modules", but apex_DDP
        # is a module-global name (bound only when torch.distributed is
        # available), never a sys.modules key — so the apex-DDP check could
        # never fire. globals() matches the "'LARC' in globals()" pattern
        # used by _initialize in this same module.
        if ('apex_DDP' in globals()) and isinstance(model, apex_DDP):
            parallel_type = "apex.parallel.DistributedDataParallel"
        if isinstance(model, torch.nn.parallel.DataParallel):
            parallel_type = "torch.nn.parallel.DataParallel"
        if parallel_type is not None:
            raise RuntimeError("Incoming model is an instance of {}. ".format(parallel_type) +
                "Parallel wrappers should only be applied to the model(s) AFTER \n"
                "the model(s) have been returned from amp.initialize.")
def check_params_fp32(models):
    """Warn (or raise, via warn_or_err) when any floating-point parameter or
    buffer is half precision or not on a CUDA device."""
    for model in models:
        for name, param in model.named_parameters():
            if not param.is_floating_point():
                continue
            if 'Half' in param.type():
                warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
                    "When using amp.initialize, you do not need to call .half() on your model\n"
                    "before passing it, no matter what optimization level you choose.".format(
                    name, param.type()))
            elif not param.is_cuda:
                warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
                    "When using amp.initialize, you need to provide a model with parameters\n"
                    "located on a CUDA device before passing it no matter what optimization level\n"
                    "you chose. Use model.to('cuda') to use the default device.".format(
                    name, param.type()))

        # Backward compatibility for PyTorch 0.4: no named_buffers(), so fall
        # back to iterating the raw _buffers dict.
        if hasattr(model, 'named_buffers'):
            buf_iter = model.named_buffers()
        else:
            buf_iter = model._buffers
        for obj in buf_iter:
            name, buf = obj if type(obj) == tuple else (obj, buf_iter[obj])
            if not buf.is_floating_point():
                continue
            if 'Half' in buf.type():
                warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
                    "When using amp.initialize, you do not need to call .half() on your model\n"
                    "before passing it, no matter what optimization level you choose.".format(
                    name, buf.type()))
            elif not buf.is_cuda:
                warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
                    "When using amp.initialize, you need to provide a model with buffers\n"
                    "located on a CUDA device before passing it no matter what optimization level\n"
                    "you chose. Use model.to('cuda') to use the default device.".format(
                    name, buf.type()))
def check_optimizers(optimizers):
    """Reject optimizers already wrapped in a legacy FP16_Optimizer.

    Raises:
        RuntimeError: if any optimizer is an FP16_Optimizer instance.
    """
    for optim in optimizers:
        bad_optim_type = None
        if isinstance(optim, FP16_Optimizer_general):
            bad_optim_type = "apex.fp16_utils.FP16_Optimizer"
        if isinstance(optim, FP16_Optimizer_for_fused):
            bad_optim_type = "apex.optimizers.FP16_Optimizer"
        if bad_optim_type is None:
            continue
        raise RuntimeError("An incoming optimizer is an instance of {}. ".format(bad_optim_type) +
                           "The optimizer(s) passed to amp.initialize() must be bare \n"
                           "instances of either ordinary Pytorch optimizers, or Apex fused \n"
                           "optimizers.\n")
class O2StateDictHook(object):
    """State-dict hook that recasts half-precision entries back to fp32.

    NOTE(review): *fn* is stored but never used by __call__; the cast is
    performed directly with ``.to(torch.float32)``.
    """
    def __init__(self, fn):
        self.fn = fn

    def __call__(self, module, state_dict, prefix, local_metadata):
        for key in state_dict:
            entry = state_dict[key]
            if 'Half' in entry.type():
                state_dict[key] = entry.to(torch.float32)
def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
    """Shared implementation behind amp.initialize.

    Normalizes *models*/*optimizers* to lists, validates them, applies the
    casting policy described by *properties* (an opt-level properties
    object), wraps optimizers via _process_optimizer, creates one LossScaler
    per loss, and returns models/optimizers in the same shape (single object
    vs. list) they came in.
    """
    from .amp import init as amp_init

    optimizers_was_list = False
    # Accept a single optimizer (or LARC wrapper), None, or a list.
    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
        optimizers = [optimizers]
    elif optimizers is None:
        optimizers = []
    elif isinstance(optimizers, list):
        optimizers_was_list = True
        check_optimizers(optimizers)
    else:
        check_optimizers([optimizers])
        raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")

    if isinstance(models, torch.nn.Module):
        models_was_list = False
        models = [models]
    elif isinstance(models, list):
        models_was_list = True
    else:
        raise TypeError("models must be either a single model or a list of models.")

    check_models(models)

    if not _amp_state.allow_incoming_model_not_fp32:
        check_params_fp32(models)

    # In the future, when FP16_Optimizer can be deprecated and master weights can
    # become an attribute, remember to stash master weights before casting the model.

    if properties.cast_model_type:
        # O2-style: cast the model itself (optionally keeping batchnorm fp32).
        if properties.keep_batchnorm_fp32:
            for model in models:
                convert_network(model, properties.cast_model_type)
        else:
            for model in models:
                model.to(properties.cast_model_type)

        input_caster = functools.partial(to_type, properties.cast_model_type)
        if cast_model_outputs is not None:
            output_caster = functools.partial(to_type, cast_model_outputs)
        else:
            output_caster = functools.partial(to_type, torch.float32)

        for model in models:
            # Patch the forward method to cast incoming data to the correct type, and
            # outgoing data to float32, so "the user never needs to call .half()."
            # I like writing things explicitly more than decorators.
            def patch_forward(old_fwd):
                def new_fwd(*args, **kwargs):
                    output = old_fwd(*applier(args, input_caster),
                                     **applier(kwargs, input_caster))
                    return applier(output, output_caster)
                return new_fwd

            model.forward = patch_forward(model.forward)

        # State dict trick to recast any preexisting per-param state tensors
        for optimizer in optimizers:
            optimizer.load_state_dict(optimizer.state_dict())

        # patch model.state_dict() to return float32 params
        for model in models:
            for module in model.modules():
                module._register_state_dict_hook(O2StateDictHook(functools.partial(to_type, torch.float32)))

    elif cast_model_outputs is not None:
        # Model weights untouched; only cast what forward() returns.
        output_caster = functools.partial(to_type, cast_model_outputs)

        for model in models:
            def patch_forward(old_fwd):
                def new_fwd(*args, **kwargs):
                    output = old_fwd(*args, **kwargs)
                    return applier(output, output_caster)
                return new_fwd

            model.forward = patch_forward(model.forward)

    for i, optimizer in enumerate(optimizers):
        optimizers[i] = _process_optimizer(optimizer, properties)

    # One loss scaler per declared loss.
    # NOTE(review): assumes _amp_state.min_loss_scale / max_loss_scale were
    # set earlier (by the amp frontend) — confirm against frontend.py.
    _amp_state.loss_scalers = []
    for _ in range(num_losses):
        _amp_state.loss_scalers.append(LossScaler(properties.loss_scale,
                                                  min_loss_scale=_amp_state.min_loss_scale,
                                                  max_loss_scale=_amp_state.max_loss_scale))

    if properties.patch_torch_functions:
        # handle is unused here. It's accessible later through a global value anyway.
        handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
        for optimizer in optimizers:
            # Disable Amp casting for the optimizer step, because it should only be
            # applied to FP32 master params anyway.
            def patch_step(old_step):
                def new_step(self, *args, **kwargs):
                    with disable_casts():
                        output = old_step(*args, **kwargs)
                    return output
                return new_step

            optimizer.step = MethodType(patch_step(optimizer.step), optimizer)

    # Return in the same "shape" (scalar vs. list) the caller supplied.
    if optimizers_was_list:
        if models_was_list:
            return models, optimizers
        else:
            return models[0], optimizers
    else:
        if models_was_list:
            if len(optimizers) == 0:
                return models
            else:
                return models, optimizers[0]
        else:
            if len(optimizers) == 0:
                return models[0]
            else:
                return models[0], optimizers[0]
================================================
FILE: KoSentenceT5/apex/amp/_process_optimizer.py
================================================
import types
from ..fp16_utils import master_params_to_model_params
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import maybe_print
import torch
from ..optimizers import FusedSGD
class AmpOptimizerState(object):
    """Empty attribute container stashed on optimizers as ``_amp_stash``."""
    def __init__(self):
        pass
def _master_params_to_model_params(self):
    """Copy fp32 master params back into the fp16 model params.

    Uses the fused multi-tensor kernel when available; otherwise falls back
    to the per-group copy helper.
    """
    stash = self._amp_stash
    if not multi_tensor_applier.available:
        for fp16_group, fp32_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
            master_params_to_model_params(fp16_group, fp32_group)
        return
    if len(stash.all_fp16_params) > 0:
        multi_tensor_applier(stash.multi_tensor_scale,
                             stash.dummy_overflow_buf,
                             [stash.all_fp32_from_fp16_params, stash.all_fp16_params],
                             1.0)
def lazy_init_with_master_weights(self):
    """One-time setup for the master-weights path.

    Walks every param group and replaces each trainable FP16 param with a
    freshly created FP32 master copy (so the optimizer steps in FP32);
    trainable FP32 params are kept as-is.  Flat per-category lists are
    stashed for the backward/step hooks, grads are cleared, and any
    preexisting per-param optimizer state is recast via a state_dict
    round-trip.  Raises TypeError for any other CUDA tensor type.
    """
    stash = self._amp_stash
    stash.fp16_groups = []
    stash.fp32_from_fp16_groups = []
    stash.fp32_from_fp32_groups = []
    # NOTE(review): the inner loop below reuses ``i`` and shadows this
    # group index; the outer index is otherwise unused, so it is harmless.
    for i, param_group in enumerate(self.param_groups):
        fp16_params_this_group = []
        fp32_params_this_group = []
        fp32_from_fp16_params_this_group = []
        for i, param in enumerate(param_group['params']):
            if param.requires_grad:
                if param.type() == 'torch.cuda.HalfTensor':
                    fp16_params_this_group.append(param)
                    # FP32 master copy that the optimizer will actually update.
                    master_param = param.detach().clone().float()
                    master_param.requires_grad = True
                    param_group['params'][i] = master_param
                    fp32_from_fp16_params_this_group.append(master_param)
                    # Reset existing state dict key to the new master param.
                    # We still need to recast per-param state tensors, if any, to FP32.
                    if param in self.state:
                        self.state[master_param] = self.state.pop(param)
                elif param.type() == 'torch.cuda.FloatTensor':
                    fp32_params_this_group.append(param)
                    param_group['params'][i] = param
                else:
                    raise TypeError("Optimizer's parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))
        stash.fp16_groups.append(fp16_params_this_group)
        stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
        stash.fp32_from_fp32_groups.append(fp32_params_this_group)
    # Flattened views consumed by the prepare/post backward hooks.
    stash.all_fp16_params = []
    for group in stash.fp16_groups:
        stash.all_fp16_params += group
    stash.all_fp32_from_fp16_params = []
    for group in stash.fp32_from_fp16_groups:
        stash.all_fp32_from_fp16_params += group
    stash.all_fp32_from_fp32_params = []
    for group in stash.fp32_from_fp32_groups:
        stash.all_fp32_from_fp32_params += group
    # all_fp16_grad_stash is only needed for fused optimizers.
    stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
    stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
    # Grads start cleared; the backward hooks manage them from here on.
    for param in stash.all_fp32_from_fp16_params:
        param.grad = None
    for param in stash.all_fp32_from_fp32_params:
        param.grad = None
    # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
    self.load_state_dict(self.state_dict())
def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None):
    """Unscale freshly produced grads in place, accumulating stashed grads.

    ``params`` own their gradients directly (the model params serve as
    masters).  Entries of ``stashed_grads`` are consumed and reset to None.
    ``scale_override``, when given, is a (grads_have_scale,
    stashed_have_scale, out_scale) triple.
    """
    grads_have_scale = scaler.loss_scale()
    stashed_have_scale = 1.0
    out_scale = 1.0
    # not much to do if scale == 1.0 and static scaling
    if scaler.loss_scale() == 1.0 and not scaler.dynamic:
        # Clear the stash.
        for idx in range(len(stashed_grads)):
            stashed_grads[idx] = None
        return
    if scale_override is not None:
        grads_have_scale, stashed_have_scale, out_scale = scale_override
    # Partition grads by whether a stashed counterpart must be accumulated.
    # (This is a lot of python overhead...)
    fresh_grads = []
    fresh_grads_with_stash = []
    matching_stash = []
    for param, stashed_grad in zip(params, stashed_grads):
        has_grad = param.grad is not None
        has_stash = stashed_grad is not None
        if not has_grad and has_stash:
            # Backward produced nothing; the stashed grad simply survives.
            param.grad = stashed_grad
        elif has_grad and not has_stash:
            fresh_grads.append(param.grad)
        elif has_grad and has_stash:
            fresh_grads_with_stash.append(param.grad)
            matching_stash.append(stashed_grad)
    # unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale.
    if fresh_grads:
        scaler.unscale(
            fresh_grads,
            fresh_grads,
            None,  # unused_scale, currently present to avoid API breakage elsewhere
            models_are_masters=True,
            scale_override=grads_have_scale / out_scale)
    if fresh_grads_with_stash:
        scaler.unscale_with_stashed(
            fresh_grads_with_stash,
            matching_stash,
            fresh_grads_with_stash,
            scale_override=(grads_have_scale, stashed_have_scale, out_scale))
    # Stash entries are consumed exactly once.
    for idx in range(len(stashed_grads)):
        stashed_grads[idx] = None
def prepare_backward_with_master_weights(self):
    """Stash/clear gradients ahead of backward (master-weights path).

    FP16 model grads are cleared so backward can allocate them fresh
    (grad copy elision); preexisting FP32 grads are stashed so they can
    be re-accumulated after unscaling.
    """
    stash = self._amp_stash
    self._amp_lazy_init()
    # Idiom fix: the original enumerated this list but never used the index.
    for param in stash.all_fp16_params:
        # Set up to leverage grad copy elision.
        # This may behave differently from an unpatched optimizer if
        # zero_grad is used and the param is unused.
        param.grad = None
    for i, param in enumerate(stash.all_fp32_from_fp32_params):
        stash.all_fp32_from_fp32_grad_stash[i] = param.grad
        # Set up to leverage grad copy elision:
        param.grad = None
def post_backward_with_master_weights(self, scaler):
    """After backward: unscale FP16 model grads into the FP32 master grads."""
    stash = self._amp_stash
    self._amp_lazy_init()
    # Partition fp16/master pairs by which side already holds a gradient.
    # (This is a lot of python overhead...)
    fresh_fp16_grads = []
    fresh_fp32_targets = []
    fp16_grads_with_master = []
    existing_master_grads = []
    for model_param, master_param in zip(stash.all_fp16_params,
                                         stash.all_fp32_from_fp16_params):
        model_grad = model_param.grad
        if model_grad is None:
            # Nothing new from backward; any existing master grad stands.
            continue
        if master_param.grad is None:
            # Allocate a master grad buffer and unscale into it.
            master_param.grad = torch.empty_like(master_param)
            fresh_fp16_grads.append(model_grad)
            fresh_fp32_targets.append(master_param.grad)
        else:
            fp16_grads_with_master.append(model_grad)
            existing_master_grads.append(master_param.grad)
    if len(fresh_fp16_grads) > 0:
        scaler.unscale(
            fresh_fp16_grads,
            fresh_fp32_targets,
            scaler.loss_scale(),
            models_are_masters=False)
    if len(fp16_grads_with_master) > 0:
        scaler.unscale_with_stashed(
            fp16_grads_with_master,
            existing_master_grads,
            existing_master_grads)
    # fp32 params can be treated as they would be in the "no_master_weights" case.
    post_backward_models_are_masters(
        scaler,
        stash.all_fp32_from_fp32_params,
        stash.all_fp32_from_fp32_grad_stash)
def lazy_init_no_master_weights(self):
    """One-time setup for the no-master-weights path.

    Flattens all params into fp16/fp32 lists and allocates a parallel
    grad stash for each.  Raises TypeError for any other tensor type.
    """
    stash = self._amp_stash
    stash.all_fp16_params = []
    stash.all_fp32_params = []
    # Idiom fix: the original enumerated both loops with the same unused
    # (and shadowed) index variable; iterate directly instead.
    for param_group in self.param_groups:
        for param in param_group['params']:
            if param.type() == 'torch.cuda.HalfTensor':
                stash.all_fp16_params.append(param)
            elif param.type() == 'torch.cuda.FloatTensor':
                stash.all_fp32_params.append(param)
            else:
                raise TypeError("Optimizer's parameters must be either "
                                "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                "Received {}".format(param.type()))
    stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
    stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params]
def prepare_backward_no_master_weights(self):
    """Stash the current grads and clear them ahead of the backward pass."""
    stash = self._amp_stash
    self._amp_lazy_init()
    pairs = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
             (stash.all_fp32_params, stash.all_fp32_grad_stash))
    for params, grad_stash in pairs:
        for idx, param in enumerate(params):
            grad_stash[idx] = param.grad
            # Clearing .grad lets backward allocate it fresh (copy elision).
            param.grad = None
def post_backward_no_master_weights(self, scaler):
    """After backward: unscale grads in place for both param dtypes."""
    stash = self._amp_stash
    self._amp_lazy_init()
    # Model params serve directly as masters here, so unscale in place.
    for params, stashed_grads in ((stash.all_fp16_params, stash.all_fp16_grad_stash),
                                  (stash.all_fp32_params, stash.all_fp32_grad_stash)):
        post_backward_models_are_masters(scaler, params, stashed_grads)
#####################################################################################
# FusedSGD versions
#####################################################################################
# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
# outside the kernel, so we must accumulate directly into the model grads.
def prepare_backward_with_master_weights_FusedSGD(self):
    """FusedSGD variant of the master-weights pre-backward hook.

    When master grads are not materialized, FusedSGD accumulates directly
    into the model grads, so fp16 grads are stashed as well (not only the
    fp32 ones as in the generic path).
    """
    if self.materialize_master_grads:
        prepare_backward_with_master_weights(self)
        return
    stash = self._amp_stash
    self._amp_lazy_init()
    pairs = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
             (stash.all_fp32_from_fp32_params, stash.all_fp32_from_fp32_grad_stash))
    for params, grad_stash in pairs:
        for idx, param in enumerate(params):
            grad_stash[idx] = param.grad
            # Set up to leverage grad copy elision:
            param.grad = None
def post_backward_with_master_weights_FusedSGD(self, scaler):
    """FusedSGD variant of the master-weights post-backward hook.

    Without materialized master grads, stashed grads still carry the scale
    in effect when they were produced (self.most_recent_scale), so a
    (grads, stashed, out) scale triple is forwarded to the unscale helper.
    """
    if self.materialize_master_grads:
        post_backward_with_master_weights(self, scaler)
        return
    stash = self._amp_stash
    self._amp_lazy_init()
    grads_have_scale = scaler.loss_scale()
    stashed_have_scale = self.most_recent_scale
    out_scale = grads_have_scale
    if self.scale_set_by_backward:
        out_scale = min(grads_have_scale, self.most_recent_scale)
    # unscale_with_stashed() implements grads*1/scale + stashed_grads*1.
    # stashed_grads are scaled by self.most_recent_scale.
    for params, stashed_grads in ((stash.all_fp16_params, stash.all_fp16_grad_stash),
                                  (stash.all_fp32_from_fp32_params,
                                   stash.all_fp32_from_fp32_grad_stash)):
        post_backward_models_are_masters(scaler, params, stashed_grads,
                                         (grads_have_scale, stashed_have_scale, out_scale))
    self.most_recent_scale = out_scale
    self.scale_set_by_backward = True
def prepare_backward_no_master_weights_FusedSGD(self):
    """FusedSGD delegates to the generic no-master-weights pre-backward hook."""
    return prepare_backward_no_master_weights(self)
def post_backward_no_master_weights_FusedSGD(self, scaler):
    """FusedSGD delegates to the generic no-master-weights post-backward hook."""
    return post_backward_no_master_weights(self, scaler)
def _amp_lazy_init(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
def _process_optimizer(optimizer, properties):
    """Patch ``optimizer`` in place so it cooperates with Amp.

    Attaches an ``_amp_stash`` bookkeeping object, binds the lazy-init and
    backward hooks appropriate to ``properties.master_weights``, and wraps
    ``step``, ``zero_grad`` and ``add_param_group``.  Returns the same
    optimizer object.

    Raises:
        RuntimeError: if the optimizer was already processed by Amp, or
            already defines one of the attribute names Amp installs.
    """
    if hasattr(optimizer, "_amp_stash"):
        raise RuntimeError("A given optimizer should only be passed through amp.initialize once.")
    else:
        optimizer._amp_stash = AmpOptimizerState()
    optimizer._amp_stash.lazy_init_called = False
    optimizer._amp_stash.already_patched = False
    optimizer._amp_stash.params_have_scaled_gradients = False
    # Refuse to clobber any attribute Amp is about to install.
    for name in ("_lazy_init_maybe_master_weights",
                 "_master_params_to_model_params",
                 "_prepare_amp_backward",
                 "_post_amp_backward",
                 "_amp_lazy_init"):
        if hasattr(optimizer, name):
            raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
    # TODO: Centralize exposure and import error checking for the C backend.
    if multi_tensor_applier.available:
        import amp_C
        optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
        optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
        optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);
    if properties.master_weights:
        optimizer._lazy_init_maybe_master_weights = types.MethodType(
            lazy_init_with_master_weights, optimizer)
        optimizer._master_params_to_model_params = types.MethodType(
            _master_params_to_model_params, optimizer)
        old_step = optimizer.step
        def new_step(self, closure=None):
            # Step on the FP32 masters, then push the updated masters back
            # into the FP16 model params (FusedSGD does that copy itself).
            if closure is not None:
                raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
            retval = old_step()
            if not isinstance(self, FusedSGD):
                self._master_params_to_model_params()
            # Clear the master grads that wouldn't be zeroed by model.zero_grad()
            for param in self._amp_stash.all_fp32_from_fp16_params:
                param.grad = None
            return retval
        optimizer.step = types.MethodType(new_step, optimizer)
        old_zero_grad = optimizer.zero_grad
        # NOTE(review): old_zero_grad is captured but never invoked below;
        # the replacement fully reimplements zero_grad.
        def new_zero_grad(self):
            stash = self._amp_stash
            self._amp_lazy_init()
            # Zero the model grads.
            for param in stash.all_fp16_params:
                if param.grad is not None:
                    param.grad.detach_()
                    param.grad.zero_()
            for param in stash.all_fp32_from_fp32_params:
                if param.grad is not None:
                    param.grad.detach_()
                    param.grad.zero_()
            # Clear the master grads that are independent of model grads
            for param in self._amp_stash.all_fp32_from_fp16_params:
                param.grad = None
        optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
        # FusedSGD gets its own backward hooks (it may skip materializing
        # master grads); other optimizers use the generic ones.
        if isinstance(optimizer, FusedSGD):
            optimizer._prepare_amp_backward = types.MethodType(
                prepare_backward_with_master_weights_FusedSGD, optimizer)
            optimizer._post_amp_backward = types.MethodType(
                post_backward_with_master_weights_FusedSGD, optimizer)
        else:
            optimizer._prepare_amp_backward = types.MethodType(
                prepare_backward_with_master_weights, optimizer)
            optimizer._post_amp_backward = types.MethodType(
                post_backward_with_master_weights, optimizer)
    else:
        optimizer._lazy_init_maybe_master_weights = types.MethodType(
            lazy_init_no_master_weights, optimizer)
        if isinstance(optimizer, FusedSGD):
            optimizer._prepare_amp_backward = types.MethodType(
                prepare_backward_no_master_weights_FusedSGD, optimizer)
            optimizer._post_amp_backward = types.MethodType(
                post_backward_no_master_weights_FusedSGD, optimizer)
        else:
            optimizer._prepare_amp_backward = types.MethodType(
                prepare_backward_no_master_weights, optimizer)
            optimizer._post_amp_backward = types.MethodType(
                post_backward_no_master_weights, optimizer)
    optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)
    old_add_param_group = optimizer.add_param_group
    def new_add_param_group(self, new_group):
        # Force lazy init first so the stash lists exist before extending them.
        stash = self._amp_stash
        if not stash.lazy_init_called:
            self._lazy_init_maybe_master_weights()
            stash.lazy_init_called = True
        assert isinstance(new_group, dict), "param group must be a dict"
        new_params = new_group['params']
        if isinstance(new_params, torch.Tensor):
            new_group['params'] = [new_params]
        elif isinstance(new_params, set):
            raise TypeError('optimizer parameters need to be organized in ordered collections, but '
                            'the ordering of tensors in sets will change between runs. Please use a list instead.')
        else:
            new_group['params'] = list(new_params)
        if properties.master_weights:
            # Mutate new_group in-place to use FP32 master params
            fp16_params_this_group = []
            fp32_params_this_group = []
            fp32_from_fp16_params_this_group = []
            for i, param in enumerate(new_group['params']):
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        fp16_params_this_group.append(param)
                        master_param = param.detach().clone().float()
                        master_param.requires_grad = True
                        new_group['params'][i] = master_param
                        fp32_from_fp16_params_this_group.append(master_param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        fp32_params_this_group.append(param)
                        new_group['params'][i] = param
                    else:
                        raise TypeError("Optimizer's parameters must be either "
                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                        "Received {}".format(param.type()))
            stash.fp16_groups.append(fp16_params_this_group)
            stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            stash.fp32_from_fp32_groups.append(fp32_params_this_group)
            stash.all_fp16_params += fp16_params_this_group
            stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group
            stash.all_fp32_from_fp32_params += fp32_params_this_group
            stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group]
            # It should be ok to let params be added with existing .grad attributes.
        else:
            for param in new_group['params']:
                if param.type() == 'torch.cuda.HalfTensor':
                    stash.all_fp16_params.append(param)
                    stash.all_fp16_grad_stash.append(None)
                elif param.type() == 'torch.cuda.FloatTensor':
                    stash.all_fp32_params.append(param)
                    stash.all_fp32_grad_stash.append(None)
                else:
                    raise TypeError("Optimizer's parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))
        old_add_param_group(new_group)
    optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer)
    return optimizer
================================================
FILE: KoSentenceT5/apex/amp/amp.py
================================================
from . import compat, rnn_compat, utils, wrap
from .handle import AmpHandle, NoOpHandle
from .lists import functional_overrides, torch_overrides, tensor_overrides
from ._amp_state import _amp_state
from .frontend import *
import functools
import itertools
import torch
# Handle consulted lazily by the decorator wrappers below; set by init().
_DECORATOR_HANDLE = None
# (module, name, cast_fn) triples registered before init() runs.
_USER_CAST_REGISTRY = set()
# (module, name) pairs registered for type promotion before init() runs.
_USER_PROMOTE_REGISTRY = set()
def _decorator_helper(orig_fn, cast_fn, wrap_fn):
    """Return a wrapper around orig_fn that applies Amp casting when active."""
    def wrapper(*args, **kwargs):
        handle = _DECORATOR_HANDLE
        if handle is not None and handle.is_active():
            verbose_cast = utils.verbosify(cast_fn, orig_fn.__name__,
                                           handle.verbose)
            wrapped = wrap_fn(orig_fn, verbose_cast, handle)
            return wrapped(*args, **kwargs)
        # Amp not initialized (or disabled): call straight through.
        return orig_fn(*args, **kwargs)
    return wrapper
# Decorator form
def half_function(fn):
    """Decorator: run fn under an FP16 cast (with caching) when Amp is active."""
    return _decorator_helper(fn, utils.maybe_half,
                             functools.partial(wrap.make_cast_wrapper, try_caching=True))
def float_function(fn):
    """Decorator: run fn under an FP32 cast (no caching) when Amp is active."""
    return _decorator_helper(fn, utils.maybe_float,
                             functools.partial(wrap.make_cast_wrapper, try_caching=False))
def promote_function(fn):
    """Decorator: apply Amp type promotion to fn's arguments when active."""
    return _decorator_helper(fn, utils.maybe_float,
                             functools.partial(wrap.make_promote_wrapper))
# Registry form
def register_half_function(module, name):
    """Register module.name for FP16 casting, applied when amp.init() runs."""
    if hasattr(module, name):
        _USER_CAST_REGISTRY.add((module, name, utils.maybe_half))
    else:
        raise ValueError('No function named {} in module {}.'.format(
            name, module))
def register_float_function(module, name):
    """Register module.name for FP32 casting, applied when amp.init() runs."""
    if hasattr(module, name):
        _USER_CAST_REGISTRY.add((module, name, utils.maybe_float))
    else:
        raise ValueError('No function named {} in module {}.'.format(
            name, module))
def register_promote_function(module, name):
    """Register module.name for type promotion, applied when amp.init() runs."""
    if hasattr(module, name):
        _USER_PROMOTE_REGISTRY.add((module, name))
    else:
        raise ValueError('No function named {} in module {}.'.format(
            name, module))
# Top-level function to insert _all_ the hooks.
def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False):
    """Top-level function to insert _all_ the Amp monkey-patching hooks.

    Wraps cast logic around whitelist/blacklist torch functions and tensor
    methods, enables type promotion on multi-arg ops, special-cases RNNs,
    and traps banned functions.  Returns the created handle (a NoOpHandle
    when ``enabled`` is False) and publishes it via the module global and
    ``_amp_state`` for the decorator forms above.
    """
    global _DECORATOR_HANDLE
    if not enabled:
        handle = NoOpHandle()
        _DECORATOR_HANDLE = handle
        return handle
    handle = AmpHandle(loss_scale, enable_caching, verbose)
    # 0) Force-{fp16, fp32} for user-annotated functions
    for mod, fn, cast_fn in _USER_CAST_REGISTRY:
        try_caching = (cast_fn == utils.maybe_half)
        wrap.cached_cast(mod, fn, cast_fn, handle,
                         try_caching, verbose)
    _USER_CAST_REGISTRY.clear()
    # 0.5) Force-promote for user-annotated functions
    for mod, fn in _USER_PROMOTE_REGISTRY:
        wrap.promote(mod, fn, handle, verbose)
    _USER_PROMOTE_REGISTRY.clear()
    # 1) Force-{fp16, fp32} on white- / black-list functions
    override_modules = [functional_overrides,
                        torch_overrides,
                        tensor_overrides]
    cast_table = [('FP16_FUNCS', utils.maybe_half),
                  ('FP32_FUNCS', utils.maybe_float)]
    for module, (list_name, cast_fn) in itertools.product(override_modules,
                                                          cast_table):
        for fn in getattr(module, list_name):
            try_caching = (cast_fn == utils.maybe_half)
            wrap.cached_cast(module.MODULE, fn, cast_fn, handle,
                             try_caching, verbose)
    # 1.5) Pre-0.4, put the blacklist methods on HalfTensor and whitelist
    #      methods on FloatTensor, since they're distinct types.
    if compat.tensor_is_float_tensor():
        for fn in tensor_overrides.FP16_FUNCS:
            wrap.cached_cast(torch.cuda.FloatTensor, fn, utils.maybe_half,
                             handle, try_caching=True, verbose=verbose)
        for fn in tensor_overrides.FP32_FUNCS:
            wrap.cached_cast(torch.cuda.HalfTensor, fn, utils.maybe_float,
                             handle, try_caching=False, verbose=verbose)
    # 2) Enable type-promotion on multi-arg functions and methods.
    #    NB: special handling for sequence fns (e.g. `torch.cat`).
    promote_modules = [torch_overrides, tensor_overrides]
    promote_table = [('CASTS', wrap.promote),
                     ('SEQUENCE_CASTS', wrap.sequence_promote)]
    for promote_mod, (list_name, promote_fn) in itertools.product(promote_modules,
                                                                  promote_table):
        for fn in getattr(promote_mod, list_name):
            promote_fn(promote_mod.MODULE, fn, handle, verbose)
    # 2.5) Pre-0.4, add blacklist methods directly to HalfTensor and FloatTensor types
    if compat.tensor_is_float_tensor():
        for cls, (list_name, promote_fn) in itertools.product([torch.cuda.FloatTensor,
                                                               torch.cuda.HalfTensor],
                                                              promote_table):
            for fn in getattr(tensor_overrides, list_name):
                promote_fn(cls, fn, handle, verbose)
    # 3) For any in-place version of a blacklist function, error if any input is fp16.
    #    NB: this is overly conservative.
    for fn in utils.as_inplace(torch_overrides.FP32_FUNCS):
        wrap.err_if_any_half(torch_overrides.MODULE, fn, handle)
    # 3.5) For any in-place blacklist method, error if called on fp16 tensor
    for fn in utils.as_inplace(tensor_overrides.FP32_FUNCS):
        wrap.err_if_arg0_half(tensor_overrides.MODULE, fn, handle, verbose)
        if compat.tensor_is_float_tensor():
            wrap.err_if_arg0_half(torch.cuda.HalfTensor, fn, handle, verbose)
    # 4) For other in-place methods, match the type of self tensor
    for fn in utils.as_inplace(itertools.chain(
            tensor_overrides.FP16_FUNCS,
            tensor_overrides.CASTS)):
        wrap.promote_match_arg0(tensor_overrides.MODULE, fn, handle, verbose)
        if compat.tensor_is_float_tensor():
            wrap.promote_match_arg0(torch.cuda.HalfTensor, fn, handle, verbose)
            wrap.promote_match_arg0(torch.cuda.FloatTensor, fn, handle, verbose)
    # 5) RNNs + RNN cells are whitelisted specially
    if rnn_compat.has_old_rnns():
        wrap.rnn_cast(torch.nn.backends.thnn.backend, 'RNN', handle, verbose)
    if not rnn_compat.has_old_rnns():
        # Patch in our own indirection of `_VF` in modules/rnn s.t. it is mutable.
        torch.nn.modules.rnn._VF = rnn_compat.VariableFunctionsShim()
        # Wrap all the rnns
        for x in rnn_compat.RNN_NAMES:
            wrap.new_rnn_cast(x.upper(), handle, verbose)
        # Wrap all the RNN cells
        rnn_compat.whitelist_rnn_cells(handle, verbose)
    # 6) Place error+print message on banned functions.
    #    Or, if allow_banned, then cast to FP32.
    for fn, err_msg in functional_overrides.BANNED_FUNCS:
        if allow_banned:
            wrap.cached_cast(functional_overrides.MODULE, fn, utils.maybe_float,
                             handle, try_caching=True, verbose=verbose)
        else:
            wrap.err_if_any_half(functional_overrides.MODULE, fn, handle, err_msg)
    _DECORATOR_HANDLE = handle
    _amp_state.handle = handle
    return handle
================================================
FILE: KoSentenceT5/apex/amp/compat.py
================================================
import torch
# True for post-0.4, when Variables/Tensors merged.
def variable_is_tensor():
    """True on post-0.4 torch, where Variables and Tensors are merged."""
    return isinstance(torch.autograd.Variable(), torch.Tensor)
def tensor_is_variable():
    """True only on pre-0.4 torch, where Tensor construction yields a Variable."""
    return type(torch.Tensor()) == torch.autograd.Variable
# False for post-0.4
def tensor_is_float_tensor():
    """True on pre-0.4 torch, where torch.Tensor() builds a torch.FloatTensor."""
    return type(torch.Tensor()) == torch.FloatTensor
# Akin to `torch.is_tensor`, but returns True for Variable
# objects in pre-0.4.
def is_tensor_like(x):
    """Akin to torch.is_tensor, but also True for pre-0.4 Variable objects."""
    if torch.is_tensor(x):
        return True
    return isinstance(x, torch.autograd.Variable)
# Wraps `torch.is_floating_point` if present, otherwise checks
# the suffix of `x.type()`.
def is_floating_point(x):
    """Wrap torch.is_floating_point when available; else inspect x.type()'s suffix."""
    if hasattr(torch, 'is_floating_point'):
        return torch.is_floating_point(x)
    try:
        # Pre-0.4 fallback: classify by the tensor type-name suffix.
        torch_type = x.type()
        return torch_type.endswith(('FloatTensor', 'HalfTensor', 'DoubleTensor'))
    except AttributeError:
        return False
def scalar_python_val(x):
    """Extract a Python scalar from a tensor/Variable across torch versions."""
    if hasattr(x, 'item'):
        # Post-0.4: .item() handles 0-dim tensors directly.
        return x.item()
    if isinstance(x, torch.autograd.Variable):
        return x.data[0]
    return x[0]
# Accounts for the possibility that some ops may be removed from a namespace.
def filter_attrs(module, attrs):
    """Return the subset of attrs that module actually defines.

    Accounts for the possibility that some ops may be removed from a
    namespace across torch versions.
    """
    # Idiom fix: list comprehension instead of list(generator) (ruff C400).
    return [attrname for attrname in attrs if hasattr(module, attrname)]
================================================
FILE: KoSentenceT5/apex/amp/frontend.py
================================================
import torch
from ._initialize import _initialize
from ._amp_state import _amp_state, warn_or_err, maybe_print
from collections import OrderedDict
class Properties(object):
    """
    This class has two purposes: to establish a set of default properties,
    and to route setting of these attributes through __setattr__ so that (in theory)
    they can be checked for consistency with other existing args.
    """
    def __init__(self):
        # Defaults correspond to "Amp disabled"; the opt-level callables
        # below overwrite these via attribute assignment.
        self.options = {
            "enabled" : False,
            "opt_level" : None,
            "cast_model_type" : None,
            "patch_torch_functions" : False,
            "keep_batchnorm_fp32" : None,
            "master_weights" : None,
            "loss_scale" : 1.0,
            # Reserved for future functionality
            # "fused_optimizer" : False,
            # "enable_ddp_interop" : False,
        }

    """
    This function allows updating several options at a time without routing through
    __setattr__ checks, to avoid "you can't get there from here" scenarios.
    Currently not intended to be exposed; users are expected to select an opt_level
    and apply consistent modifications.
    """
    def _update_options_dict(self, new_options):
        # NOTE(review): iterates (key, value) pairs, so a plain dict argument
        # would unpack keys only -- callers presumably pass dict.items(); confirm.
        for k, v in new_options:
            if k in self.options:
                self.options[k] = v
            else:
                raise ValueError("Tried to set unexpected option {}".format(k))

    """
    The members of "options" are not direct attributes of self, so access attempts
    will roll down to __getattr__. This borrows from the logic in torch.nn.Module.
    """
    def __getattr__(self, name):
        if "options" in self.__dict__:
            options = self.__dict__["options"]
            if name in options:
                return options[name]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, name))

    def __setattr__(self, name, value):
        # Route known option names into the options dict with opt-level
        # consistency checks; anything else (including ``options`` itself
        # during __init__) becomes a normal instance attribute.
        if "options" in self.__dict__:
            if name in self.options:
                # print("setting {} {}".format(name, value))
                if name == "cast_model_type":
                    if self.opt_level == "O1" and value is not None:
                        if value is not False:
                            if value is not torch.float32:
                                warn_or_err("O1 inserts casts around Torch functions rather than "
                                            "model weights, so with O1, the model weights themselves "
                                            "should remain FP32. If you wish to cast the model to a "
                                            "different type, use opt_level='O2' or 'O3'. " +
                                            "cast_model_type was {}".format(value))
                    self.options[name] = value
                elif name == "patch_torch_functions":
                    if self.opt_level != "O1" and value:
                        warn_or_err("Currently, patch_torch_functions=True should only be set by "
                                    "selecting opt_level='O1'.")
                    self.options[name] = value
                elif name == "keep_batchnorm_fp32":
                    if self.opt_level == "O1" and value is not None:
                        warn_or_err("With opt_level O1, batchnorm functions are automatically patched "
                                    "to run in FP32, so keep_batchnorm_fp32 should be None." +
                                    " keep_batchnorm_fp32 was {}".format(value))
                    # Accept the strings "True"/"False" (e.g. from CLI args)
                    # as well as real booleans or None.
                    if value == "False":
                        self.options[name] = False
                    elif value == "True":
                        self.options[name] = True
                    else:
                        assert (value is True or value is False or value is None),\
                            "keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\
                            "or None, found keep_batchnorm_fp32={}".format(value)
                        self.options[name] = value
                elif name == "master_weights":
                    if self.opt_level == "O1" and value is not None:
                        warn_or_err("It doesn't make sense to use master_weights with O1. "
                                    "With O1, your model weights themselves should be FP32.")
                    self.options[name] = value
                elif name == "loss_scale":
                    if value == "dynamic":
                        self.options[name] = value
                    else:
                        self.options[name] = float(value)
                else:
                    self.options[name] = value
        else:
            super(Properties, self).__setattr__(name, value)
""" O0-O3 are convenience wrappers to establish defaults for typically used mixed precision options. """
class O3:
    """Configurator for the O3 (pure FP16) optimization level."""
    brief = "O3: Pure FP16 training."
    more = "Calls .half() on your model, converting the entire model to FP16.\n"\
        "A casting operation is also inserted to cast incoming Tensors to FP16,\n"\
        "so you don't need to change your data pipeline.\n"\
        "This mode is useful for establishing a performance ceiling.\n"\
        "It's also possible training may 'just work' in this mode.\n"\
        "If not, try other optimization levels."

    def __call__(self, properties):
        # opt_level is set right after enabled so that Properties'
        # __setattr__ consistency checks can consult it.
        for option, value in (("enabled", True),
                              ("opt_level", "O3"),
                              ("cast_model_type", torch.float16),
                              ("patch_torch_functions", False),
                              ("keep_batchnorm_fp32", False),
                              ("master_weights", False),
                              ("loss_scale", 1.0)):
            setattr(properties, option, value)
        return properties  # modified in place so this isn't really necessary
class O2:
    """Configurator for the O2 (FP16 + FP32 master weights) optimization level."""
    brief = "O2: FP16 training with FP32 batchnorm and FP32 master weights.\n"
    more = "Calls .half() on your model, converting the entire model (except for batchnorms)\n"\
        "to FP16. Batchnorms are retained in FP32 for additional stability.\n"\
        "The forward pass is patched to cast incoming Tensors to FP16, so you don't need to change\n"\
        "your data pipeline.\n"\
        "O2 creates FP32 master weights outside the model and patches any optimizers to update\n"\
        "these master weights, then copy the master weights into the FP16 model weights.\n"\
        "Master weights can also improve convergence and stability."

    def __call__(self, properties):
        # opt_level is set right after enabled so that Properties'
        # __setattr__ consistency checks can consult it.
        for option, value in (("enabled", True),
                              ("opt_level", "O2"),
                              ("cast_model_type", torch.float16),
                              ("patch_torch_functions", False),
                              ("keep_batchnorm_fp32", True),
                              ("master_weights", True),
                              ("loss_scale", "dynamic")):
            setattr(properties, option, value)
        return properties  # modified in place so this isn't really necessary
class O1:
    """Configurator for the O1 (patched torch functions) optimization level."""
    brief = "O1: Insert automatic casts around Pytorch functions and Tensor methods.\n"
    more = "The type of your model's weights is not altered. However, internally,\n"\
        "Pytorch functions are patched to cast any Tensor Core-friendly ops to FP16 for speed,\n"\
        "while operations that might benefit from the additional stability of FP32 are patched\n"\
        "to cast their inputs to fp32.\n"\
        "O1 is the safest way to try mixed precision training, and is recommended when\n"\
        "trying mixed precision training for the first time."

    def __call__(self, properties):
        # opt_level is set right after enabled so that Properties'
        # __setattr__ consistency checks can consult it.
        for option, value in (("enabled", True),
                              ("opt_level", "O1"),
                              ("cast_model_type", None),
                              ("patch_torch_functions", True),
                              ("keep_batchnorm_fp32", None),
                              ("master_weights", None),
                              ("loss_scale", "dynamic")):
            setattr(properties, option, value)
        return properties  # modified in place so this isn't really necessary
class O0:
    """Configurator for the O0 (pure FP32, Amp effectively off) optimization level."""
    brief = "O0: Pure FP32 training.\n"
    more = "Your models are checked to make sure parameters are FP32, but otherwise the\n"\
        "types of weights and internal Pytorch operations are not altered. This mode disables any\n"\
        "FP16 arithmetic, although other optimizations like DDP interop may still be requested.\n"

    def __call__(self, properties):
        # opt_level is set right after enabled so that Properties'
        # __setattr__ consistency checks can consult it.
        for option, value in (("enabled", True),
                              ("opt_level", "O0"),
                              ("cast_model_type", torch.float32),
                              ("patch_torch_functions", False),
                              ("keep_batchnorm_fp32", None),
                              ("master_weights", False),
                              ("loss_scale", 1.0)):
            setattr(properties, option, value)
        return properties  # modified in place so this isn't really necessary
# Lookup table mapping each opt_level string to its configurator instance.
opt_levels = {"O0": O0(),
              "O1": O1(),
              "O2": O2(),
              "O3": O3()}
# allow user to directly pass Properties struct as well?
def initialize(
    models,
    optimizers=None,
    enabled=True,
    opt_level="O1",
    cast_model_type=None,
    patch_torch_functions=None,
    keep_batchnorm_fp32=None,
    master_weights=None,
    loss_scale=None,
    cast_model_outputs=None,
    num_losses=1,
    verbosity=1,
    min_loss_scale=None,
    max_loss_scale=2.**24
    ):
    """
    Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
    chosen ``opt_level`` and overridden properties, if any.

    ``amp.initialize`` should be called **after** you have finished
    constructing your model(s) and
    optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper.
    See `Distributed training`_ in the Imagenet example.

    Currently, ``amp.initialize`` should only be called **once**,
    although it can process an arbitrary number of
    models and optimizers (see the corresponding `Advanced Amp Usage topic`_).
    If you think your use case requires ``amp.initialize`` to be called more than once,
    `let us know`_.

    Any property keyword argument that is not ``None`` will be interpreted as a manual override.

    To prevent having to rewrite anything else in your script, name the returned models/optimizers
    to replace the passed models/optimizers, as in the code sample below.

    Args:
        models (torch.nn.Module or list of torch.nn.Modules):  Models to modify/cast.
        optimizers (optional, torch.optim.Optimizer or list of torch.optim.Optimizers):  Optimizers to modify/cast.
            REQUIRED for training, optional for inference.
        enabled (bool, optional, default=True):  If False, renders all Amp calls no-ops, so your script
            should run as if Amp were not present.
        opt_level (str, optional, default="O1"):  Pure or mixed precision optimization level.  Accepted values are
            "O0", "O1", "O2", and "O3", explained in detail above.
        cast_model_type (``torch.dtype``, optional, default=None):  Optional property override, see
            above.
        patch_torch_functions (bool, optional, default=None):  Optional property override.
        keep_batchnorm_fp32 (bool or str, optional, default=None):  Optional property override.  If
            passed as a string, must be the string "True" or "False".
        master_weights (bool, optional, default=None):  Optional property override.
        loss_scale (float or str, optional, default=None):  Optional property override.  If passed as a string,
            must be a string representing a number, e.g., "128.0", or the string "dynamic".
        cast_model_outputs (torch.dtype, optional, default=None):  Option to ensure that the outputs
            of your model(s) are always cast to a particular type regardless of ``opt_level``.
        num_losses (int, optional, default=1):  Option to tell Amp in advance how many losses/backward
            passes you plan to use.  When used in conjunction with the ``loss_id`` argument to
            ``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass,
            which can improve stability.  See "Multiple models/optimizers/losses"
            under `Advanced Amp Usage`_ for examples.  If ``num_losses`` is left to 1, Amp will still
            support multiple losses/backward passes, but use a single global loss scale
            for all of them.
        verbosity (int, default=1):  Set to 0 to suppress Amp-related output.
        min_loss_scale (float, default=None):  Sets a floor for the loss scale values that can be chosen by dynamic
            loss scaling.  The default value of None means that no floor is imposed.
            If dynamic loss scaling is not used, `min_loss_scale` is ignored.
        max_loss_scale (float, default=2.**24):  Sets a ceiling for the loss scale values that can be chosen by
            dynamic loss scaling.  If dynamic loss scaling is not used, `max_loss_scale` is ignored.

    Returns:
        Model(s) and optimizer(s) modified according to the ``opt_level``.
        If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
        also be a list.

    Permissible invocations::

        model, optim = amp.initialize(model, optim,...)
        model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
        [model1, model2], optim = amp.initialize([model1, model2], optim,...)
        [model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)

        # This is not an exhaustive list of the cross product of options that are possible,
        # just a set of examples.
        model, optim = amp.initialize(model, optim, opt_level="O0")
        model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")

        model, optim = amp.initialize(model, optim, opt_level="O1") # uses "loss_scale="dynamic" default
        model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")

        model, optim = amp.initialize(model, optim, opt_level="O2") # uses "loss_scale="dynamic" default
        model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
        model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")

        model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
        model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
        model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")

    The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.

    .. _`Distributed training`:
        https://github.com/NVIDIA/apex/tree/master/examples/imagenet#distributed-training

    .. _`Imagenet example`:
        https://github.com/NVIDIA/apex/tree/master/examples/imagenet

    .. _`Advanced Amp Usage`:
        https://nvidia.github.io/apex/advanced.html

    .. _`Advanced Amp Usage topic`:
        https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses

    .. _`let us know`:
        https://github.com/NVIDIA/apex/issues
    """
    _amp_state.opt_properties = Properties()
    _amp_state.verbosity = verbosity
    # Disabled Amp: hand the inputs back untouched so the user's script runs
    # exactly as if Amp were absent.
    if not enabled:
        if optimizers is None:
            return models
        else:
            return models, optimizers
    if not torch.backends.cudnn.enabled:
        raise RuntimeError(
            "Amp requires torch.backends.cudnn.enabled = True")
    if opt_level not in opt_levels:
        raise RuntimeError(
            "Unexpected optimization level {}. ".format(opt_level) +
            "Options are 'O0', 'O1', 'O2', 'O3'.  Note that in `O0`, `O1`, etc., the prefix O is the letter O, " +
            "not the number zero.")
    else:
        # Start from the selected level's defaults, then layer user overrides
        # (any non-None keyword) on top below.
        _amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties)
        maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True)
        maybe_print("Defaults for this optimization level are:", True)
        for k, v in _amp_state.opt_properties.options.items():
            maybe_print("{:22} : {}".format(k, v), True)
    _amp_state.min_loss_scale = min_loss_scale
    _amp_state.max_loss_scale = max_loss_scale
    maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
    # I chose to have the keyword arguments listed directly in the argument list,
    # instead of **kwargs, so I can't use kwargs.items() here.
    # NOTE: `enabled`/`opt_level` can no longer be None at this point, so these
    # first two checks always apply (harmless re-assignment of the defaults).
    if enabled is not None:
        _amp_state.opt_properties.enabled = enabled
    if opt_level is not None:
        _amp_state.opt_properties.opt_level = opt_level
    if cast_model_type is not None:
        _amp_state.opt_properties.cast_model_type = cast_model_type
    if patch_torch_functions is not None:
        _amp_state.opt_properties.patch_torch_functions = patch_torch_functions
    if keep_batchnorm_fp32 is not None:
        _amp_state.opt_properties.keep_batchnorm_fp32 = keep_batchnorm_fp32
    if master_weights is not None:
        _amp_state.opt_properties.master_weights = master_weights
    if loss_scale is not None:
        _amp_state.opt_properties.loss_scale = loss_scale
    maybe_print("After processing overrides, optimization options are:", True)
    for k, v in _amp_state.opt_properties.options.items():
        maybe_print("{:22} : {}".format(k, v), True)
    # _initialize does the real work: casting models, wrapping optimizers,
    # patching torch functions, and creating the per-loss scalers.
    return _initialize(models, optimizers, _amp_state.opt_properties, num_losses, cast_model_outputs)
def state_dict(destination=None):
    """Serialize every Amp loss scaler's state into ``destination``.

    Creates a fresh ``OrderedDict`` when ``destination`` is None; keys are
    ``'loss_scaler0'``, ``'loss_scaler1'``, ... in scaler order.
    """
    out = OrderedDict() if destination is None else destination
    for idx, scaler in enumerate(_amp_state.loss_scalers):
        entry = {
            'loss_scale': scaler.loss_scale(),
            'unskipped': scaler._unskipped,
        }
        out['loss_scaler%d' % idx] = entry
    return out
def load_state_dict(state_dict):
    """Restore loss-scaler state previously produced by ``state_dict()``.

    Extra scaler entries beyond the configured number are skipped with a
    message; keys that don't look like loss-scaler entries are collected and
    reported via ``RuntimeError`` after the valid entries have been applied.
    """
    nb_loss_scalers = len(_amp_state.loss_scalers)
    # Check if state_dict contains the same number of loss_scalers as current setup
    if len(state_dict) != nb_loss_scalers:
        print('Warning: state_dict contains {} entries, while {} loss_scalers are used'.format(
            len(state_dict), nb_loss_scalers))
    state_dict = state_dict.copy()
    unexpected_keys = []
    # Track the scaler slot manually: unexpected keys must not advance it.
    idx = 0
    for key in state_dict:
        if 'loss_scaler' not in key:
            unexpected_keys.append(key)
            continue
        if idx > nb_loss_scalers - 1:
            print('Skipping loss_scaler[{}], since num_losses was set to {}'.format(
                idx, nb_loss_scalers))
            break
        entry = state_dict[key]
        scaler = _amp_state.loss_scalers[idx]
        scaler._loss_scale = entry['loss_scale']
        scaler._unskipped = entry['unskipped']
        idx += 1
    if unexpected_keys:
        raise RuntimeError(
            'Error(s) in loading state_dict. Unexpected key(s) in state_dict: {}. '.format(
                ', '.join('"{}"'.format(k) for k in unexpected_keys)))
# TODO: is this necessary/useful?
# def check_option_consistency(enabled=True,
# opt_level=None,
# cast_model_type=None,
# patch_torch_functions=None,
# keep_batchnorm_fp32=None,
# master_weights=None,
# loss_scale=None,
# enable_ddp_interop=None,
# hard_override=False):
# """
# Utility function that enables users to quickly check if the option combination they intend
# to use is permitted. ``check_option_consistency`` does not require models or optimizers
# to be constructed, and can be called at any point in the script. ``check_option_consistency``
# is totally self-contained; it does not set any amp global state or affect anything outside
# of itself.
# """
#
# if not enabled:
# return
#
# if opt_level not in opt_levels:
# raise RuntimeError("Unexpected optimization level. Options are 'O0', 'O1', 'O2', 'O3'.")
# else:
# opt_properties = opt_levels[opt_level](Properties())
# print("Selected optimization level {}", opt_levels[opt_level].brief)
# print("Defaults for this optimization level are:")
# for k, v in opt_properties.options:
# print("{:22} : {}".format(k, v))
#
# print("Processing user overrides (additional kwargs that are not None)...")
# for k, v in kwargs:
# if k not in _amp_state.opt_properties.options:
# raise RuntimeError("Unexpected kwarg {}".format(k))
# if v is not None:
# setattr(opt_properties, k, v)
#
# print("After processing overrides, optimization options are:")
# for k, v in opt_properties.options:
# print("{:22} : {}".format(k, v))
================================================
FILE: KoSentenceT5/apex/amp/handle.py
================================================
import contextlib
import warnings
import sys
import torch
from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params, maybe_print
if torch.distributed.is_available():
from ..parallel.LARC import LARC
# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
@contextlib.contextmanager
def scale_loss(loss,
               optimizers,
               loss_id=0,
               model=None,
               delay_unscale=False,
               delay_overflow_check=False):
    """
    On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
    ``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

    On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
    and unscaled, so that ``optimizer.step()`` can be called.

    .. note::
        If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
        can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
        any FP16 gradients are copied to FP32 master gradients before being unscaled.
        ``optimizer.step()`` will then apply the unscaled master gradients to the master params.

    .. warning::
        If Amp is using explicit FP32 master params, only the FP32 master gradients will be
        unscaled.  The direct ``.grad`` attributes of any FP16
        model params will remain scaled after context manager exit.
        This subtlety affects gradient clipping.  See "Gradient clipping" under
        `Advanced Amp Usage`_ for best practices.

    Args:
        loss(Tensor):  Typically a scalar Tensor. The ``scaled_loss`` that the context
            manager yields is simply ``loss.float()*loss_scale``, so in principle
            ``loss`` could have more than one element, as long as you call
            ``backward()`` on ``scaled_loss`` appropriately within the context manager body.
        optimizers:  All optimizer(s) for which the current backward pass is creating gradients.
            Must be an optimizer or list of optimizers returned from an earlier call
            to ``amp.initialize``.  For example use with multiple optimizers, see
            "Multiple models/optimizers/losses" under `Advanced Amp Usage`_.
        loss_id(int, optional, default=0):  When used in conjunction with the ``num_losses`` argument
            to ``amp.initialize``, enables Amp to use a different loss scale per loss.  ``loss_id``
            must be an integer between 0 and ``num_losses`` that tells Amp which loss is
            being used for the current backward pass.  See "Multiple models/optimizers/losses"
            under `Advanced Amp Usage`_ for examples.  If ``loss_id`` is left unspecified, Amp
            will use the default global loss scaler for this backward pass.
        model(torch.nn.Module, optional, default=None):  Currently unused, reserved to enable future
            optimizations.
        delay_unscale(bool, optional, default=False):  ``delay_unscale`` is never necessary, and
            the default value of ``False`` is strongly recommended.
            If ``True``, Amp will not unscale the gradients or perform model->master
            gradient copies on context manager exit.
            ``delay_unscale=True`` is a minor ninja performance optimization and can result
            in weird gotchas (especially with multiple models/optimizers/losses),
            so only use it if you know what you're doing.
            "Gradient accumulation across iterations" under `Advanced Amp Usage`_
            illustrates a situation where this CAN (but does not need to) be used.

    .. warning::
        If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be
        called yet after context manager exit, and must wait for another, later backward context
        manager invocation with ``delay_unscale`` left to False.

    .. _`Advanced Amp Usage`:
        https://nvidia.github.io/apex/advanced.html
    """
    if not hasattr(_amp_state, "opt_properties"):
        raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized.  "
                           "model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called "
                           "before `with amp.scale_loss`.")
    # Amp disabled: pass the loss through untouched.
    if not _amp_state.opt_properties.enabled:
        yield loss
        return
    # Normalize to a list; LARC is only importable when torch.distributed is
    # available, hence the globals() check.
    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
        optimizers = [optimizers]
    loss_scaler = _amp_state.loss_scalers[loss_id]
    loss_scale = loss_scaler.loss_scale()
    # Fast path: static scale of 1.0 with no master weights means scaling and
    # unscaling would both be no-ops.
    if ((not _amp_state.opt_properties.master_weights)
        and (not loss_scaler.dynamic)
        and loss_scale == 1.0):
        yield loss.float()
        # Needing to drop the cache here as well is an ugly gotcha.
        # But for now I think it's necessary to short-circuit.
        # Probably ok to skip this if not delay_unscale
        if _amp_state.opt_properties.patch_torch_functions:
            _amp_state.handle._clear_cache()
        return
    # Prepare each optimizer's stash (e.g. zeroing master grads) before the
    # user's backward() runs, unless grads from a prior pass are being kept.
    if not delay_unscale:
        if isinstance(optimizers, list):
            for optimizer in optimizers:
                if not optimizer._amp_stash.params_have_scaled_gradients:
                    optimizer._prepare_amp_backward()
    yield (loss.float())*loss_scale
    if delay_unscale:
        # Leave gradients scaled; a later scale_loss invocation will unscale.
        for optimizer in optimizers:
            optimizer._amp_stash.params_have_scaled_gradients = True
    else:
        # FusedSGD may take care of unscaling as part of their step() methods.
        # if not isinstance(optimizers, FP16_Optimizer_for_fused):
        loss_scaler.clear_overflow_state()
        for optimizer in optimizers:
            optimizer._post_amp_backward(loss_scaler)
            optimizer._amp_stash.params_have_scaled_gradients = False
        # For future fused optimizers that enable sync-free dynamic loss scaling,
        # should_skip will always be False.
        should_skip = False if delay_overflow_check else loss_scaler.update_scale()
        if should_skip:
            # Overflow detected: temporarily replace each optimizer's step()
            # with a no-op that restores the original step on first call.
            for optimizer in optimizers:
                if not optimizer._amp_stash.already_patched:
                    # Close on loss_scaler and loss_id as well, to be safe.  Probably not
                    # necessary because amp.scale_loss is already creating a temporary scope.
                    def patch_step(opt, loss_scaler, loss_id):
                        opt_step = opt.step
                        def skip_step(closure=None):
                            if closure is not None:
                                raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
                            maybe_print(("Gradient overflow.  Skipping step, loss scaler " +
                                         "{} reducing loss scale to {}").format(loss_id,
                                         loss_scaler.loss_scale()))
                            # TODO:  I don't like the special casing for different optimizer implementations.
                            # Maybe skip should delegate to a method owned by the optimizers themselves.
                            if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
                                # Clear the master grads that wouldn't be zeroed by model.zero_grad()
                                for param in opt._amp_stash.all_fp32_from_fp16_params:
                                    param.grad = None
                            if hasattr(opt, "most_recent_scale"):
                                opt.most_recent_scale = 1.0
                                opt.scale_set_by_backward = False
                            # Un-patch: the skip applies to exactly one step.
                            opt.step = opt_step
                            opt._amp_stash.already_patched = False
                        return skip_step
                    optimizer.step = patch_step(optimizer, loss_scaler, loss_id)
                    optimizer._amp_stash.already_patched = True
    # Probably ok to skip this if not delay_unscale
    if _amp_state.opt_properties.patch_torch_functions:
        _amp_state.handle._clear_cache()
# Free function version of AmpHandle.disable_casts, another step on the
# path to removing the concept of "AmpHandle"
@contextlib.contextmanager
def disable_casts():
    """Context manager that temporarily disables Amp's automatic casting.

    Fix: the re-enable now lives in a ``finally`` clause so that an exception
    raised inside the ``with`` body no longer leaves casting permanently off.
    """
    _amp_state.handle._is_active = False
    try:
        yield
    finally:
        _amp_state.handle._is_active = True
class AmpHandle(object):
    """Legacy handle object holding Amp's cast cache, activity flag, and the
    default loss scaler.  Retained for backward compatibility; new code goes
    through the free functions in this module instead.
    """

    def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False):
        self._enable_caching = enable_caching
        self._verbose = verbose
        self._cache = dict()
        self._default_scaler = LossScaler(loss_scale)
        self._is_active = True
        self._all_wrappers = []

    def is_active(self):
        """Return True while automatic casting is enabled."""
        return self._is_active

    @contextlib.contextmanager
    def _disable_casts(self):
        """Temporarily disable casting for the duration of the ``with`` body.

        Fix: restoration now happens in ``finally``, so an exception inside
        the body no longer leaves casting permanently disabled.
        """
        self._is_active = False
        try:
            yield
        finally:
            self._is_active = True

    def wrap_optimizer(self, optimizer, num_loss=1):
        # Opting into per-optimizer wrapping disables the handle-level scaler.
        self._default_scaler = None
        return OptimWrapper(optimizer, self, num_loss)

    @contextlib.contextmanager
    def scale_loss(self, loss, optimizer):
        """Old-API entry point; always raises.  (The legacy implementation
        that used to follow the raise was unreachable and has been removed.)
        """
        raise RuntimeError("The old Amp API is no longer supported. Please move to the new API, "
                           "documented here: https://nvidia.github.io/apex/amp.html. Transition guide: "
                           "https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users")
        # Unreached: keeps this function a generator so the error still
        # surfaces on __enter__ (as before) rather than at call time.
        yield

    def _clear_cache(self):
        self._cache.clear()

    # Experimental support for saving / restoring uncasted versions of functions
    def _save_func(self, mod, fn, func):
        self._all_wrappers.append((mod, fn, func))

    def _deactivate(self):
        # Restore every patched function to its saved original.
        for mod, fn, func in self._all_wrappers:
            utils.set_func(mod, fn, func)
        self._all_wrappers = []

    @property
    def has_cache(self):
        return self._enable_caching

    @property
    def cache(self):
        return self._cache

    def remove_cache(self, param):
        """Drop ``param``'s cached cast, if caching is on and one exists."""
        if self.has_cache and param in self.cache:
            del self.cache[param]

    @property
    def verbose(self):
        return self._verbose
class NoOpHandle(object):
    """Inert drop-in for ``AmpHandle`` used when Amp is disabled: never
    active, never caches, and scales nothing.
    """

    def is_active(self):
        """Casting is always off for the no-op handle."""
        return False

    @contextlib.contextmanager
    def _disable_casts(self):
        """Nothing to disable; simply run the body."""
        yield

    @contextlib.contextmanager
    def scale_loss(self, loss, optimizer):
        """Yield the loss unscaled."""
        yield loss

    def wrap_optimizer(self, optimizer, num_loss=1):
        """Still wrap, so the caller gets a uniform interface."""
        return OptimWrapper(optimizer, self, num_loss)

    @property
    def has_cache(self):
        return False

    @property
    def verbose(self):
        return False

    def _clear_cache(self):
        pass

    def _deactivate(self):
        pass
================================================
FILE: KoSentenceT5/apex/amp/lists/__init__.py
================================================
================================================
FILE: KoSentenceT5/apex/amp/lists/functional_overrides.py
================================================
# TODO: think about the following two. They do weird things.
# - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
# - torch.nn.utils.weight_norm
# Notes:
# F.instance_norm uses batch_norm internally. Which correctly handles
# fp16 in/out with fp32 weights. So we shouldn't do anything for
# either of these.
# F.normalize calls `input.norm()` internally, so it's redundant, but
# kept here in case impl. changes.
# F.cosine_similarity is same: calls `x.norm()` internally.
import torch.nn.functional
MODULE = torch.nn.functional
FP16_FUNCS = [
'conv1d',
'conv2d',
'conv3d',
'conv_transpose1d',
'conv_transpose2d',
'conv_transpose3d',
'conv_tbc', # Undocumented / maybe new?
'linear',
]
FP32_FUNCS = [
# Interpolation/Upsampling TODO: Remove for 1.2
'interpolate',
'grid_sample',
# Pointwise
'softplus',
'softmin',
'log_softmax',
'softmax',
'gelu',
# Normalization
'layer_norm',
'group_norm',
'local_response_norm',
'normalize',
'cosine_similarity',
# Loss functions
# TODO: which of these can be fp16?
'poisson_nll_loss',
'cosine_embedding_loss',
'cross_entropy',
'hinge_embedding_loss',
'kl_div',
'l1_loss',
'mse_loss',
'margin_ranking_loss',
'multilabel_margin_loss',
'multilabel_soft_margin_loss',
'multi_margin_loss',
'nll_loss',
'binary_cross_entropy_with_logits',
'smooth_l1_loss',
'soft_margin_loss',
'triplet_margin_loss',
'ctc_loss'
]
BANNED_FUNCS = [
('binary_cross_entropy',
("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
"It requires that the output of the previous function be already a FloatTensor. \n\n"
"Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
" torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
"that is compatible with amp.\nAnother option is to add\n"
" amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
"If you _really_ know what you are doing, you can disable this warning by passing "
"allow_banned=True to `amp.init()`."))
]
================================================
FILE: KoSentenceT5/apex/amp/lists/tensor_overrides.py
================================================
from .. import compat
from . import torch_overrides
import importlib
import torch
# if compat.variable_is_tensor() and not compat.tensor_is_variable():
MODULE = torch.Tensor
# else:
#     MODULE = torch.autograd.Variable

# Dunder methods that run in FP16; filter_attrs drops names this torch
# version doesn't define on Tensor.
FP16_FUNCS = compat.filter_attrs(MODULE, [
    '__matmul__',
])

# Methods cast to FP32 for stability before execution.
FP32_FUNCS = compat.filter_attrs(MODULE, [
    '__ipow__',
    '__pow__',
    '__rpow__',

    # Cast to fp32 before transfer to CPU
    'cpu',
])

# Binary operators whose two operands are promoted to a common (widest) type.
CASTS = compat.filter_attrs(MODULE, [
    '__add__',
    '__div__',
    '__eq__',
    '__ge__',
    '__gt__',
    '__iadd__',
    '__idiv__',
    '__imul__',
    '__isub__',
    '__itruediv__',
    '__le__',
    '__lt__',
    '__mul__',
    '__ne__',
    '__radd__',
    '__rdiv__',
    '__rmul__',
    '__rsub__',
    '__rtruediv__',
    '__sub__',
    '__truediv__',
])

# None of these, but here to make code cleaner.
SEQUENCE_CASTS = []

# We need to grab all the methods from torch_overrides and add them to
# the Tensor lists as well, as almost all methods are duplicated
# between `torch` and `torch.Tensor` (and check with `hasattr`,
# because a few random ones aren't defined on Tensor)
_self_mod = importlib.import_module(__name__)
for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
    lst = getattr(_self_mod, attrname)
    for fn in getattr(torch_overrides, attrname):
        if hasattr(MODULE, fn):
            lst.append(fn)
================================================
FILE: KoSentenceT5/apex/amp/lists/torch_overrides.py
================================================
import torch
from .. import utils
# All names below are attributes of the top-level torch namespace.
MODULE = torch

FP16_FUNCS = [
    # Low level functions wrapped by torch.nn layers.
    # The wrapper layers contain the weights which are then passed in as a parameter
    # to these functions.
    'conv1d',
    'conv2d',
    'conv3d',
    'conv_transpose1d',
    'conv_transpose2d',
    'conv_transpose3d',
    'conv_tbc',
    'prelu',

    # BLAS
    'addmm',
    'addmv',
    'addr',
    'matmul',
    'mm',
    'mv',
]

# Ops whose inputs are cast to FP32 (reductions and transcendental pointwise
# ops are prone to FP16 overflow / precision loss).
FP32_FUNCS = [
    # Pointwise
    'acos',
    'asin',
    'cosh',
    'erfinv',
    'exp',
    'expm1',
    'log',
    'log10',
    'log2',
    'reciprocal',
    'rsqrt',
    'sinh',
    'tan',

    # Other math
    'pow',

    # Reduction
    'cumprod',
    'cumsum',
    'dist',
    # 'mean',
    'norm',
    'prod',
    'std',
    'sum',
    'var',

    # Misc
    'renorm'
]
# Parse "major.minor" from torch.__version__ for the feature gate below.
# `version_num` is kept as a float for backward compatibility with any
# external readers, but float comparison is ambiguous for two-digit minors
# (float("1.10") == 1.1), so the ordering test uses an integer tuple.
version_strings = torch.__version__.split('.')
version_major = version_strings[0]
version_minor = version_strings[1]
version_num = float(version_major + "." + version_minor)
# Before torch 1.1, mean must be blacklisted.
if (int(version_major), int(version_minor)) < (1, 1):
    FP32_FUNCS.append('mean')
# Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We
# check the CUDA version -- if at least 9.1, then put the bmm
# functions on the fp16 list. Otherwise, put them on the fp32 list.
_bmms = ['addbmm',
         'baddbmm',
         'bmm']

# Only gate on CUDA version when CUDA is actually available; CPU-only builds
# leave the bmm functions off both lists.
if utils.is_cuda_enabled():
    # workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802
    if utils.get_cuda_version() >= (9, 1, 0):
        FP16_FUNCS.extend(_bmms)
    else:
        FP32_FUNCS.extend(_bmms)
# Multi-tensor fns that may need type promotion: both operands are cast to
# the wider of the two dtypes before the call.
CASTS = [
    # Multi-tensor math
    'addcdiv',
    'addcmul',
    'atan2',
    'cross',
    'bilinear',
    'dot',

    # Element-wise _or_ tensor-wise math
    'add',
    'div',
    'mul',

    # Comparison
    'eq',
    'equal',
    'ge',
    'gt',
    'le',
    'lt',
    'ne'
]

# Functions that take sequence arguments. We need to inspect the whole
# sequence and cast to the widest type.
SEQUENCE_CASTS = [
    'cat',
    'stack'
]
================================================
FILE: KoSentenceT5/apex/amp/opt.py
================================================
import contextlib
import warnings
from .scaler import LossScaler, master_params
from ._amp_state import maybe_print
import numpy as np
class OptimWrapper(object):
    """Old-API optimizer wrapper: maintains one dynamic ``LossScaler`` per
    loss, scales each loss on the way in, unscales gradients on the way out,
    and skips ``step()`` for the iteration when any loss overflowed.
    """

    def __init__(self, optimizer, amp_handle, num_loss):
        self._optimizer = optimizer
        self._amp_handle = amp_handle
        self._num_loss = num_loss
        self._loss_idx = 0
        self._skip_next = [False] * num_loss
        self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)]

    @contextlib.contextmanager
    def scale_loss(self, loss):
        """Yield ``loss * loss_scale`` for the current loss slot, then unscale
        the resulting gradients and record whether this slot overflowed.
        """
        if not self._amp_handle.is_active():
            yield loss
            return

        # When there are multiple losses per-optimizer, we need
        # to save out current grad accumulation, since we won't be
        # able to unscale this particular loss once the grads are
        # all mixed together.
        cached_grads = []
        if self._loss_idx > 0:
            for p in master_params(self._optimizer):
                if p.grad is not None:
                    cached_grads.append(p.grad.data.detach().clone())
                else:
                    cached_grads.append(None)
            self._optimizer.zero_grad()

        loss_scale = self._cur_loss_scaler().loss_scale()
        yield loss * loss_scale

        self._cur_loss_scaler().clear_overflow_state()
        self._cur_loss_scaler().unscale(
            master_params(self._optimizer),
            master_params(self._optimizer),
            loss_scale)
        self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
        self._loss_idx += 1

        # Merge the stashed accumulation back into the freshly unscaled grads.
        if len(cached_grads) > 0:
            for p, cached_grad in zip(master_params(self._optimizer),
                                      cached_grads):
                if cached_grad is not None:
                    p.grad.data.add_(cached_grad)
            cached_grads = []

    def _cur_loss_scaler(self):
        assert 0 <= self._loss_idx < self._num_loss
        return self._loss_scaler[self._loss_idx]

    def step(self, closure=None):
        """Step the wrapped optimizer unless any loss overflowed this
        iteration, in which case the update is skipped once.
        """
        if not self._amp_handle.is_active():
            return self._optimizer.step(closure=closure)

        self._loss_idx = 0

        for group in self._optimizer.param_groups:
            for p in group['params']:
                self._amp_handle.remove_cache(p)

        if closure is not None:
            raise NotImplementedError(
                'The `closure` argument is unsupported by the amp ' +
                'optimizer wrapper.')
        if any(self._skip_next):
            maybe_print('Gradient overflow, skipping update')
            self._skip_next = [False] * self._num_loss
        else:
            return self._optimizer.step(closure=closure)

    # Forward any attribute lookups
    def __getattr__(self, attr):
        return getattr(self._optimizer, attr)

    # Forward all torch.optim.Optimizer methods
    def __getstate__(self):
        return self._optimizer.__getstate__()

    def __setstate__(self, state):
        # Bug fix: pickle invokes __setstate__(state); the previous signature
        # took no argument and made unpickling raise TypeError.
        return self._optimizer.__setstate__(state)

    def __repr__(self):
        return self._optimizer.__repr__()

    def state_dict(self):
        return self._optimizer.state_dict()

    def load_state_dict(self, state_dict):
        return self._optimizer.load_state_dict(state_dict)

    def zero_grad(self):
        return self._optimizer.zero_grad()

    def add_param_group(self, param_group):
        return self._optimizer.add_param_group(param_group)
================================================
FILE: KoSentenceT5/apex/amp/rnn_compat.py
================================================
from . import utils, wrap
import torch
# Handle to torch's private C-level variable-function namespace; the wrappers
# generated below delegate to it at call time.
_VF = torch._C._VariableFunctions

# Base names of the RNN primitives that need cast-wrapping; the "_cell"
# variants are derived from these.
RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
def _gen_VF_wrapper(name):
def wrapper(*args, **kwargs):
return getattr(_VF, name)(*args, **kwargs)
return wrapper
# Some python magic to generate an object that has the rnn cell functions
# defined on it, all of which call into corresponding _VF version.
# Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF"
# imported at module scope within torch.nn.modules.rnn). This should
# not affect third-party importers of _VF.py.
class VariableFunctionsShim(object):
    """Object exposing the RNN entry points of ``_VF`` (both the sequence
    form and the ``_cell`` form), each as a passthrough wrapper.  Intended to
    replace the module-scope ``_VF`` reference inside ``torch.nn.modules.rnn``
    without affecting other importers of the real ``_VF``.
    """
    def __init__(self):
        for base in RNN_NAMES:
            for fn_name in (base, base + '_cell'):
                setattr(self, fn_name, _gen_VF_wrapper(fn_name))
def has_old_rnns():
    """Return True when this torch still exposes the legacy fused RNN cells
    under ``torch.nn.backends.thnn``, False otherwise.

    Fix: the previous bare ``except:`` also swallowed ``KeyboardInterrupt``
    and ``SystemExit``; a broken attribute chain is the only expected failure,
    so catch ``AttributeError`` specifically.
    """
    try:
        torch.nn.backends.thnn.backend.LSTMCell
        return True
    except AttributeError:
        return False
def whitelist_rnn_cells(handle, verbose):
    """Patch torch's RNN cell entry points so their inputs are cast to half
    precision (with caching), covering both the legacy THNN backend layout
    and the modern ``_VF`` layout.

    Args:
        handle: the active Amp handle, consulted by the installed casts.
        verbose: forwarded to the wrappers for per-cast logging.
    """
    # Different module + function names in old/new RNN cases
    if has_old_rnns():
        fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
        mod = torch.nn.backends.thnn.backend
    else:
        fn_names = [x + '_cell' for x in RNN_NAMES]
        mod = torch.nn.modules.rnn._VF
        # The shim must already have been installed by Amp's init path.
        assert isinstance(mod, VariableFunctionsShim)

    # Insert casts on cell functions
    for fn in fn_names:
        wrap.cached_cast(mod, fn, utils.maybe_half, handle,
                         try_caching=True, verbose=verbose)

    if has_old_rnns():
        # Special handling of `backward` for fused gru / lstm:
        # The `backward` method calls Tensor.sum() (blacklist) internally,
        # and then the resulting grad_input has the wrong type.
        # TODO: where else is this a problem?
        for rnn_type in ['GRUFused', 'LSTMFused']:
            mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type)
            wrap.disable_casts(mod, 'backward', handle)
================================================
FILE: KoSentenceT5/apex/amp/scaler.py
================================================
import torch
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state, master_params, maybe_print
from itertools import product
def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
    """Python fallback for the fused unscale kernel.

    Optionally scans ``model_grad`` for inf/NaN (returning True on overflow
    without touching ``master_grad``), then copies the model gradient into the
    master gradient and multiplies it by ``scale``.
    """
    # Exception handling for 18.04 compatibility
    if check_overflow:
        total = float(model_grad.float().sum())
        # NaN is the only value that compares unequal to itself.
        is_nan = total != total
        if is_nan or total == float('inf') or total == -float('inf'):
            return True

    if master_grad is not model_grad: # copy_ probably internally short-circuits this
        master_grad.copy_(model_grad)
    if scale != 1.0:
        master_grad.mul_(scale)
    return False
def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
    """Python fallback for the fused axpby kernel: computes
    ``master_grad = a * model_grad + b * stashed_grad`` (in the master dtype),
    optionally returning True first if ``model_grad`` contains inf/NaN.
    """
    # Exception handling for 18.04 compatibility
    if check_overflow:
        total = float(model_grad.float().sum())
        is_nan = total != total
        if is_nan or total == float('inf') or total == -float('inf'):
            return True

    # if master_grad is not model_grad: # copy_ probably internally short-circuits this
    #     master_grad.copy_(model_grad)
    assert stashed_grad.dtype == master_grad.dtype
    converted = model_grad.data.to(master_grad.dtype)
    master_grad.data = a * converted.data + b * stashed_grad.data
    return False
class LossScaler(object):
    """Tracks the loss scale (fixed or dynamic) used by amp.

    With dynamic scaling, the scale is halved whenever an overflow is seen
    and doubled (capped at max_loss_scale) after `scale_window` consecutive
    non-skipped steps.
    """
    # Class-level flags so each warning prints at most once per process.
    warned_no_fused_kernel = False
    warned_unscaling_non_fp32_grad = False
    # True when the fused amp_C multi-tensor kernels were importable.
    has_fused_kernel = False
    def __init__(self,
                 loss_scale,
                 init_scale=2.**16,
                 scale_factor=2.,
                 scale_window=2000,
                 min_loss_scale=None,
                 max_loss_scale=2.**24):
        # loss_scale == "dynamic" enables automatic adjustment; any other
        # value is used as a fixed scale.
        if loss_scale == "dynamic":
            self.dynamic = True
            self._loss_scale = min(max_loss_scale, init_scale)
        else:
            self.dynamic = False
            self._loss_scale = loss_scale
        self._max_loss_scale = max_loss_scale
        self._min_loss_scale = min_loss_scale
        self._scale_seq_len = scale_window
        self._unskipped = 0
        self._has_overflow = False
        # Device-side overflow flag written by the fused kernels.
        self._overflow_buf = torch.cuda.IntTensor([0])
        if multi_tensor_applier.available:
            import amp_C
            LossScaler.has_fused_kernel = multi_tensor_applier.available
            LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
            LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby
        else:
            if not LossScaler.warned_no_fused_kernel:
                maybe_print(
                    "Warning: multi_tensor_applier fused unscale kernel is unavailable, "
                    "possibly because apex was installed without --cuda_ext --cpp_ext. "
                    "Using Python fallback. Original ImportError was: " +
                    repr(multi_tensor_applier.import_err),
                    True)
            LossScaler.has_fused_kernel = False
            LossScaler.warned_no_fused_kernel = True
    def loss_scale(self):
        # Accessor for the current scale (a plain method, not a property).
        return self._loss_scale
    def unscale_python(self, model_grads, master_grads, scale):
        # Python fallback: copy each model grad into the matching master grad
        # and divide by `scale`, checking for inf/NaN when dynamic.
        for model, master in zip(model_grads, master_grads):
            if model is not None:
                if not LossScaler.warned_unscaling_non_fp32_grad:
                    if master.dtype != torch.float32:
                        maybe_print(
                            "Attempting to unscale a grad with type {} ".format(master.type()) +
                            "Unscaling non-fp32 grads may indicate an error. "
                            "When using Amp, you don't need to call .half() on your model.")
                        LossScaler.warned_unscaling_non_fp32_grad = True
                self._has_overflow = scale_check_overflow_python(model,
                                                                 master,
                                                                 1./scale,
                                                                 self.dynamic)
                # Stop early: once one grad overflowed the whole step is skipped.
                if self._has_overflow and self.dynamic:
                    break
    # unused_scale keeps some of the old API alive for hopefully a short time.
    def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None):
        """Write model_grads/scale into master_grads (fused kernel or Python fallback)."""
        if self._has_overflow:
            return
        scale = self._loss_scale
        if scale_override is not None:
            scale = scale_override
        # Nothing to do: grads are already the master grads and no scaling applied.
        if scale == 1.0 and models_are_masters and not self.dynamic:
            return
        if LossScaler.has_fused_kernel:
            # if (not LossScaler.warned_unscaling_non_fp32_grad
            #     and master_grads[0].dtype == torch.float16):
            #     print("Warning: unscaling grads that are not FP32. "
            #           "Unscaling non-fp32 grads may indicate an error. "
            #           "When using Amp, you don't need to call .half() on your model.")
            #     # Setting this to True unconditionally allows the possibility of an escape
            #     # if never-before-seen non-fp32 grads are created in some later iteration.
            #     LossScaler.warned_unscaling_non_fp32_grad = True
            multi_tensor_applier(LossScaler.multi_tensor_scale_cuda,
                                 self._overflow_buf,
                                 [model_grads, master_grads],
                                 1./scale)
        else:
            self.unscale_python(model_grads, master_grads, scale)
        # Defer to update_scale
        # If the fused kernel is available, we only need one D2H memcopy and sync.
        # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
        #     self._has_overflow = self._overflow_buf.item()
    def unscale_with_stashed_python(self,
                                    model_grads,
                                    stashed_master_grads,
                                    master_grads,
                                    a,
                                    b):
        # Python fallback for master = a*model + b*stashed (axpby).
        for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
            if model is None and stashed is None:
                continue
            else:
                if not LossScaler.warned_unscaling_non_fp32_grad:
                    if master.dtype != torch.float32:
                        maybe_print(
                            "Attempting to unscale a grad with type {} ".format(master.type()) +
                            "Unscaling non-fp32 grads may indicate an error. "
                            "When using Amp, you don't need to call .half() on your model.")
                        LossScaler.warned_unscaling_non_fp32_grad = True
                self._has_overflow = axpby_check_overflow_python(model,
                                                                 stashed,
                                                                 master,
                                                                 a,
                                                                 b,
                                                                 self.dynamic)
                if self._has_overflow and self.dynamic:
                    break
    def unscale_with_stashed(self,
                             model_grads,
                             stashed_master_grads,
                             master_grads,
                             scale_override=None):
        """Accumulate unscaled model grads onto stashed grads into master_grads."""
        if self._has_overflow:
            return
        # scale_override, when given, is a (grads, stashed, out) scale triple.
        grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0
        if scale_override is not None:
            grads_have_scale, stashed_have_scale, out_scale = scale_override
        if LossScaler.has_fused_kernel:
            if (not LossScaler.warned_unscaling_non_fp32_grad
                    and master_grads[0].dtype == torch.float16):
                print("Warning: unscaling grads that are not FP32. "
                      "Unscaling non-fp32 grads may indicate an error. "
                      "When using Amp, you don't need to call .half() on your model.")
                # Setting this to True unconditionally allows the possibility of an escape
                # if never-before-seen non-fp32 grads are created in some later iteration.
                LossScaler.warned_unscaling_non_fp32_grad = True
            multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
                                 self._overflow_buf,
                                 [model_grads, stashed_master_grads, master_grads],
                                 out_scale/grads_have_scale, # 1./scale,
                                 out_scale/stashed_have_scale, # 1.0,
                                 0) # check only arg 0, aka the incoming model grads, for infs
        else:
            self.unscale_with_stashed_python(model_grads,
                                             stashed_master_grads,
                                             master_grads,
                                             out_scale/grads_have_scale,
                                             out_scale/stashed_have_scale)
        # Defer to update_scale
        # If the fused kernel is available, we only need one D2H memcopy and sync.
        # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
        #     self._has_overflow = self._overflow_buf.item()
    def clear_overflow_state(self):
        # Reset both the host-side flag and the device-side buffer.
        self._has_overflow = False
        if self.has_fused_kernel:
            self._overflow_buf.zero_()
    # Separate so unscale() can be called more that once before updating.
    def update_scale(self):
        """Adjust the dynamic loss scale; returns True if this step should be skipped."""
        # If the fused kernel is available, we only need one D2H memcopy and sync.
        if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
            self._has_overflow = self._overflow_buf.item()
        if self._has_overflow and self.dynamic:
            should_skip = True
            if(self._min_loss_scale):
                self._loss_scale = max(self._min_loss_scale, self._loss_scale/2.)
            else:
                self._loss_scale = self._loss_scale/2.
            self._unskipped = 0
        else:
            should_skip = False
            self._unskipped += 1
        # After scale_window clean steps, try a larger scale again.
        if self._unskipped == self._scale_seq_len and self.dynamic:
            self._loss_scale = min(self._max_loss_scale, self._loss_scale*2.)
            self._unskipped = 0
        return should_skip
================================================
FILE: KoSentenceT5/apex/amp/utils.py
================================================
from . import compat
import functools
import itertools
import torch
def is_cuda_enabled():
    """Return True when this torch build was compiled with CUDA support."""
    cuda_version = torch.version.cuda
    return cuda_version is not None
def get_cuda_version():
    """Return torch's CUDA version as a tuple of ints, e.g. (11, 1).

    Assumes a CUDA build (torch.version.cuda is a string).
    """
    parts = torch.version.cuda.split('.')
    return tuple(int(part) for part in parts)
def is_fp_tensor(x):
    """True if x is a floating-point tensor, or a (possibly nested) list/tuple of them."""
    if is_nested(x):
        # all() with a generator short-circuits on the first non-fp element.
        return all(is_fp_tensor(y) for y in x)
    return compat.is_tensor_like(x) and compat.is_floating_point(x)
def is_nested(x):
    """True for the container types (list/tuple) that amp recurses into."""
    return isinstance(x, (tuple, list))
def should_cache(x):
    """True if x (or every element of a nested list/tuple) is an fp32 Parameter.

    Only fp32 Parameters are worth memoizing in the amp cast cache.
    """
    if is_nested(x):
        return all(should_cache(y) for y in x)
    if not isinstance(x, torch.nn.parameter.Parameter):
        return False
    return type_string(x) == 'FloatTensor'
def collect_fp_tensor_types(args, kwargs):
    """Gather the set of type strings of every fp tensor found in args/kwargs."""
    def _gather(x, acc):
        # Recurse into nested lists/tuples; record leaf tensor types.
        if is_nested(x):
            for item in x:
                _gather(item, acc)
        else:
            acc.add(type_string(x))
    found = set()
    for value in itertools.chain(args, kwargs.values()):
        if is_fp_tensor(value):
            _gather(value, found)
    return found
def type_string(x):
    """Last component of x.type(), e.g. 'torch.cuda.FloatTensor' -> 'FloatTensor'."""
    return x.type().rsplit('.', 1)[-1]
def maybe_half(x, name='', verbose=False):
    """Cast a CUDA float tensor (or nested container of them) to half.

    CPU tensors and tensors that are already half pass through unchanged.
    """
    if is_nested(x):
        return type(x)([maybe_half(y) for y in x])
    if not x.is_cuda or type_string(x) == 'HalfTensor':
        return x
    if verbose:
        print('Float->Half ({})'.format(name))
    return x.half()
def maybe_float(x, name='', verbose=False):
    """Cast a CUDA half tensor (or nested container of them) to float.

    CPU tensors and tensors that are already float pass through unchanged.
    """
    if is_nested(x):
        return type(x)([maybe_float(y) for y in x])
    if not x.is_cuda or type_string(x) == 'FloatTensor':
        return x
    if verbose:
        print('Half->Float ({})'.format(name))
    return x.float()
def casted_args(cast_fn, args, kwargs):
    """Apply cast_fn to every fp tensor in args/kwargs.

    NB: returns the casted `args` as a new list, but mutates `kwargs` in place.
    """
    converted = [cast_fn(a) if is_fp_tensor(a) else a for a in args]
    for key, value in kwargs.items():
        if is_fp_tensor(value):
            kwargs[key] = cast_fn(value)
    return converted
def cached_cast(cast_fn, x, cache):
    """Cast x via cast_fn, memoizing results in `cache` keyed by the input tensor.

    Bug fix: the recursive call for nested lists/tuples previously dropped the
    `cast_fn` and `cache` arguments (`cached_cast(y)`), which raised TypeError
    on any nested input. The nested check is inlined (isinstance against
    list/tuple, exactly what `is_nested` does) so the fix is self-contained.
    """
    if isinstance(x, (list, tuple)):
        # Recurse, preserving the container type and forwarding cast_fn/cache.
        return type(x)([cached_cast(cast_fn, y, cache) for y in x])
    if x in cache:
        cached_x = cache[x]
        if x.requires_grad and cached_x.requires_grad:
            # Make sure x is actually cached_x's autograd parent.
            if cached_x.grad_fn.next_functions[1][0].variable is not x:
                raise RuntimeError("x and cache[x] both require grad, but x is not "
                                   "cache[x]'s parent.  This is likely an error.")
        # During eval, it's possible to end up caching casted weights with
        # requires_grad=False. On the next training iter, if cached_x is found
        # and reused from the cache, it will not actually have x as its parent.
        # Therefore, we choose to invalidate the cache (and force refreshing the cast)
        # if x.requires_grad and cached_x.requires_grad do not match.
        #
        # During eval (i.e. running under with torch.no_grad()) the invalidation
        # check would cause the cached value to be dropped every time, because
        # cached_x would always be created with requires_grad=False, while x would
        # still have requires_grad=True. This would render the cache effectively
        # useless during eval. Therefore, if we are running under the no_grad()
        # context manager (torch.is_grad_enabled=False) we elide the invalidation
        # check, and use the cached value even though its requires_grad flag doesn't
        # match. During eval, we don't care that there's no autograd-graph
        # connection between x and cached_x.
        if torch.is_grad_enabled() and x.requires_grad != cached_x.requires_grad:
            del cache[x]
        else:
            return cached_x
    casted_x = cast_fn(x)
    cache[x] = casted_x
    return casted_x
def verbosify(cast_fn, fn_name, verbose):
    """Bind name/verbose kwargs onto cast_fn when verbose logging is requested."""
    if not verbose:
        return cast_fn
    return functools.partial(cast_fn, name=fn_name, verbose=verbose)
def as_inplace(fns):
    """Yield the in-place variant name (trailing underscore) for each fn name."""
    yield from (name + '_' for name in fns)
def has_func(mod, fn):
    """Does `mod` (a dict-like namespace or a module/class) expose name fn?"""
    return fn in mod if isinstance(mod, dict) else hasattr(mod, fn)
def get_func(mod, fn):
    """Fetch fn from a dict by key, or from a module/class via getattr."""
    return mod[fn] if isinstance(mod, dict) else getattr(mod, fn)
def set_func(mod, fn, new_fn):
    """Install new_fn under name fn on a dict (by key) or object (setattr)."""
    if isinstance(mod, dict):
        mod[fn] = new_fn
        return
    setattr(mod, fn, new_fn)
def set_func_save(handle, mod, fn, new_fn):
    # Replace mod.fn with new_fn, first recording the current function on
    # `handle` (via _save_func) so the patch can be undone later.
    cur_fn = get_func(mod, fn)
    handle._save_func(mod, fn, cur_fn)
    set_func(mod, fn, new_fn)
# A couple problems get solved here:
# - The flat_weight buffer is disconnected from autograd graph,
# so the fp16 weights need to be derived from the input weights
# to this forward call, not the flat buffer.
# - The ordering of weights in the flat buffer is...idiosyncratic.
# First problem is solved with combination of set_ (to set up
# correct storage) and copy_ (so the fp16 weight derives from the
# fp32 one in autograd.
# Second is solved by doing ptr arithmetic on the fp32 weights
# to derive the correct offset.
#
# TODO: maybe this should actually use
# `torch._cudnn_rnn_flatten_weight`? But then I need to call
# on first iter and cache the right offsets. Ugh.
def synthesize_flattened_rnn_weights(fp32_weights,
                                     fp16_flat_tensor,
                                     rnn_fn='',
                                     verbose=False):
    """Build fp16 views into `fp16_flat_tensor` mirroring nested fp32 RNN weights.

    fp32_weights is a nested list (per layer, then per weight tensor). Each
    fp16 tensor is aliased (via set_) into the flat fp16 buffer at the same
    element offset its fp32 counterpart occupies in the original flat fp32
    buffer, then the fp32 values are copied in so autograd links the fp16
    weights back to the fp32 ones.
    """
    fp16_weights = []
    # Assumes all fp32 weights live in one contiguous flat buffer starting at
    # the first weight's pointer -- TODO(review): confirm for all RNN configs.
    fp32_base_ptr = fp32_weights[0][0].data_ptr()
    for layer_weights in fp32_weights:
        fp16_layer_weights = []
        for w_fp32 in layer_weights:
            w_fp16 = w_fp32.new().half()
            # Offset in elements (not bytes) of this weight inside the flat buffer.
            offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
            w_fp16.set_(fp16_flat_tensor.storage(),
                        offset,
                        w_fp32.shape)
            # copy_ (rather than another set_) so the fp16 weight derives from
            # the fp32 one in the autograd graph.
            w_fp16.copy_(w_fp32)
            if verbose:
                print('Float->Half ({})'.format(rnn_fn))
            fp16_layer_weights.append(w_fp16)
        fp16_weights.append(fp16_layer_weights)
    return fp16_weights
# Roughly same as above, just the `fp32_weights` aren't nested.
# Code kept separate for readability.
def new_synthesize_flattened_rnn_weights(fp32_weights,
                                         fp16_flat_tensor,
                                         rnn_fn='',
                                         verbose=False):
    """Flat-list variant of synthesize_flattened_rnn_weights.

    Same aliasing scheme, but `fp32_weights` is a flat list of tensors rather
    than a per-layer nested list. Code kept separate for readability.
    """
    fp16_weights = []
    fp32_base_ptr = fp32_weights[0].data_ptr()
    for w_fp32 in fp32_weights:
        w_fp16 = w_fp32.new().half()
        # Element offset of this weight relative to the start of the flat buffer.
        offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
        w_fp16.set_(fp16_flat_tensor.storage(),
                    offset,
                    w_fp32.shape)
        # copy_ links the fp16 view to the fp32 weight in autograd.
        w_fp16.copy_(w_fp32)
        if verbose:
            print('Float->Half ({})'.format(rnn_fn))
        fp16_weights.append(w_fp16)
    return fp16_weights
================================================
FILE: KoSentenceT5/apex/amp/wrap.py
================================================
from . import compat
from . import utils
from ._amp_state import _amp_state
from . import rnn_compat
import functools
import torch
def make_cast_wrapper(orig_fn, cast_fn, handle,
                      try_caching=False):
    """Wrap orig_fn so its fp tensor arguments are cast via cast_fn.

    When try_caching is set and the handle has a cache, cacheable arguments
    (fp32 Parameters) are cast through the memoizing cache instead.
    """
    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        # Pass straight through when amp is not active.
        if not handle.is_active():
            return orig_fn(*args, **kwargs)
        if try_caching and handle.has_cache:
            args = [utils.cached_cast(cast_fn, a, handle.cache)
                    if utils.should_cache(a) else a
                    for a in args]
            for key in kwargs:
                if utils.should_cache(kwargs[key]):
                    kwargs[key] = utils.cached_cast(cast_fn, kwargs[key], handle.cache)
        new_args = utils.casted_args(cast_fn, args, kwargs)
        return orig_fn(*new_args, **kwargs)
    return wrapper
def cached_cast(mod, fn, cast_fn, handle,
                try_caching=False, verbose=False):
    """Replace mod.fn with a cast-inserting wrapper; no-op if fn is absent."""
    if not utils.has_func(mod, fn):
        return
    wrapper = make_cast_wrapper(utils.get_func(mod, fn),
                                utils.verbosify(cast_fn, fn, verbose),
                                handle,
                                try_caching)
    utils.set_func_save(handle, mod, fn, wrapper)
# `handle` arg is unused, but simplifies API to make `make_cast_wrapper`
# Annoyingly, make_promote_wrapper still uses the global handle. Once everyone
# is on the new API and I am free to get rid of handle, I can clean this up.
def make_promote_wrapper(orig_fn, cast_fn, handle=None):
    """Wrap orig_fn so mixed Half/Float tensor args are promoted via cast_fn.

    `handle` is unused; the wrapper consults the global _amp_state handle.
    """
    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        if not _amp_state.handle.is_active():
            return orig_fn(*args, **kwargs)
        types = utils.collect_fp_tensor_types(args, kwargs)
        if len(types) <= 1:
            # Zero or one fp type present: nothing to promote.
            return orig_fn(*args, **kwargs)
        if types == {'HalfTensor', 'FloatTensor'}:
            promoted = utils.casted_args(cast_fn, args, kwargs)
            return orig_fn(*promoted, **kwargs)
        raise NotImplementedError('Do not know how to handle ' +
                                  'these types to promote: {}'
                                  .format(types))
    return wrapper
def promote(mod, fn, handle, verbose=False):
    """Patch mod.fn so mixed Half/Float arguments are promoted to float."""
    orig_fn = utils.get_func(mod, fn)
    wrapper = make_promote_wrapper(orig_fn,
                                   utils.verbosify(utils.maybe_float, fn, verbose))
    utils.set_func_save(handle, mod, fn, wrapper)
def sequence_promote(mod, fn, handle, verbose=False):
    """Patch a sequence op (e.g. cat/stack) so mixed Half/Float inputs become float."""
    orig_fn = utils.get_func(mod, fn)
    maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)
    @functools.wraps(orig_fn)
    def wrapper(seq, *args, **kwargs):
        if not _amp_state.handle.is_active():
            return orig_fn(seq, *args, **kwargs)
        seq_types = {utils.type_string(x) for x in seq}
        if len(seq_types) <= 1:
            # Homogeneous sequence: nothing to promote.
            return orig_fn(seq, *args, **kwargs)
        if seq_types == {'HalfTensor', 'FloatTensor'}:
            promoted = utils.casted_args(maybe_float, seq, {})
            return orig_fn(promoted, *args, **kwargs)
        # TODO: other mixed-type cases aren't due to amp.
        # Just pass through?
        return orig_fn(seq, *args, **kwargs)
    utils.set_func_save(handle, mod, fn, wrapper)
def promote_match_arg0(mod, fn, handle, verbose=False):
    """Patch mod.fn so the remaining args are cast to match arg0's fp type."""
    if not utils.has_func(mod, fn):
        return
    orig_fn = utils.get_func(mod, fn)
    @functools.wraps(orig_fn)
    def wrapper(arg0, *args, **kwargs):
        assert compat.is_tensor_like(arg0)
        if not _amp_state.handle.is_active():
            return orig_fn(arg0, *args, **kwargs)
        arg0_type = utils.type_string(arg0)
        if arg0_type == 'HalfTensor':
            cast_fn = utils.maybe_half
        elif arg0_type == 'FloatTensor':
            cast_fn = utils.maybe_float
        else:
            # Non-fp arg0: leave everything alone.
            return orig_fn(arg0, *args, **kwargs)
        cast_fn = utils.verbosify(cast_fn, fn, verbose)
        new_args = utils.casted_args(cast_fn, args, kwargs)
        return orig_fn(arg0, *new_args, **kwargs)
    utils.set_func_save(handle, mod, fn, wrapper)
def err_if_any_half(mod, fn, handle, custom_err_msg=None):
    """Patch mod.fn to raise NotImplementedError if any fp16 tensor is passed."""
    if not utils.has_func(mod, fn):
        return
    orig_fn = utils.get_func(mod, fn)
    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        types = utils.collect_fp_tensor_types(args, kwargs)
        if 'HalfTensor' not in types:
            return orig_fn(*args, **kwargs)
        if custom_err_msg:
            raise NotImplementedError(custom_err_msg)
        raise NotImplementedError('Cannot call in-place function ' +
                                  '{} with fp16 arguments.'.format(fn))
    utils.set_func_save(handle, mod, fn, wrapper)
def err_if_arg0_half(mod, fn, handle, verbose=False):
    """Patch an in-place method to reject fp16 self and float the remaining args."""
    if not utils.has_func(mod, fn):
        return
    orig_fn = utils.get_func(mod, fn)
    @functools.wraps(orig_fn)
    def wrapper(arg0, *args, **kwargs):
        assert compat.is_tensor_like(arg0)
        if utils.type_string(arg0) == 'HalfTensor':
            raise NotImplementedError('Cannot call in-place method ' +
                                      '{} on fp16 Tensors.'.format(fn))
        cast_fn = utils.verbosify(utils.maybe_float, fn, verbose)
        new_args = utils.casted_args(cast_fn, args, kwargs)
        return orig_fn(arg0, *new_args, **kwargs)
    utils.set_func_save(handle, mod, fn, wrapper)
# Current RNN approach:
# - Wrap top-level `RNN` function in thnn backend
# - Will call into either CudnnRNN or AutogradRNN
# - Each of these are factory functions that return a per-iter
# `forward` function
# - We interpose on the factory function to:
# 1) Interpose on the actual forward function and put in casts
# 2) Insert an fp16 `flat_weight` if necessary
def rnn_cast(backend, fn, handle, verbose=False):
    """Patch a thnn-backend RNN factory so its per-iteration forward casts to fp16.

    `backend.fn` is a factory returning a `forward` callable. The patch wraps
    both levels: the factory (to substitute an fp16 flat weight buffer) and
    the returned forward (to cast inputs/weights/hiddens to half).
    """
    orig_rnn = utils.get_func(backend, fn)
    @functools.wraps(orig_rnn)
    def rnn_wrapper(*args, **kwargs):
        flat_weight = kwargs.get('flat_weight')
        if flat_weight is not None:
            # We replace `flat_weight` with an uninitialized fp16
            # Tensor. The "actual" weight tensors (provided in `forward`),
            # will then be set up as ptrs into the buffer and have the
            # corresponding fp32 values copied in.
            # We need to call `copy` on the "actual" weights so that the
            # autograd graph correctly backprops from the wgrads computed
            # inside cuDNN (on fp16 weights) into the fp32 weights.
            assert utils.type_string(flat_weight) == 'FloatTensor'
            if compat.tensor_is_float_tensor() or compat.tensor_is_variable():
                # Pre-0.4. A little slower, since it zeros out memory.
                flat_weight_fp16 = flat_weight.new().half().resize_(flat_weight.shape)
            else:
                flat_weight_fp16 = torch.empty_like(flat_weight,
                                                    dtype=torch.float16)
            kwargs['flat_weight'] = flat_weight_fp16
        else:
            flat_weight_fp16 = None
        forward = orig_rnn(*args, **kwargs)
        @functools.wraps(forward)
        def fwd_wrapper(*fargs, **fkwargs):
            # 3 positional args pre-0.4; 4 (with batch_sizes) afterwards.
            assert len(fargs) == 3 or len(fargs) == 4
            inputs, weights, hiddens = fargs[:3]
            assert utils.is_fp_tensor(inputs)
            assert isinstance(weights, list)
            cast_fn = utils.verbosify(utils.maybe_half,
                                      fn,
                                      verbose)
            new_args = []
            # 0) Inputs
            new_args.append(cast_fn(inputs))
            # 1) Weights
            if flat_weight_fp16 is not None:
                # Alias fp16 views into the flat buffer at matching offsets.
                fp16_weights = utils.synthesize_flattened_rnn_weights(
                    weights, flat_weight_fp16, fn, verbose)
            else:
                fp16_weights = [[cast_fn(w) for w in layer]
                                for layer in weights]
            new_args.append(fp16_weights)
            # 2) Inputs: either a tuple (for LSTM) or single tensor
            if isinstance(hiddens, tuple):
                new_args.append(tuple(cast_fn(x) for x in hiddens))
            elif utils.is_fp_tensor(hiddens):
                new_args.append(cast_fn(hiddens))
            else:
                # Hiddens can, in principle, be `None` -- pass through
                new_args.append(hiddens)
            # 3) Batch sizes (0.4 or later only)
            if len(fargs) == 4:
                new_args.append(fargs[3])
            return forward(*new_args, **fkwargs)
        return fwd_wrapper
    utils.set_func_save(handle, backend, fn, rnn_wrapper)
def new_rnn_cast(fn, handle, verbose=False):
    """Patch a post-#15744 RNN backend entry point to run in fp16.

    Casts the input/hidden tensors to half and synthesizes a flat fp16 weight
    buffer from the (flat) fp32 weight list before calling the original.
    """
    # Forward+backward compatibility around https://github.com/pytorch/pytorch/pull/15744
    # For rnn backend calls that route through _rnn_impls, we must patch the ref
    # that _rnn_impls stashed. For rnn backend calls that directly invoke
    # _VF.<backend>, e.g. _VF.lstm, we can patch onto VariableFunctionsShim,
    # which in turn has patched the ref named "_VF" in torch.nn.modules.rnn.
    if utils.has_func(torch.nn.modules.rnn._rnn_impls, fn):
        mod = torch.nn.modules.rnn._rnn_impls
    else:
        mod = torch.nn.modules.rnn._VF
        assert isinstance(mod, rnn_compat.VariableFunctionsShim)
        fn = fn.lower()
    orig_fn = utils.get_func(mod, fn)
    cast_fn = utils.verbosify(utils.maybe_half, fn, verbose)
    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        # Exact call signature from modules/rnn.py
        assert len(args) == 9
        assert len(kwargs) == 0
        if not _amp_state.handle.is_active():
            return orig_fn(*args, **kwargs)
        # args[6] is a bool (training flag) only in the non-PackedSequence
        # calling convention; that distinguishes where the weight list sits.
        if isinstance(args[6], bool):
            params_idx = 2 # Not PackedSequence case
        else:
            params_idx = 3 # PackedSequence case
        new_args = []
        for i, arg in enumerate(args):
            if i == params_idx:
                num_params = sum([x.numel() for x in arg])
                # Fresh fp16 buffer sized to hold every weight, on the same
                # device as the input tensor (args[0]).
                fp16_weight_buf = args[0].new_empty((num_params,),
                                                    dtype=torch.half)
                casted_weights = utils.new_synthesize_flattened_rnn_weights(
                    arg, fp16_weight_buf, fn, verbose)
                new_args.append(casted_weights)
            elif utils.is_fp_tensor(arg):
                new_args.append(cast_fn(arg))
            else:
                new_args.append(arg)
        return orig_fn(*new_args)
    utils.set_func_save(handle, mod, fn, wrapper)
def disable_casts(mod, fn, handle):
    """Patch mod.fn so the original always runs with amp's casts disabled."""
    if not utils.has_func(mod, fn):
        return
    orig_fn = utils.get_func(mod, fn)
    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        # Anything orig_fn calls internally sees casting turned off.
        with handle._disable_casts():
            return orig_fn(*args, **kwargs)
    utils.set_func_save(handle, mod, fn, wrapper)
================================================
FILE: KoSentenceT5/apex/contrib/__init__.py
================================================
================================================
FILE: KoSentenceT5/apex/contrib/bottleneck/__init__.py
================================================
from .bottleneck import Bottleneck
================================================
FILE: KoSentenceT5/apex/contrib/bottleneck/bottleneck.py
================================================
import torch
from torch import nn
import fast_bottleneck
def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
    """In-place Kaiming-uniform init (thin passthrough to nn.init).

    The weight is treated as NCHW here; init happens before any NHWC permute.
    """
    nn.init.kaiming_uniform_(tensor, a=a, mode=mode, nonlinearity=nonlinearity)
class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed
    """
    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        # Buffers (not Parameters): the affine transform and running stats
        # are frozen, so nothing here receives gradients.
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def get_scale_bias(self, nhwc=False):
        """Fold the frozen BN into a per-channel (scale, bias) pair.

        The pair is reshaped for broadcasting: channel-last when nhwc is set,
        channel-second (NCHW) otherwise.
        """
        scale = self.weight * self.running_var.rsqrt()
        bias = self.bias - self.running_mean * scale
        shape = (1, 1, 1, -1) if nhwc else (1, -1, 1, 1)
        return scale.reshape(shape), bias.reshape(shape)

    def forward(self, x):
        # Frozen BN is just an affine transform per channel.
        scale, bias = self.get_scale_bias()
        return x * scale + bias
@torch.jit.script
def drelu_dscale1(grad_o, output, scale1):
    # Backward of relu followed by a single scale: zero the gradient where
    # the forward output was non-positive, then apply the scale.
    mask = (output > 0).half()
    dx_relu = mask * grad_o
    return dx_relu * scale1, dx_relu
@torch.jit.script
def drelu_dscale2(grad_o, output, scale1, scale2):
    # Backward of relu feeding two scaled branches (main path + downsample):
    # one masked gradient, two scales.
    mask = (output > 0).half()
    dx_relu = mask * grad_o
    return dx_relu * scale1, dx_relu * scale2
class BottleneckFunction(torch.autograd.Function):
    """Autograd wrapper around the fused cuDNN bottleneck kernels in `fast_bottleneck`."""
    @staticmethod
    def forward(ctx, nhwc, stride_1x1, scale, bias, x, *conv):
        # TODO: clean up order of tensors
        # Argument layout handed to the C++ side: input, 3 conv weights,
        # 3 scales, 3 biases, then (optionally) downsample conv/scale/bias.
        args = [x, *conv[0:3], *scale[0:3], *bias[0:3]]
        ctx.downsample = len(conv) > 3
        if ctx.downsample:
            args.append(conv[3])
            args.append(scale[3])
            args.append(bias[3])
        # weight buffers are always in nhwc while shape can be nhwc or channels_last
        # here we pass in flag and let c++ handle it
        # alternatively, we can put all sizes into a fixed format and pass it in
        outputs = fast_bottleneck.forward(nhwc, stride_1x1, args)
        ctx.save_for_backward(*(args+outputs))
        # save relu outputs for drelu
        ctx.nhwc = nhwc
        ctx.stride_1x1 = stride_1x1
        # outputs[2] is the final (post-residual-relu) activation.
        return outputs[2]
    # backward relu is not exposed, MUL with mask used now
    # only support dgrad
    @staticmethod
    def backward(ctx, grad_o):
        # Last three saved tensors are the forward outputs (per-stage relu results).
        outputs = ctx.saved_tensors[-3:]
        # saved_tensors[6] / [11] are the conv3 / downsample scales saved in forward.
        if ctx.downsample:
            grad_conv3, grad_conv4 = drelu_dscale2(grad_o, outputs[2], ctx.saved_tensors[6], ctx.saved_tensors[11])
        else:
            grad_conv3, grad_conv4 = drelu_dscale1(grad_o, outputs[2], ctx.saved_tensors[6])
        # create input vector for backward
        t_list = [*ctx.saved_tensors[0:10]]
        t_list.append(grad_conv3)
        t_list.append(grad_conv4)
        # outputs used for wgrad and generating drelu mask
        t_list.append(outputs[0])
        t_list.append(outputs[1])
        # in case there is downsample
        if ctx.downsample:
            t_list.append(ctx.saved_tensors[10])
        grads = fast_bottleneck.backward(ctx.nhwc, ctx.stride_1x1, t_list)
        # The four leading None grads match the non-differentiable forward
        # args (nhwc, stride_1x1, scale, bias).
        return (None, None, None, None, *grads)
# Convenience alias matching torch.autograd.Function usage.
bottleneck_function = BottleneckFunction.apply
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes,
                     kernel_size=3,
                     stride=stride,
                     padding=dilation,
                     dilation=dilation,
                     groups=groups,
                     bias=False)
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes,
                     kernel_size=1, stride=stride, bias=False)
class Bottleneck(torch.nn.Module):
    """ResNet v1.5-style bottleneck block with an optional fused-cuDNN fast path.

    When `use_cudnn` is set, forward() folds the frozen BN layers into
    per-channel scale/bias pairs and dispatches to the fused
    `bottleneck_function`; otherwise it runs the standard conv/bn/relu
    sequence with native ops.
    """
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
    # here we put it at 1x1
    def __init__(self, in_channels, bottleneck_channels, out_channels, stride=1, groups=1,
                 dilation=1, norm_func=None, use_cudnn=False, explicit_nhwc=False):
        super(Bottleneck, self).__init__()
        if groups != 1:
            raise RuntimeError('Only support groups == 1')
        if dilation != 1:
            raise RuntimeError('Only support dilation == 1')
        # Fix: identity comparison with None (was `norm_func == None`).
        if norm_func is None:
            norm_func = FrozenBatchNorm2d
        else:
            raise RuntimeError('Only support frozen BN now.')

        # Downsample the identity path whenever the main path changes
        # resolution or channel count.
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                conv1x1(in_channels, out_channels, stride),
                norm_func(out_channels),
            )
        else:
            self.downsample = None

        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
        self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
        self.conv3 = conv1x1(bottleneck_channels, out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

        self.bn1 = norm_func(bottleneck_channels)
        self.bn2 = norm_func(bottleneck_channels)
        self.bn3 = norm_func(out_channels)

        self.use_cudnn = use_cudnn

        # setup conv weights (downsample weight last, matching the order
        # bottleneck_function expects)
        self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
        if self.downsample is not None:
            self.w_conv.append(self.downsample[0].weight)

        # init weight in nchw format before possible transpose
        for w in self.w_conv:
            kaiming_uniform_(w, a=1)

        # TODO: prevent unsupported case usage
        # support cases
        #                 native      cudnn
        # normal          yes         no
        # channel_last    yes         yes
        # explicit_nhwc   no          yes
        self.explicit_nhwc = explicit_nhwc
        if self.explicit_nhwc:
            # Physically permute every parameter's storage to NHWC.
            for p in self.parameters():
                with torch.no_grad():
                    p.data = p.data.permute(0, 2, 3, 1).contiguous()
        return

    def forward(self, x):
        if self.use_cudnn:
            # calculate scale/bias from registered buffers
            # TODO: make this better
            s1, b1 = self.bn1.get_scale_bias(self.explicit_nhwc)
            s2, b2 = self.bn2.get_scale_bias(self.explicit_nhwc)
            s3, b3 = self.bn3.get_scale_bias(self.explicit_nhwc)
            w_scale = [s1, s2, s3]
            w_bias = [b1, b2, b3]
            if self.downsample is not None:
                s4, b4 = self.downsample[1].get_scale_bias(self.explicit_nhwc)
                w_scale.append(s4)
                w_bias.append(b4)
            out = bottleneck_function(self.explicit_nhwc, self.stride, w_scale, w_bias, x, *self.w_conv)
            return out

        if self.explicit_nhwc:
            raise RuntimeError('explicit nhwc with native ops is not supported.')

        # fallback to native ops
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
================================================
FILE: KoSentenceT5/apex/contrib/bottleneck/test.py
================================================
# Self-test for the fused Bottleneck block. Requires a CUDA device and the
# compiled fast_bottleneck extension; compares the native fp16 path against
# the fused channels_last path and the explicit-NHWC path.
import torch
from bottleneck import Bottleneck
torch.manual_seed(23337)

# use True to print layerwise sum for all outputs in reference code path
DEBUG = False#True

# (stride, out_channels) configurations; in_channels is fixed at 32.
for stride, o_channel in [(1,32), (1,128), (2,32)]:
    print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel)
    a_ = torch.randn(17,32,28,28)

    a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_()
    model = Bottleneck(32,8,o_channel,stride=stride).cuda().half().to(memory_format=torch.channels_last)

    # test model: reference run through the native-op path.
    b = model(a)
    b.mean().backward()
    d_grad = a.grad.float()
    a.grad = None
    torch.cuda.synchronize()

    if DEBUG:
        print("[DEBUG] ref dx :", d_grad.sum().item())
        # print wgrad. we don't need to reset since later cpp print before accumulation
        for i, w in enumerate(model.w_conv):
            print("[DEBUG] ref wgrad{} :".format(i+1), w.grad.sum().item())

    # Stash reference weight gradients before rerunning with cudnn.
    wgrads = []
    for w in model.w_conv:
        wgrads.append(w.grad.float())

    model.use_cudnn = True
    model.zero_grad()
    c = model(a)
    c.mean().backward()

    torch.cuda.synchronize()
    print("comparing native and channels_last:")
    print("max error fprop:", (b-c).abs().max().item(), "max elem:", b.abs().max().item())
    print("max error dgrad:", (d_grad-a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
    for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)):
        print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())

    # Second comparison: explicit-NHWC model fed NHWC-permuted input.
    nhwc_a = a_.permute(0,2,3,1).contiguous().cuda().half().requires_grad_()
    nhwc_model = Bottleneck(32,8,o_channel,stride=stride,explicit_nhwc=True, use_cudnn=True).cuda().half()
    for p,q in zip(model.parameters(), nhwc_model.parameters()):
        # model's storage is already in nhwc, we clone and assign to explicit nhwc model
        q.data.copy_(p.data.permute(0,2,3,1).contiguous())
    for p,q in zip(model.buffers(), nhwc_model.buffers()):
        q.data.copy_(p.data)

    d = nhwc_model(nhwc_a)
    d.mean().backward()
    torch.cuda.synchronize()

    # reset reference to cudnn channels_last permute
    #c_s = c.storage().tolist()
    #d_s = d.storage().tolist()
    #print(max([x-y for x,y in zip(c_s,d_s)]))
    c = c.contiguous(memory_format=torch.contiguous_format).permute(0,2,3,1).contiguous()
    d_grad = a.grad.float().permute(0,2,3,1).contiguous()
    wgrads = []
    for w in model.w_conv:
        wgrads.append(w.grad.float().permute(0,2,3,1).contiguous())

    torch.cuda.synchronize()
    print("comparing nhwc and channels_last:")
    print("max error fprop:", (d-c).abs().max().item(), "max elem:", c.abs().max().item())
    print("max error dgrad:", (d_grad-nhwc_a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
    for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)):
        print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
FILE: KoSentenceT5/apex/contrib/csrc/bottleneck/bottleneck.cpp
================================================
#include <ATen/ATen.h>
#include <ATen/cudnn/Handle.h> // for getcudnnhandle
#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>
#include <cudnn_frontend.h>
#include <iostream>
// Debug logging macros: compile to no-ops unless DEBUG / DEBUG_CUDNN is defined.
#ifdef DEBUG
#define DEBUG_MSG(str) do { std::cout << str << std::endl; } while( false )
#else
#define DEBUG_MSG(str) do { } while ( false )
#endif
#ifdef DEBUG_CUDNN
#define DEBUG_CUDNN_MSG(buf, str) do { buf << str << std::endl; } while( false )
#else
#define DEBUG_CUDNN_MSG(buf, str) do { } while ( false )
#endif
// Check a cuDNN call; on failure, log and bail out of the current (void) function.
#define checkCudnnErr(...)                                                        \
    do {                                                                          \
        int err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
        if (err) {                                                                \
            return;                                                               \
        }                                                                         \
    } while (0)
// Log a cuDNN failure with its call site; returns 1 on error, 0 on success.
int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
    if (!code) {
        // CUDNN_STATUS_SUCCESS is 0: nothing to report.
        return 0;
    }
    printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
    return 1;
}
void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort = true);
#define checkCUDAError(val) { checkError((val), #val, __FILE__, __LINE__); } // in-line regular function
// Report a CUDA runtime error to stderr with the failing expression and
// location; when `abort` is set, reset the device and exit with the code.
void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort)
{
    if (code == cudaSuccess) {
        return;
    }
    fprintf(stderr, "CUDA error returned from \"%s\" at %s:%d, Error code: %d (%s)\n",
            func, file, line, code, cudaGetErrorString(code));
    if (abort) {
        cudaDeviceReset();
        exit(code);
    }
}
// Fill strideA (nbDims entries) with packed strides for dimA in the
// requested layout. For INT8x4 / INT8x32 the standard strides computed here
// are still what cuDNN expects; any resizeFactor scaling happens in the
// CPU reference, not here.
void generateStrides(const int64_t* dimA, int64_t* strideA, int nbDims, cudnnTensorFormat_t filterFormat) {
    if (filterFormat == CUDNN_TENSOR_NCHW) {
        // Innermost dim is dense; walk outward accumulating the product.
        int64_t running = 1;
        for (int64_t d = nbDims - 1; d >= 0; d--) {
            strideA[d] = running;
            running *= dimA[d];
        }
    } else {
        // Assumed CUDNN_TENSOR_NHWC: the channel dim (index 1) is dense,
        // then the spatial dims from innermost to outermost, then batch.
        strideA[1] = 1;
        int64_t running = dimA[1];
        strideA[nbDims - 1] = running;
        for (int64_t d = nbDims - 2; d >= 2; d--) {
            running *= dimA[d + 1];
            strideA[d] = running;
        }
        strideA[0] = strideA[2] * dimA[2];
    }
}
// Effective filter extent once dilation spreads the taps apart.
int getFwdConvDilatedFilterDim(int filterDim, int dilation) {
    int span = (filterDim - 1) * dilation;
    return span + 1;
}
// Image extent after symmetric zero-padding on both sides.
int getFwdConvPaddedImageDim(int tensorDim, int pad) {
    return 2 * pad + tensorDim;
}
// Standard forward-convolution output size:
//   floor((padded_input - dilated_filter) / stride) + 1
int getFwdConvOutputDim(
    int tensorDim,
    int pad,
    int filterDim,
    int stride,
    int dilation)
{
    int padded = getFwdConvPaddedImageDim(tensorDim, pad);
    int window = getFwdConvDilatedFilterDim(filterDim, dilation);
    return (padded - window) / stride + 1;
}
// Indices into the common_convbias_descriptors tuple built below.
// Entries marked "virtual" correspond to intermediate tensors that
// cudnn_frontend never materializes.
enum {
    X_TENSOR,          // conv input, id 'x'
    Y_TENSOR,          // final output, id 'y'
    W_TENSOR,          // filter weights, id 'w'
    Z_TENSOR,          // per-channel (1,C,1,1) scale, id 'z'
    B_TENSOR,          // per-channel (1,C,1,1) bias, id 'b'
    AFTERADD_TENSOR,   // virtual, id 'A': result after the scale multiply
    AFTERBIAS_TENSOR,  // virtual, id 'B': result after the bias add
    AFTERCONV_TENSOR,  // virtual, id 'C': raw convolution output
    OPTIONAL,          // optional residual input, id 'i'
    AFTEROPT_TENSOR,   // virtual, id 'D': result after the optional add
};
// (x, y, w, conv) descriptor tuple shared by plain-convolution graphs.
using common_conv_descriptors =
    std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::ConvDesc>;

// Build NHWC-strided tensor descriptors for input/output/filter plus a 2-D
// convolution descriptor (FP32 compute). All descriptors are 16-byte aligned.
common_conv_descriptors
create_common_descriptors(int64_t* x_dim_padded,
                          int64_t* padA,
                          int64_t* convstrideA,
                          int64_t* dilationA,
                          int64_t* w_dim_padded,
                          int64_t* y_dim_padded,
                          cudnnDataType_t dataType,
                          cudnnConvolutionMode_t mode) {
    const int convDim = 2;
    // Packed NHWC strides for each of the three tensors.
    int64_t strideA_padded[4];
    int64_t outstrideA_padded[4];
    int64_t filterstrideA_padded[4];
    generateStrides(w_dim_padded, filterstrideA_padded, 4, CUDNN_TENSOR_NHWC);
    generateStrides(x_dim_padded, strideA_padded, 4, CUDNN_TENSOR_NHWC);
    generateStrides(y_dim_padded, outstrideA_padded, 4, CUDNN_TENSOR_NHWC);
    return common_conv_descriptors(cudnn_frontend::TensorBuilder()
                                       .setDim(4, x_dim_padded)
                                       .setStrides(4, strideA_padded)
                                       .setId('x')
                                       .setAlignment(16)
                                       .setDataType(dataType)
                                       .build(),
                                   cudnn_frontend::TensorBuilder()
                                       .setDim(4, y_dim_padded)
                                       .setStrides(4, outstrideA_padded)
                                       .setId('y')
                                       .setAlignment(16)
                                       .setDataType(dataType)
                                       .build(),
                                   cudnn_frontend::TensorBuilder()
                                       .setDim(4, w_dim_padded)
                                       .setStrides(4, filterstrideA_padded)
                                       .setId('w')
                                       .setAlignment(16)
                                       .setDataType(dataType)
                                       .build(),
                                   // Convolution math runs in FP32 regardless of tensor dataType.
                                   cudnn_frontend::ConvDescBuilder()
                                       .setDataType(CUDNN_DATA_FLOAT)
                                       .setMathMode(mode)
                                       .setNDims(convDim)
                                       .setStrides(convDim, convstrideA)
                                       .setPrePadding(convDim, padA)
                                       .setPostPadding(convDim, padA)
                                       .setDilation(convDim, dilationA)
                                       .build());
}
// Ten-tensor tuple for the fused conv+scale+bias(+residual)+activation graph;
// indexed by the X_TENSOR..AFTEROPT_TENSOR enum above.
using common_convbias_descriptors = std::tuple<cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor>;

// Build every tensor descriptor the fused conv-scale-bias-add-activation
// graph needs. Scale/bias are broadcast per-channel (1, C, 1, 1); the
// intermediate (after-add / after-bias / after-conv / after-opt) tensors are
// virtual, i.e. never materialized in memory.
common_convbias_descriptors
create_conv_bias_add_act_descriptors(int64_t* x_dim_padded,
                                     int64_t* padA,
                                     int64_t* convstrideA,
                                     int64_t* dilationA,
                                     int64_t* w_dim_padded,
                                     int64_t* y_dim_padded,
                                     cudnnDataType_t dataType) {
    const int convDim = 2;
    // Per-channel broadcast shape for scale and bias: (1, C_out, 1, 1).
    int64_t b_dim_padded[4];
    b_dim_padded[0] = 1;
    b_dim_padded[1] = y_dim_padded[1];
    b_dim_padded[2] = 1;
    b_dim_padded[3] = 1;
    int64_t x_stride_padded[4];
    int64_t y_stride_padded[4];
    int64_t w_stride_padded[4];
    int64_t b_stride_padded[4];
    generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
    generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
    generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
    generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
    return common_convbias_descriptors(cudnn_frontend::TensorBuilder()
                                           .setDim(4, x_dim_padded)
                                           .setStrides(4, x_stride_padded)
                                           .setId('x')
                                           .setAlignment(16)
                                           .setDataType(dataType)
                                           .build(),
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, y_dim_padded)
                                           .setStrides(4, y_stride_padded)
                                           .setId('y')
                                           .setAlignment(16)
                                           .setDataType(dataType)
                                           .build(),
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, w_dim_padded)
                                           .setStrides(4, w_stride_padded)
                                           .setId('w')
                                           .setAlignment(16)
                                           .setDataType(dataType)
                                           .build(),
                                       // Z_TENSOR: per-channel scale, id 'z'.
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, b_dim_padded)
                                           .setStrides(4, b_stride_padded)
                                           .setId('z')
                                           .setAlignment(16)
                                           .setDataType(dataType)
                                           .build(),
                                       // B_TENSOR: per-channel bias, id 'b'.
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, b_dim_padded)
                                           .setStrides(4, b_stride_padded)
                                           .setId('b')
                                           .setAlignment(16)
                                           .setDataType(dataType)
                                           .build(),
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, y_dim_padded)
                                           .setStrides(4, y_stride_padded)
                                           .setVirtual()
                                           .setId('A') // after add
                                           .setAlignment(16)
                                           .setDataType(dataType)
                                           .build(),
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, y_dim_padded)
                                           .setStrides(4, y_stride_padded)
                                           .setVirtual()
                                           .setId('B') // after bias
                                           .setAlignment(16)
                                           .setDataType(dataType)
                                           .build(),
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, y_dim_padded)
                                           .setStrides(4, y_stride_padded)
                                           .setId('C') // after conv
                                           .setAlignment(16)
                                           .setVirtual()
                                           .setDataType(dataType)
                                           .build(),
                                       // OPTIONAL: residual input, id 'i' (not virtual).
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, y_dim_padded)
                                           .setStrides(4, y_stride_padded)
                                           .setId('i')
                                           .setAlignment(16)
                                           .setDataType(dataType)
                                           .build(),
                                       cudnn_frontend::TensorBuilder()
                                           .setDim(4, y_dim_padded)
                                           .setStrides(4, y_stride_padded)
                                           .setId('D') // after optional add
                                           .setAlignment(16)
                                           .setVirtual()
                                           .setDataType(dataType)
                                           .build());
}
// Indices into the dconv_descriptors tuple used for the dgrad (backward
// data) fused graph.
enum {
    X_OR_DX_TENSOR,     // input or its gradient, id 'x'
    DY_TENSOR,          // incoming output gradient, id 'y'
    W_OR_DW_TENSOR,     // filter or its gradient, id 'w'
    SCALE_TENSOR,       // per-channel (1,C,1,1) scale, id 's'
    RELU_TENSOR,        // forward activation input for drelu, id 'r'
    AFTER_DCONV_TENSOR, // virtual, id 'A': result after the dconv
    AFTER_DRELU_TENSOR, // virtual, id 'B': result after the drelu
};
// Seven-tensor tuple matching the enum above.
using dconv_descriptors = std::tuple<cudnn_frontend::Tensor,
                                     cudnn_frontend::Tensor,
                                     cudnn_frontend::Tensor,
                                     cudnn_frontend::Tensor,
                                     cudnn_frontend::Tensor,
                                     cudnn_frontend::Tensor,
                                     cudnn_frontend::Tensor>;
// Build the tensor descriptors for the dgrad fusion. The broadcast scale is
// shaped (1, C_in, 1, 1) — note it uses x's channel count, unlike the
// forward path which uses y's. Intermediates are virtual (never stored).
dconv_descriptors
create_dconv_descriptors(int64_t* x_dim_padded,
                         int64_t* padA,
                         int64_t* convstrideA,
                         int64_t* dilationA,
                         int64_t* w_dim_padded,
                         int64_t* y_dim_padded,
                         cudnnDataType_t dataType) {
    const int convDim = 2;
    // Per-channel broadcast shape keyed to the input's channel dim.
    int64_t b_dim_padded[4];
    b_dim_padded[0] = 1;
    b_dim_padded[1] = x_dim_padded[1];
    b_dim_padded[2] = 1;
    b_dim_padded[3] = 1;
    int64_t x_stride_padded[4];
    int64_t y_stride_padded[4];
    int64_t w_stride_padded[4];
    int64_t b_stride_padded[4];
    generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
    generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
    generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
    generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
    return dconv_descriptors(cudnn_frontend::TensorBuilder()
                                 .setDim(4, x_dim_padded)
                                 .setStrides(4, x_stride_padded)
                                 .setId('x')
                                 .setAlignment(16)
                                 .setDataType(dataType)
                                 .build(),
                             cudnn_frontend::TensorBuilder()
                                 .setDim(4, y_dim_padded)
                                 .setStrides(4, y_stride_padded)
                                 .setId('y')
                                 .setAlignment(16)
                                 .setDataType(dataType)
                                 .build(),
                             cudnn_frontend::TensorBuilder()
                                 .setDim(4, w_dim_padded)
                                 .setStrides(4, w_stride_padded)
                                 .setId('w')
                                 .setAlignment(16)
                                 .setDataType(dataType)
                                 .build(),
                             // SCALE_TENSOR, id 's'.
                             cudnn_frontend::TensorBuilder()
                                 .setDim(4, b_dim_padded)
                                 .setStrides(4, b_stride_padded)
                                 .setId('s')
                                 .setAlignment(16)
                                 .setDataType(dataType)
                                 .build(),
                             // RELU_TENSOR, id 'r': same shape/strides as x.
                             cudnn_frontend::TensorBuilder()
                                 .setDim(4, x_dim_padded)
                                 .setStrides(4, x_stride_padded)
                                 .setId('r')
                                 .setAlignment(16)
                                 .setDataType(dataType)
                                 .build(),
                             cudnn_frontend::TensorBuilder()
                                 .setDim(4, x_dim_padded)
                                 .setStrides(4, x_stride_padded)
                                 .setVirtual()
                                 .setId('A') // after dconv
                                 .setAlignment(16)
                                 .setDataType(dataType)
                                 .build(),
                             cudnn_frontend::TensorBuilder()
                                 .setDim(4, x_dim_padded)
                                 .setStrides(4, x_stride_padded)
                                 .setVirtual()
                                 .setId('B') // after drelu
                                 .setAlignment(16)
                                 .setDataType(dataType)
                                 .build());
}
// Process-wide cache of built execution plans, keyed by the string produced
// by getConvFusionString. Entries are never evicted.
std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache;
// TODO: better name
// Build a plan-cache key that uniquely identifies a conv fusion problem:
// the graph tag (passed in as fusion_string) followed by input dims (X),
// filter dims (W), padding (P), stride (S), dilation (D) and data type (T).
std::string getConvFusionString(int64_t* x_dim_padded,
                                int64_t* padA,
                                int64_t* convstrideA,
                                int64_t* dilationA,
                                int64_t* w_dim_padded,
                                cudnnDataType_t dataType,
                                std::string fusion_string) {
    // Append `count` tagged values, e.g. "X64X56X56X256...".
    auto append_tagged = [&fusion_string](char tag, const int64_t* vals, int count) {
        for (int i = 0; i < count; i++) {
            fusion_string += tag;
            fusion_string += std::to_string(vals[i]);
        }
    };
    append_tagged('X', x_dim_padded, 4);
    append_tagged('W', w_dim_padded, 4);
    append_tagged('P', padA, 2);
    append_tagged('S', convstrideA, 2);
    append_tagged('D', dilationA, 2);
    fusion_string += 'T';
    fusion_string += std::to_string(dataType);
    return fusion_string;
}
// Look up an execution plan for `opGraph` in the global plan_cache, building
// and inserting one on a miss. With use_heuristic (the default) the engine
// config comes from cuDNN's heuristics, retrying up to 3 times as a WAR for
// configs that fail to build; otherwise engine 0 is picked and its first
// knob bumped. Returns a reference into plan_cache (stable: no eviction).
//
// Fixes vs. original: the cudnnException is caught by reference instead of
// by value (avoids a copy and possible slicing of derived exceptions) and
// rethrown with `throw;` to preserve the original exception object.
cudnn_frontend::ExecutionPlan& getOrCreatePlan(cudnnHandle_t handle_,
                                               std::stringstream& log_buf,
                                               cudnn_frontend::OperationGraph& opGraph,
                                               std::string cache_string,
                                               bool use_heuristic = true){
    auto it = plan_cache.find(cache_string);
    if (it != plan_cache.end()) {
        DEBUG_CUDNN_MSG(log_buf, "Found plan in cache");
        return it->second;
    } else {
        if (use_heuristic){
            // TODO: confirm which mode to use
            auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
                                  .setOperationGraph(opGraph)
                                  .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
                                  .build();
            // try 3 times for now as WAR for no heuristic training
            int max_tries = 3, count = 0;
            auto& engine_configs = heuristics.getEngineConfig(max_tries);
            while(true) {
                try {
                    plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder()
                                                                   .setHandle(handle_)
                                                                   .setEngineConfig(engine_configs[count], opGraph.getTag())
                                                                   .build()));
                    break;
                } catch (cudnn_frontend::cudnnException& e) {
                    // Give up after exhausting the candidate configs.
                    if (++count == max_tries) throw;
                }
            }
        }else{
            DEBUG_CUDNN_MSG(log_buf, "No plan in cache");
            // How many engines support this operation graph ?
            auto total_engines = opGraph.getEngineCount();
            DEBUG_CUDNN_MSG(log_buf, opGraph.describe() << " has " << total_engines << " engines.");
            // We have to randomly pick one engine from [0, total_engines)
            // Selecting "0" by default
            auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build();
            DEBUG_CUDNN_MSG(log_buf, engine.describe());
            auto& knobs = engine.getSupportedKnobs();
            for (auto it = std::begin(knobs); it != std::end(knobs); ++it) {
                DEBUG_CUDNN_MSG(log_buf, it->describe());
            }
            if (knobs.begin() != knobs.end()) {
                DEBUG_CUDNN_MSG(log_buf, "Updated knob choice");
                knobs.begin()->setChoice(knobs.begin()->getMinValue() + 1);
                DEBUG_CUDNN_MSG(log_buf, knobs.begin()->describe());
            }
            // Create the requisite engine config
            auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build();
            DEBUG_CUDNN_MSG(log_buf, engine_config.describe());
            plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build()));
        }
        return plan_cache.find(cache_string)->second;
    }
}
// Run the fused graph  y = relu(conv(x) * z + b [+ i])  on half-precision
// device buffers laid out per the NHWC strides built in
// create_conv_bias_add_act_descriptors. The optional residual add is
// included only when devPtrI is non-null. Errors from cuDNN are logged
// (and, in the exception path, printed with the accumulated debug log).
void
run_conv_scale_bias_add_activation(int64_t* x_dim_padded,
                                   int64_t* pad,
                                   int64_t* convstride,
                                   int64_t* dilation,
                                   int64_t* w_dim_padded,
                                   int64_t* y_dim_padded,
                                   cudnnDataType_t dataType,
                                   at::Half* devPtrX,
                                   at::Half* devPtrW,
                                   at::Half* devPtrY,
                                   at::Half* devPtrZ,
                                   at::Half* devPtrB,
                                   at::Half* devPtrI) {
    cudnnHandle_t handle_ = torch::native::getCudnnHandle();
    std::stringstream log_buf;
    try {
        int convDim = 2;
        // Creates the necessary tensor descriptors
        common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
            x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
        DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
        DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
        DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
        DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
        DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
        DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
        DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
        DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
        DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
        // Pointwise descriptors: scale (MUL), bias (ADD), optional residual
        // (ADD), and relu — all with FP32 math precision.
        auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_MUL)
                             .setMathPrecision(CUDNN_DATA_FLOAT)
                             .build();
        DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
        // Define the bias operation
        auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_ADD)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .build();
        DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
        // optional add
        auto addDesc = cudnn_frontend::PointWiseDescBuilder()
                           .setMode(CUDNN_POINTWISE_ADD)
                           .setMathPrecision(CUDNN_DATA_FLOAT)
                           .build();
        DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
        // Define the activation operation
        auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                           .setMode(CUDNN_POINTWISE_RELU_FWD)
                           .setMathPrecision(CUDNN_DATA_FLOAT)
                           .build();
        DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
        // Define the convolution problem
        auto convDesc = cudnn_frontend::ConvDescBuilder()
                            .setDataType(CUDNN_DATA_FLOAT)
                            .setMathMode(CUDNN_CROSS_CORRELATION)
                            .setNDims(convDim)
                            .setStrides(convDim, convstride)
                            .setPrePadding(convDim, pad)
                            .setPostPadding(convDim, pad)
                            .setDilation(convDim, dilation)
                            .build();
        DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
        float alpha = 1.0f;
        float beta = 0.0f;
        // Create a convolution Node
        auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                           .setxDesc(std::get<X_TENSOR>(tensors))
                           .setwDesc(std::get<W_TENSOR>(tensors))
                           .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
                           .setcDesc(convDesc)
                           .setAlpha(alpha)
                           .setBeta(beta)
                           .build();
        DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
        // Create a Add Node with scaling parameters.
        auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(conv_op.getOutputTensor())
                            .setbDesc(std::get<Z_TENSOR>(tensors))
                            .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
                            .setpwDesc(scaleDesc)
                            .build();
        DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
        // Create a Bias Node.
        auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(scale_op.getOutputTensor())
                           .setbDesc(std::get<B_TENSOR>(tensors))
                           .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
                           .setpwDesc(biasDesc)
                           .build();
        DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
        // Create a optional add Node.
        auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                          .setxDesc(bias_op.getOutputTensor())
                          .setbDesc(std::get<OPTIONAL>(tensors))
                          .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
                          .setpwDesc(addDesc)
                          .build();
        DEBUG_CUDNN_MSG(log_buf, add_op.describe());
        // Create an Activation Node. Reads from the residual add when
        // devPtrI is supplied, otherwise straight from the bias output.
        auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                          .setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
                          .setyDesc(std::get<Y_TENSOR>(tensors))
                          .setpwDesc(actDesc)
                          .build();
        DEBUG_CUDNN_MSG(log_buf, act_op.describe());
        // Create an Operation Graph. In this case it is convolution add bias activation
        // (only the first 4 entries are used when there is no residual input;
        // slot 3 is then act_op and the trailing act_op is ignored).
        std::array<cudnn_frontend::Operation const*, 5> ops = {&conv_op, &scale_op, &bias_op, devPtrI ? &add_op : &act_op, &act_op};
        auto opGraph = cudnn_frontend::OperationGraphBuilder()
                           .setHandle(handle_)
                           .setOperationGraph(devPtrI ? ops.size() : 4, ops.data())
                           .build();
        // Create string encoding for plan caching
        auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
        DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
        auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
        DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
        auto workspace_size = plan.getWorkspaceSize();
        DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
        // Workspace allocated as floats; (size+3)/4 rounds bytes up to
        // whole 4-byte elements.
        void* workspace_ptr = nullptr;
        auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
        if (workspace_size > 0) {
            workspace_ptr = workspace_tensor.data_ptr<float>();
        }
        // Pointer/uid order must match; the trailing 'i' pair is dropped
        // when there is no residual input.
        void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
        int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i'};
        auto variantPack = cudnn_frontend::VariantPackBuilder()
                               .setWorkspacePointer(workspace_ptr)
                               .setDataPointers(devPtrI ? 6 : 5, data_ptrs)
                               .setUids(devPtrI ? 6 : 5, uids)
                               .build();
        DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
        cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
        checkCudnnErr(status);
        cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
    // NOTE(review): catches by value — prefer `catch (... const& e)`; left
    // unchanged here to keep this a documentation-only edit.
    } catch (cudnn_frontend::cudnnException e) {
        std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
    }
}
void
run_conv_scale_bias(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
at::Half* devPtrZ,
at::Half* devPtrB) {
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
std::stringstream log_buf;
try {
int convDim = 2;
// Creates the necessary tensor descriptors
common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<
gitextract_y9jvowoy/ ├── KoSBERT/ │ ├── Clustering.py │ ├── README.md │ ├── SemanticSearch.py │ ├── con_training_sts.py │ ├── output/ │ │ └── empty.txt │ ├── run_example.sh │ └── training_nli.py ├── KoSentenceT5/ │ ├── README.md │ ├── apex/ │ │ ├── RNN/ │ │ │ ├── README.md │ │ │ ├── RNNBackend.py │ │ │ ├── __init__.py │ │ │ ├── cells.py │ │ │ └── models.py │ │ ├── __init__.py │ │ ├── amp/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── __version__.py │ │ │ ├── _amp_state.py │ │ │ ├── _initialize.py │ │ │ ├── _process_optimizer.py │ │ │ ├── amp.py │ │ │ ├── compat.py │ │ │ ├── frontend.py │ │ │ ├── handle.py │ │ │ ├── lists/ │ │ │ │ ├── __init__.py │ │ │ │ ├── functional_overrides.py │ │ │ │ ├── tensor_overrides.py │ │ │ │ └── torch_overrides.py │ │ │ ├── opt.py │ │ │ ├── rnn_compat.py │ │ │ ├── scaler.py │ │ │ ├── utils.py │ │ │ └── wrap.py │ │ ├── contrib/ │ │ │ ├── __init__.py │ │ │ ├── bottleneck/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bottleneck.py │ │ │ │ └── test.py │ │ │ ├── csrc/ │ │ │ │ ├── bottleneck/ │ │ │ │ │ └── bottleneck.cpp │ │ │ │ ├── fmha/ │ │ │ │ │ ├── fmha_api.cpp │ │ │ │ │ └── src/ │ │ │ │ │ ├── fmha/ │ │ │ │ │ │ ├── gemm.h │ │ │ │ │ │ ├── gmem_tile.h │ │ │ │ │ │ ├── kernel_traits.h │ │ │ │ │ │ ├── mask.h │ │ │ │ │ │ ├── smem_tile.h │ │ │ │ │ │ ├── softmax.h │ │ │ │ │ │ └── utils.h │ │ │ │ │ ├── fmha.h │ │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h │ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h │ │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_fprop_kernel_1xN.h │ │ │ │ │ ├── fmha_fprop_kernel_1xN_nl.h │ │ │ │ │ ├── fmha_fprop_kernel_1xN_reload_v.h │ │ │ │ │ ├── 
fmha_kernel.h │ │ │ │ │ ├── fmha_noloop_reduce.cu │ │ │ │ │ └── fmha_utils.h │ │ │ │ ├── groupbn/ │ │ │ │ │ ├── batch_norm.cu │ │ │ │ │ ├── batch_norm.h │ │ │ │ │ ├── batch_norm_add_relu.cu │ │ │ │ │ ├── batch_norm_add_relu.h │ │ │ │ │ ├── cuda_utils.h │ │ │ │ │ ├── interface.cpp │ │ │ │ │ ├── ipc.cu │ │ │ │ │ └── nhwc_batch_norm_kernel.h │ │ │ │ ├── layer_norm/ │ │ │ │ │ ├── ln_api.cpp │ │ │ │ │ ├── ln_bwd_semi_cuda_kernel.cu │ │ │ │ │ ├── ln_fwd_cuda_kernel.cu │ │ │ │ │ ├── ln_kernel_traits.h │ │ │ │ │ └── utils.cuh │ │ │ │ ├── multihead_attn/ │ │ │ │ │ ├── additive_masked_softmax_dropout.cpp │ │ │ │ │ ├── additive_masked_softmax_dropout_cuda.cu │ │ │ │ │ ├── dropout.h │ │ │ │ │ ├── encdec_multihead_attn.cpp │ │ │ │ │ ├── encdec_multihead_attn_cuda.cu │ │ │ │ │ ├── encdec_multihead_attn_norm_add.cpp │ │ │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu │ │ │ │ │ ├── layer_norm.h │ │ │ │ │ ├── masked_softmax_dropout.cpp │ │ │ │ │ ├── masked_softmax_dropout_cuda.cu │ │ │ │ │ ├── philox.h │ │ │ │ │ ├── self_multihead_attn.cpp │ │ │ │ │ ├── self_multihead_attn_bias.cpp │ │ │ │ │ ├── self_multihead_attn_bias_additive_mask.cpp │ │ │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu │ │ │ │ │ ├── self_multihead_attn_bias_cuda.cu │ │ │ │ │ ├── self_multihead_attn_cuda.cu │ │ │ │ │ ├── self_multihead_attn_norm_add.cpp │ │ │ │ │ ├── self_multihead_attn_norm_add_cuda.cu │ │ │ │ │ ├── softmax.h │ │ │ │ │ └── strided_batched_gemm.h │ │ │ │ ├── optimizers/ │ │ │ │ │ ├── fused_adam_cuda.cpp │ │ │ │ │ ├── fused_adam_cuda_kernel.cu │ │ │ │ │ ├── fused_lamb_cuda.cpp │ │ │ │ │ ├── fused_lamb_cuda_kernel.cu │ │ │ │ │ ├── multi_tensor_distopt_adam.cpp │ │ │ │ │ ├── multi_tensor_distopt_adam_kernel.cu │ │ │ │ │ ├── multi_tensor_distopt_lamb.cpp │ │ │ │ │ └── multi_tensor_distopt_lamb_kernel.cu │ │ │ │ ├── transducer/ │ │ │ │ │ ├── transducer_joint.cpp │ │ │ │ │ ├── transducer_joint_kernel.cu │ │ │ │ │ ├── transducer_loss.cpp │ │ │ │ │ └── transducer_loss_kernel.cu │ │ │ │ └── 
xentropy/ │ │ │ │ ├── interface.cpp │ │ │ │ └── xentropy_kernel.cu │ │ │ ├── examples/ │ │ │ │ └── multihead_attn/ │ │ │ │ ├── func_test_multihead_attn.py │ │ │ │ └── perf_test_multihead_attn.py │ │ │ ├── fmha/ │ │ │ │ ├── __init__.py │ │ │ │ └── fmha.py │ │ │ ├── groupbn/ │ │ │ │ ├── __init__.py │ │ │ │ └── batch_norm.py │ │ │ ├── layer_norm/ │ │ │ │ ├── __init__.py │ │ │ │ └── layer_norm.py │ │ │ ├── multihead_attn/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── encdec_multihead_attn.py │ │ │ │ ├── encdec_multihead_attn_func.py │ │ │ │ ├── fast_encdec_multihead_attn_func.py │ │ │ │ ├── fast_encdec_multihead_attn_norm_add_func.py │ │ │ │ ├── fast_self_multihead_attn_func.py │ │ │ │ ├── fast_self_multihead_attn_norm_add_func.py │ │ │ │ ├── mask_softmax_dropout_func.py │ │ │ │ ├── self_multihead_attn.py │ │ │ │ └── self_multihead_attn_func.py │ │ │ ├── optimizers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── distributed_fused_adam.py │ │ │ │ ├── distributed_fused_adam_v2.py │ │ │ │ ├── distributed_fused_adam_v3.py │ │ │ │ ├── distributed_fused_lamb.py │ │ │ │ ├── fp16_optimizer.py │ │ │ │ ├── fused_adam.py │ │ │ │ ├── fused_lamb.py │ │ │ │ └── fused_sgd.py │ │ │ ├── sparsity/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── asp.py │ │ │ │ ├── sparse_masklib.py │ │ │ │ └── test/ │ │ │ │ ├── checkpointing_test_part1.py │ │ │ │ ├── checkpointing_test_part2.py │ │ │ │ ├── checkpointing_test_reference.py │ │ │ │ └── toy_problem.py │ │ │ ├── test/ │ │ │ │ ├── fmha/ │ │ │ │ │ └── test_fmha.py │ │ │ │ ├── layer_norm/ │ │ │ │ │ └── test_fast_layer_norm.py │ │ │ │ ├── multihead_attn/ │ │ │ │ │ ├── test_encdec_multihead_attn.py │ │ │ │ │ ├── test_encdec_multihead_attn_norm_add.py │ │ │ │ │ ├── test_fast_self_multihead_attn_bias.py │ │ │ │ │ ├── test_mha_fused_softmax.py │ │ │ │ │ ├── test_self_multihead_attn.py │ │ │ │ │ └── test_self_multihead_attn_norm_add.py │ │ │ │ ├── test_label_smoothing.py │ │ │ │ └── transducer/ │ │ │ │ ├── test_transducer_joint.py │ │ │ │ 
├── test_transducer_loss.py │ │ │ │ └── transducer_ref.py │ │ │ ├── transducer/ │ │ │ │ ├── __init__.py │ │ │ │ └── transducer.py │ │ │ └── xentropy/ │ │ │ ├── __init__.py │ │ │ └── softmax_xentropy.py │ │ ├── fp16_utils/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── fp16_optimizer.py │ │ │ ├── fp16util.py │ │ │ └── loss_scaler.py │ │ ├── mlp/ │ │ │ ├── __init__.py │ │ │ └── mlp.py │ │ ├── multi_tensor_apply/ │ │ │ ├── __init__.py │ │ │ └── multi_tensor_apply.py │ │ ├── normalization/ │ │ │ ├── __init__.py │ │ │ └── fused_layer_norm.py │ │ ├── optimizers/ │ │ │ ├── __init__.py │ │ │ ├── fused_adagrad.py │ │ │ ├── fused_adam.py │ │ │ ├── fused_lamb.py │ │ │ ├── fused_novograd.py │ │ │ └── fused_sgd.py │ │ ├── parallel/ │ │ │ ├── LARC.py │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── distributed.py │ │ │ ├── multiproc.py │ │ │ ├── optimized_sync_batchnorm.py │ │ │ ├── optimized_sync_batchnorm_kernel.py │ │ │ ├── sync_batchnorm.py │ │ │ └── sync_batchnorm_kernel.py │ │ ├── pyprof/ │ │ │ ├── FAQs.md │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── examples/ │ │ │ │ ├── .gitignore │ │ │ │ ├── apex/ │ │ │ │ │ ├── README.md │ │ │ │ │ ├── fused_adam.py │ │ │ │ │ ├── fused_layer_norm.py │ │ │ │ │ └── test.sh │ │ │ │ ├── custom_func_module/ │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom_function.py │ │ │ │ │ ├── custom_module.py │ │ │ │ │ └── test.sh │ │ │ │ ├── imagenet/ │ │ │ │ │ ├── imagenet.py │ │ │ │ │ └── test.sh │ │ │ │ ├── jit/ │ │ │ │ │ ├── README.md │ │ │ │ │ ├── jit_script_function.py │ │ │ │ │ ├── jit_script_method.py │ │ │ │ │ ├── jit_trace_function.py │ │ │ │ │ ├── jit_trace_method.py │ │ │ │ │ └── test.sh │ │ │ │ ├── lenet.py │ │ │ │ ├── operators.py │ │ │ │ ├── simple.py │ │ │ │ └── user_annotation/ │ │ │ │ ├── README.md │ │ │ │ ├── resnet.py │ │ │ │ └── test.sh │ │ │ ├── nvtx/ │ │ │ │ ├── __init__.py │ │ │ │ └── nvmarker.py │ │ │ ├── parse/ │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── db.py │ │ │ │ ├── kernel.py │ │ │ │ ├── nvvp.py 
│ │ │ │ └── parse.py │ │ │ └── prof/ │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── activation.py │ │ │ ├── base.py │ │ │ ├── blas.py │ │ │ ├── conv.py │ │ │ ├── convert.py │ │ │ ├── data.py │ │ │ ├── dropout.py │ │ │ ├── embedding.py │ │ │ ├── index_slice_join_mutate.py │ │ │ ├── linear.py │ │ │ ├── loss.py │ │ │ ├── misc.py │ │ │ ├── normalization.py │ │ │ ├── optim.py │ │ │ ├── output.py │ │ │ ├── pointwise.py │ │ │ ├── pooling.py │ │ │ ├── prof.py │ │ │ ├── randomSample.py │ │ │ ├── recurrentCell.py │ │ │ ├── reduction.py │ │ │ ├── softmax.py │ │ │ ├── usage.py │ │ │ └── utility.py │ │ └── reparameterization/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── reparameterization.py │ │ └── weight_norm.py │ ├── data/ │ │ └── dataloader.py │ ├── main.py │ ├── model/ │ │ ├── loss.py │ │ ├── setting.py │ │ ├── simcse/ │ │ │ ├── kost5.py │ │ │ └── processor.py │ │ └── utils.py │ └── run_example.sh ├── KoSimCSE/ │ ├── README.md │ ├── SemanticSearch.py │ ├── apex/ │ │ ├── RNN/ │ │ │ ├── README.md │ │ │ ├── RNNBackend.py │ │ │ ├── __init__.py │ │ │ ├── cells.py │ │ │ └── models.py │ │ ├── __init__.py │ │ ├── amp/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── __version__.py │ │ │ ├── _amp_state.py │ │ │ ├── _initialize.py │ │ │ ├── _process_optimizer.py │ │ │ ├── amp.py │ │ │ ├── compat.py │ │ │ ├── frontend.py │ │ │ ├── handle.py │ │ │ ├── lists/ │ │ │ │ ├── __init__.py │ │ │ │ ├── functional_overrides.py │ │ │ │ ├── tensor_overrides.py │ │ │ │ └── torch_overrides.py │ │ │ ├── opt.py │ │ │ ├── rnn_compat.py │ │ │ ├── scaler.py │ │ │ ├── utils.py │ │ │ └── wrap.py │ │ ├── contrib/ │ │ │ ├── __init__.py │ │ │ ├── bottleneck/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bottleneck.py │ │ │ │ └── test.py │ │ │ ├── csrc/ │ │ │ │ ├── bottleneck/ │ │ │ │ │ └── bottleneck.cpp │ │ │ │ ├── fmha/ │ │ │ │ │ ├── fmha_api.cpp │ │ │ │ │ └── src/ │ │ │ │ │ ├── fmha/ │ │ │ │ │ │ ├── gemm.h │ │ │ │ │ │ ├── gmem_tile.h │ │ │ │ │ │ ├── kernel_traits.h │ │ │ │ │ │ ├── mask.h │ │ │ │ │ │ ├── 
smem_tile.h │ │ │ │ │ │ ├── softmax.h │ │ │ │ │ │ └── utils.h │ │ │ │ │ ├── fmha.h │ │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h │ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h │ │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu │ │ │ │ │ ├── fmha_fprop_kernel_1xN.h │ │ │ │ │ ├── fmha_fprop_kernel_1xN_nl.h │ │ │ │ │ ├── fmha_fprop_kernel_1xN_reload_v.h │ │ │ │ │ ├── fmha_kernel.h │ │ │ │ │ ├── fmha_noloop_reduce.cu │ │ │ │ │ └── fmha_utils.h │ │ │ │ ├── groupbn/ │ │ │ │ │ ├── batch_norm.cu │ │ │ │ │ ├── batch_norm.h │ │ │ │ │ ├── batch_norm_add_relu.cu │ │ │ │ │ ├── batch_norm_add_relu.h │ │ │ │ │ ├── cuda_utils.h │ │ │ │ │ ├── interface.cpp │ │ │ │ │ ├── ipc.cu │ │ │ │ │ └── nhwc_batch_norm_kernel.h │ │ │ │ ├── layer_norm/ │ │ │ │ │ ├── ln_api.cpp │ │ │ │ │ ├── ln_bwd_semi_cuda_kernel.cu │ │ │ │ │ ├── ln_fwd_cuda_kernel.cu │ │ │ │ │ ├── ln_kernel_traits.h │ │ │ │ │ └── utils.cuh │ │ │ │ ├── multihead_attn/ │ │ │ │ │ ├── additive_masked_softmax_dropout.cpp │ │ │ │ │ ├── additive_masked_softmax_dropout_cuda.cu │ │ │ │ │ ├── dropout.h │ │ │ │ │ ├── encdec_multihead_attn.cpp │ │ │ │ │ ├── encdec_multihead_attn_cuda.cu │ │ │ │ │ ├── encdec_multihead_attn_norm_add.cpp │ │ │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu │ │ │ │ │ ├── layer_norm.h │ │ │ │ │ ├── masked_softmax_dropout.cpp │ │ │ │ │ ├── masked_softmax_dropout_cuda.cu │ │ │ │ │ ├── philox.h │ │ │ │ │ ├── self_multihead_attn.cpp │ │ │ │ │ ├── self_multihead_attn_bias.cpp │ │ │ │ │ ├── self_multihead_attn_bias_additive_mask.cpp │ │ │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu │ │ │ │ │ ├── self_multihead_attn_bias_cuda.cu │ │ │ │ │ ├── 
self_multihead_attn_cuda.cu │ │ │ │ │ ├── self_multihead_attn_norm_add.cpp │ │ │ │ │ ├── self_multihead_attn_norm_add_cuda.cu │ │ │ │ │ ├── softmax.h │ │ │ │ │ └── strided_batched_gemm.h │ │ │ │ ├── optimizers/ │ │ │ │ │ ├── fused_adam_cuda.cpp │ │ │ │ │ ├── fused_adam_cuda_kernel.cu │ │ │ │ │ ├── fused_lamb_cuda.cpp │ │ │ │ │ ├── fused_lamb_cuda_kernel.cu │ │ │ │ │ ├── multi_tensor_distopt_adam.cpp │ │ │ │ │ ├── multi_tensor_distopt_adam_kernel.cu │ │ │ │ │ ├── multi_tensor_distopt_lamb.cpp │ │ │ │ │ └── multi_tensor_distopt_lamb_kernel.cu │ │ │ │ ├── transducer/ │ │ │ │ │ ├── transducer_joint.cpp │ │ │ │ │ ├── transducer_joint_kernel.cu │ │ │ │ │ ├── transducer_loss.cpp │ │ │ │ │ └── transducer_loss_kernel.cu │ │ │ │ └── xentropy/ │ │ │ │ ├── interface.cpp │ │ │ │ └── xentropy_kernel.cu │ │ │ ├── examples/ │ │ │ │ └── multihead_attn/ │ │ │ │ ├── func_test_multihead_attn.py │ │ │ │ └── perf_test_multihead_attn.py │ │ │ ├── fmha/ │ │ │ │ ├── __init__.py │ │ │ │ └── fmha.py │ │ │ ├── groupbn/ │ │ │ │ ├── __init__.py │ │ │ │ └── batch_norm.py │ │ │ ├── layer_norm/ │ │ │ │ ├── __init__.py │ │ │ │ └── layer_norm.py │ │ │ ├── multihead_attn/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── encdec_multihead_attn.py │ │ │ │ ├── encdec_multihead_attn_func.py │ │ │ │ ├── fast_encdec_multihead_attn_func.py │ │ │ │ ├── fast_encdec_multihead_attn_norm_add_func.py │ │ │ │ ├── fast_self_multihead_attn_func.py │ │ │ │ ├── fast_self_multihead_attn_norm_add_func.py │ │ │ │ ├── mask_softmax_dropout_func.py │ │ │ │ ├── self_multihead_attn.py │ │ │ │ └── self_multihead_attn_func.py │ │ │ ├── optimizers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── distributed_fused_adam.py │ │ │ │ ├── distributed_fused_adam_v2.py │ │ │ │ ├── distributed_fused_adam_v3.py │ │ │ │ ├── distributed_fused_lamb.py │ │ │ │ ├── fp16_optimizer.py │ │ │ │ ├── fused_adam.py │ │ │ │ ├── fused_lamb.py │ │ │ │ └── fused_sgd.py │ │ │ ├── sparsity/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── asp.py │ │ │ 
│ ├── sparse_masklib.py │ │ │ │ └── test/ │ │ │ │ ├── checkpointing_test_part1.py │ │ │ │ ├── checkpointing_test_part2.py │ │ │ │ ├── checkpointing_test_reference.py │ │ │ │ └── toy_problem.py │ │ │ ├── test/ │ │ │ │ ├── fmha/ │ │ │ │ │ └── test_fmha.py │ │ │ │ ├── layer_norm/ │ │ │ │ │ └── test_fast_layer_norm.py │ │ │ │ ├── multihead_attn/ │ │ │ │ │ ├── test_encdec_multihead_attn.py │ │ │ │ │ ├── test_encdec_multihead_attn_norm_add.py │ │ │ │ │ ├── test_fast_self_multihead_attn_bias.py │ │ │ │ │ ├── test_mha_fused_softmax.py │ │ │ │ │ ├── test_self_multihead_attn.py │ │ │ │ │ └── test_self_multihead_attn_norm_add.py │ │ │ │ ├── test_label_smoothing.py │ │ │ │ └── transducer/ │ │ │ │ ├── test_transducer_joint.py │ │ │ │ ├── test_transducer_loss.py │ │ │ │ └── transducer_ref.py │ │ │ ├── transducer/ │ │ │ │ ├── __init__.py │ │ │ │ └── transducer.py │ │ │ └── xentropy/ │ │ │ ├── __init__.py │ │ │ └── softmax_xentropy.py │ │ ├── fp16_utils/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── fp16_optimizer.py │ │ │ ├── fp16util.py │ │ │ └── loss_scaler.py │ │ ├── mlp/ │ │ │ ├── __init__.py │ │ │ └── mlp.py │ │ ├── multi_tensor_apply/ │ │ │ ├── __init__.py │ │ │ └── multi_tensor_apply.py │ │ ├── normalization/ │ │ │ ├── __init__.py │ │ │ └── fused_layer_norm.py │ │ ├── optimizers/ │ │ │ ├── __init__.py │ │ │ ├── fused_adagrad.py │ │ │ ├── fused_adam.py │ │ │ ├── fused_lamb.py │ │ │ ├── fused_novograd.py │ │ │ └── fused_sgd.py │ │ ├── parallel/ │ │ │ ├── LARC.py │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── distributed.py │ │ │ ├── multiproc.py │ │ │ ├── optimized_sync_batchnorm.py │ │ │ ├── optimized_sync_batchnorm_kernel.py │ │ │ ├── sync_batchnorm.py │ │ │ └── sync_batchnorm_kernel.py │ │ ├── pyprof/ │ │ │ ├── FAQs.md │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── examples/ │ │ │ │ ├── .gitignore │ │ │ │ ├── apex/ │ │ │ │ │ ├── README.md │ │ │ │ │ ├── fused_adam.py │ │ │ │ │ ├── fused_layer_norm.py │ │ │ │ │ └── test.sh │ │ │ │ ├── custom_func_module/ │ │ 
│ │ │ ├── README.md │ │ │ │ │ ├── custom_function.py │ │ │ │ │ ├── custom_module.py │ │ │ │ │ └── test.sh │ │ │ │ ├── imagenet/ │ │ │ │ │ ├── imagenet.py │ │ │ │ │ └── test.sh │ │ │ │ ├── jit/ │ │ │ │ │ ├── README.md │ │ │ │ │ ├── jit_script_function.py │ │ │ │ │ ├── jit_script_method.py │ │ │ │ │ ├── jit_trace_function.py │ │ │ │ │ ├── jit_trace_method.py │ │ │ │ │ └── test.sh │ │ │ │ ├── lenet.py │ │ │ │ ├── operators.py │ │ │ │ ├── simple.py │ │ │ │ └── user_annotation/ │ │ │ │ ├── README.md │ │ │ │ ├── resnet.py │ │ │ │ └── test.sh │ │ │ ├── nvtx/ │ │ │ │ ├── __init__.py │ │ │ │ └── nvmarker.py │ │ │ ├── parse/ │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── db.py │ │ │ │ ├── kernel.py │ │ │ │ ├── nvvp.py │ │ │ │ └── parse.py │ │ │ └── prof/ │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── activation.py │ │ │ ├── base.py │ │ │ ├── blas.py │ │ │ ├── conv.py │ │ │ ├── convert.py │ │ │ ├── data.py │ │ │ ├── dropout.py │ │ │ ├── embedding.py │ │ │ ├── index_slice_join_mutate.py │ │ │ ├── linear.py │ │ │ ├── loss.py │ │ │ ├── misc.py │ │ │ ├── normalization.py │ │ │ ├── optim.py │ │ │ ├── output.py │ │ │ ├── pointwise.py │ │ │ ├── pooling.py │ │ │ ├── prof.py │ │ │ ├── randomSample.py │ │ │ ├── recurrentCell.py │ │ │ ├── reduction.py │ │ │ ├── softmax.py │ │ │ ├── usage.py │ │ │ └── utility.py │ │ └── reparameterization/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── reparameterization.py │ │ └── weight_norm.py │ ├── data/ │ │ └── dataloader.py │ ├── main.py │ ├── model/ │ │ ├── loss.py │ │ ├── setting.py │ │ ├── simcse/ │ │ │ ├── bert.py │ │ │ └── processor.py │ │ └── utils.py │ ├── output/ │ │ └── empty.txt │ ├── requirements.txt │ └── run_example.sh ├── LICENSE ├── README.md ├── get_model_checkpoint.sh └── get_model_dataset.sh
Showing preview only (211K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (2719 symbols across 329 files)
FILE: KoSentenceT5/apex/RNN/RNNBackend.py
function is_iterable (line 10) | def is_iterable(maybe_iterable):
function flatten_list (line 14) | def flatten_list(tens_list):
class bidirectionalRNN (line 25) | class bidirectionalRNN(nn.Module):
method __init__ (line 29) | def __init__(self, inputRNN, num_layers=1, dropout = 0):
method forward (line 37) | def forward(self, input, collect_hidden=False):
method reset_parameters (line 52) | def reset_parameters(self):
method init_hidden (line 59) | def init_hidden(self, bsz):
method detach_hidden (line 66) | def detach_hidden(self):
method reset_hidden (line 73) | def reset_hidden(self, bsz):
method init_inference (line 80) | def init_inference(self, bsz):
class stackedRNN (line 90) | class stackedRNN(nn.Module):
method __init__ (line 94) | def __init__(self, inputRNN, num_layers=1, dropout=0):
method forward (line 122) | def forward(self, input, collect_hidden=False, reverse=False):
method reset_parameters (line 197) | def reset_parameters(self):
method init_hidden (line 204) | def init_hidden(self, bsz):
method detach_hidden (line 211) | def detach_hidden(self):
method reset_hidden (line 218) | def reset_hidden(self, bsz):
method init_inference (line 225) | def init_inference(self, bsz):
class RNNCell (line 232) | class RNNCell(nn.Module):
method __init__ (line 242) | def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_h...
method new_like (line 274) | def new_like(self, new_input_size=None):
method reset_parameters (line 291) | def reset_parameters(self, gain=1):
method init_hidden (line 309) | def init_hidden(self, bsz):
method reset_hidden (line 330) | def reset_hidden(self, bsz):
method detach_hidden (line 338) | def detach_hidden(self):
method forward (line 348) | def forward(self, input):
FILE: KoSentenceT5/apex/RNN/cells.py
class mLSTMRNNCell (line 12) | class mLSTMRNNCell(RNNCell):
method __init__ (line 17) | def __init__(self, input_size, hidden_size, bias = False, output_size ...
method forward (line 26) | def forward(self, input):
method new_like (line 45) | def new_like(self, new_input_size=None):
function mLSTMCell (line 55) | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=N...
FILE: KoSentenceT5/apex/RNN/models.py
function toRNNBackend (line 8) | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
function LSTM (line 19) | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
function GRU (line 26) | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=Fals...
function ReLU (line 33) | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
function Tanh (line 40) | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
function mLSTM (line 47) | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=Fa...
FILE: KoSentenceT5/apex/amp/_amp_state.py
class AmpState (line 18) | class AmpState(object):
method __init__ (line 19) | def __init__(self):
function warn_or_err (line 29) | def warn_or_err(msg):
function maybe_print (line 39) | def maybe_print(msg, rank0=False):
function master_params (line 60) | def master_params(optimizer):
FILE: KoSentenceT5/apex/amp/_initialize.py
function to_type (line 21) | def to_type(dtype, t):
function applier (line 39) | def applier(value, fn):
function check_models (line 64) | def check_models(models):
function check_params_fp32 (line 79) | def check_params_fp32(models):
function check_optimizers (line 119) | def check_optimizers(optimizers):
class O2StateDictHook (line 133) | class O2StateDictHook(object):
method __init__ (line 134) | def __init__(self, fn):
method __call__ (line 137) | def __call__(self, module, state_dict, prefix, local_metadata):
function _initialize (line 145) | def _initialize(models, optimizers, properties, num_losses=1, cast_model...
FILE: KoSentenceT5/apex/amp/_process_optimizer.py
class AmpOptimizerState (line 9) | class AmpOptimizerState(object):
method __init__ (line 10) | def __init__(self):
function _master_params_to_model_params (line 14) | def _master_params_to_model_params(self):
function lazy_init_with_master_weights (line 28) | def lazy_init_with_master_weights(self):
function post_backward_models_are_masters (line 93) | def post_backward_models_are_masters(scaler, params, stashed_grads, scal...
function prepare_backward_with_master_weights (line 142) | def prepare_backward_with_master_weights(self):
function post_backward_with_master_weights (line 161) | def post_backward_with_master_weights(self, scaler):
function lazy_init_no_master_weights (line 205) | def lazy_init_no_master_weights(self):
function prepare_backward_no_master_weights (line 224) | def prepare_backward_no_master_weights(self):
function post_backward_no_master_weights (line 240) | def post_backward_no_master_weights(self, scaler):
function prepare_backward_with_master_weights_FusedSGD (line 258) | def prepare_backward_with_master_weights_FusedSGD(self):
function post_backward_with_master_weights_FusedSGD (line 277) | def post_backward_with_master_weights_FusedSGD(self, scaler):
function prepare_backward_no_master_weights_FusedSGD (line 305) | def prepare_backward_no_master_weights_FusedSGD(self):
function post_backward_no_master_weights_FusedSGD (line 309) | def post_backward_no_master_weights_FusedSGD(self, scaler):
function _amp_lazy_init (line 313) | def _amp_lazy_init(self):
function _process_optimizer (line 321) | def _process_optimizer(optimizer, properties):
FILE: KoSentenceT5/apex/amp/amp.py
function _decorator_helper (line 18) | def _decorator_helper(orig_fn, cast_fn, wrap_fn):
function half_function (line 30) | def half_function(fn):
function float_function (line 35) | def float_function(fn):
function promote_function (line 40) | def promote_function(fn):
function register_half_function (line 46) | def register_half_function(module, name):
function register_float_function (line 53) | def register_float_function(module, name):
function register_promote_function (line 60) | def register_promote_function(module, name):
function init (line 68) | def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbos...
FILE: KoSentenceT5/apex/amp/compat.py
function variable_is_tensor (line 4) | def variable_is_tensor():
function tensor_is_variable (line 8) | def tensor_is_variable():
function tensor_is_float_tensor (line 13) | def tensor_is_float_tensor():
function is_tensor_like (line 19) | def is_tensor_like(x):
function is_floating_point (line 24) | def is_floating_point(x):
function scalar_python_val (line 35) | def scalar_python_val(x):
function filter_attrs (line 45) | def filter_attrs(module, attrs):
FILE: KoSentenceT5/apex/amp/frontend.py
class Properties (line 7) | class Properties(object):
method __init__ (line 13) | def __init__(self):
method _update_options_dict (line 33) | def _update_options_dict(self, new_options):
method __getattr__ (line 43) | def __getattr__(self, name):
method __setattr__ (line 51) | def __setattr__(self, name, value):
class O3 (line 102) | class O3:
method __call__ (line 111) | def __call__(self, properties):
class O2 (line 124) | class O2:
method __call__ (line 134) | def __call__(self, properties):
class O1 (line 147) | class O1:
method __call__ (line 156) | def __call__(self, properties):
class O0 (line 169) | class O0:
method __call__ (line 175) | def __call__(self, properties):
function initialize (line 195) | def initialize(
function state_dict (line 361) | def state_dict(destination=None):
function load_state_dict (line 373) | def load_state_dict(state_dict):
FILE: KoSentenceT5/apex/amp/handle.py
function scale_loss (line 17) | def scale_loss(loss,
function disable_casts (line 164) | def disable_casts():
class AmpHandle (line 170) | class AmpHandle(object):
method __init__ (line 171) | def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=...
method is_active (line 179) | def is_active(self):
method _disable_casts (line 183) | def _disable_casts(self):
method wrap_optimizer (line 188) | def wrap_optimizer(self, optimizer, num_loss=1):
method scale_loss (line 193) | def scale_loss(self, loss, optimizer):
method _clear_cache (line 226) | def _clear_cache(self):
method _save_func (line 230) | def _save_func(self, mod, fn, func):
method _deactivate (line 233) | def _deactivate(self):
method has_cache (line 239) | def has_cache(self):
method cache (line 243) | def cache(self):
method remove_cache (line 246) | def remove_cache(self, param):
method verbose (line 251) | def verbose(self):
class NoOpHandle (line 254) | class NoOpHandle(object):
method is_active (line 255) | def is_active(self):
method _disable_casts (line 259) | def _disable_casts(self):
method wrap_optimizer (line 262) | def wrap_optimizer(self, optimizer, num_loss=1):
method scale_loss (line 266) | def scale_loss(self, loss, optimizer):
method has_cache (line 270) | def has_cache(self):
method verbose (line 274) | def verbose(self):
method _clear_cache (line 277) | def _clear_cache(self):
method _deactivate (line 280) | def _deactivate(self):
FILE: KoSentenceT5/apex/amp/opt.py
class OptimWrapper (line 9) | class OptimWrapper(object):
method __init__ (line 10) | def __init__(self, optimizer, amp_handle, num_loss):
method scale_loss (line 19) | def scale_loss(self, loss):
method _cur_loss_scaler (line 55) | def _cur_loss_scaler(self):
method step (line 59) | def step(self, closure=None):
method __getattr__ (line 80) | def __getattr__(self, attr):
method __getstate__ (line 84) | def __getstate__(self):
method __setstate__ (line 87) | def __setstate__(self):
method __repr__ (line 90) | def __repr__(self):
method state_dict (line 93) | def state_dict(self):
method load_state_dict (line 96) | def load_state_dict(self, state_dict):
method zero_grad (line 99) | def zero_grad(self):
method add_param_group (line 102) | def add_param_group(self, param_group):
FILE: KoSentenceT5/apex/amp/rnn_compat.py
function _gen_VF_wrapper (line 7) | def _gen_VF_wrapper(name):
class VariableFunctionsShim (line 17) | class VariableFunctionsShim(object):
method __init__ (line 18) | def __init__(self):
function has_old_rnns (line 24) | def has_old_rnns():
function whitelist_rnn_cells (line 31) | def whitelist_rnn_cells(handle, verbose):
FILE: KoSentenceT5/apex/amp/scaler.py
function scale_check_overflow_python (line 6) | def scale_check_overflow_python(model_grad, master_grad, scale, check_ov...
function axpby_check_overflow_python (line 19) | def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a...
class LossScaler (line 33) | class LossScaler(object):
method __init__ (line 38) | def __init__(self,
method loss_scale (line 73) | def loss_scale(self):
method unscale_python (line 76) | def unscale_python(self, model_grads, master_grads, scale):
method unscale (line 94) | def unscale(self, model_grads, master_grads, unused_scale, models_are_...
method unscale_with_stashed_python (line 126) | def unscale_with_stashed_python(self,
method unscale_with_stashed (line 152) | def unscale_with_stashed(self,
method clear_overflow_state (line 191) | def clear_overflow_state(self):
method update_scale (line 197) | def update_scale(self):
FILE: KoSentenceT5/apex/amp/utils.py
function is_cuda_enabled (line 8) | def is_cuda_enabled():
function get_cuda_version (line 11) | def get_cuda_version():
function is_fp_tensor (line 14) | def is_fp_tensor(x):
function is_nested (line 23) | def is_nested(x):
function should_cache (line 26) | def should_cache(x):
function collect_fp_tensor_types (line 36) | def collect_fp_tensor_types(args, kwargs):
function type_string (line 51) | def type_string(x):
function maybe_half (line 54) | def maybe_half(x, name='', verbose=False):
function maybe_float (line 65) | def maybe_float(x, name='', verbose=False):
function casted_args (line 77) | def casted_args(cast_fn, args, kwargs):
function cached_cast (line 90) | def cached_cast(cast_fn, x, cache):
function verbosify (line 124) | def verbosify(cast_fn, fn_name, verbose):
function as_inplace (line 130) | def as_inplace(fns):
function has_func (line 134) | def has_func(mod, fn):
function get_func (line 140) | def get_func(mod, fn):
function set_func (line 146) | def set_func(mod, fn, new_fn):
function set_func_save (line 152) | def set_func_save(handle, mod, fn, new_fn):
function synthesize_flattened_rnn_weights (line 171) | def synthesize_flattened_rnn_weights(fp32_weights,
function new_synthesize_flattened_rnn_weights (line 194) | def new_synthesize_flattened_rnn_weights(fp32_weights,
FILE: KoSentenceT5/apex/amp/wrap.py
function make_cast_wrapper (line 10) | def make_cast_wrapper(orig_fn, cast_fn, handle,
function cached_cast (line 31) | def cached_cast(mod, fn, cast_fn, handle,
function make_promote_wrapper (line 44) | def make_promote_wrapper(orig_fn, cast_fn, handle=None):
function promote (line 65) | def promote(mod, fn, handle, verbose=False):
function sequence_promote (line 71) | def sequence_promote(mod, fn, handle, verbose=False):
function promote_match_arg0 (line 92) | def promote_match_arg0(mod, fn, handle, verbose=False):
function err_if_any_half (line 114) | def err_if_any_half(mod, fn, handle, custom_err_msg=None):
function err_if_arg0_half (line 132) | def err_if_arg0_half(mod, fn, handle, verbose=False):
function rnn_cast (line 157) | def rnn_cast(backend, fn, handle, verbose=False):
function new_rnn_cast (line 222) | def new_rnn_cast(fn, handle, verbose=False):
function disable_casts (line 267) | def disable_casts(mod, fn, handle):
FILE: KoSentenceT5/apex/contrib/bottleneck/bottleneck.py
function kaiming_uniform_ (line 5) | def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_rel...
class FrozenBatchNorm2d (line 9) | class FrozenBatchNorm2d(torch.nn.Module):
method __init__ (line 13) | def __init__(self, n):
method get_scale_bias (line 20) | def get_scale_bias(self, nhwc=False):
method forward (line 31) | def forward(self, x):
function drelu_dscale1 (line 37) | def drelu_dscale1(grad_o, output, scale1):
function drelu_dscale2 (line 44) | def drelu_dscale2(grad_o, output, scale1, scale2):
class BottleneckFunction (line 51) | class BottleneckFunction(torch.autograd.Function):
method forward (line 53) | def forward(ctx, nhwc, stride_1x1, scale, bias, x, *conv):
method backward (line 75) | def backward(ctx, grad_o):
function conv3x3 (line 102) | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
function conv1x1 (line 107) | def conv1x1(in_planes, out_planes, stride=1):
class Bottleneck (line 111) | class Bottleneck(torch.nn.Module):
method __init__ (line 119) | def __init__(self, in_channels, bottleneck_channels, out_channels, str...
method forward (line 174) | def forward(self, x):
FILE: KoSentenceT5/apex/contrib/csrc/bottleneck/bottleneck.cpp
function checkCudnnError (line 31) | int checkCudnnError(cudnnStatus_t code, const char* expr, const char* fi...
function checkError (line 42) | void checkError(cudaError_t code, char const * func, const char *file, c...
function generateStrides (line 55) | void generateStrides(const int64_t* dimA, int64_t* strideA, int nbDims, ...
function getFwdConvDilatedFilterDim (line 75) | int getFwdConvDilatedFilterDim(int filterDim, int dilation) {
function getFwdConvPaddedImageDim (line 79) | int getFwdConvPaddedImageDim(int tensorDim, int pad) {
function getFwdConvOutputDim (line 83) | int getFwdConvOutputDim(
function common_conv_descriptors (line 111) | common_conv_descriptors
function common_convbias_descriptors (line 173) | common_convbias_descriptors
function dconv_descriptors (line 294) | dconv_descriptors
function getConvFusionString (line 377) | std::string getConvFusionString(int64_t* x_dim_padded,
function run_conv_scale_bias_add_activation (line 469) | void
function run_conv_scale_bias (line 630) | void
function run_dconv_drelu_dscale (line 759) | void
function run_dconv (line 886) | void
function run_dconv_add (line 992) | void
function bottleneck_forward (line 1104) | std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int strid...
function bottleneck_backward (line 1287) | std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stri...
function PYBIND11_MODULE (line 1609) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/fmha_api.cpp
function set_params (line 33) | void set_params(Fused_multihead_attention_fprop_params ¶ms,
function mha_fwd (line 86) | std::vector<at::Tensor>
function mha_bwd (line 182) | std::vector<at::Tensor>
function mha_fwd_nl (line 262) | std::vector<at::Tensor> mha_fwd_nl(const at::Tensor &qkv, // tot...
function mha_bwd_nl (line 342) | std::vector<at::Tensor> mha_bwd_nl(const at::Tensor &dout, // tot...
function PYBIND11_MODULE (line 426) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha.h
type Qkv_params (line 46) | struct Qkv_params {
function Qkv_params (line 59) | struct Fused_multihead_attention_fprop_params : public Qkv_params {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/gemm.h
function namespace (line 34) | namespace fmha {
type Fragment_accumulator (line 145) | struct Fragment_accumulator
function add (line 152) | void add(const Other_fragment_ &other) {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/gmem_tile.h
function namespace (line 30) | namespace fmha {
function __device__ (line 112) | inline __device__ void store(const uint4 (&data)[LDGS]) {
function __device__ (line 123) | inline __device__ void move() {
function __device__ (line 201) | inline __device__ void store(const uint4 (&src)[STGS_PER_LOOP], int mi) {
function __device__ (line 222) | inline __device__ void move() {
function __device__ (line 273) | __device__ Gmem_tile_mma_sd(void *ptr, const Params ¶ms, const int t...
function __device__ (line 288) | inline __device__ void store(const Type &data, const int mi, const int n...
function __device__ (line 300) | inline __device__ void move() {
function Base (line 311) | struct Gmem_tile_mma_s : public Base {
function Base (line 404) | struct Gmem_tile_dq : public Base {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/mask.h
function namespace (line 30) | namespace fmha {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/smem_tile.h
function namespace (line 33) | namespace fmha {
function __device__ (line 396) | inline __device__ Smem_tile_row_a(void *smem, int tidx) : Base(smem, tid...
function __device__ (line 462) | inline __device__ void reset_read_offset() {
function __device__ (line 494) | inline __device__ Smem_tile_a(void *smem, int tidx) : Base(smem, tidx) {
function __device__ (line 581) | inline __device__ Smem_tile_col_b(void *smem, int tidx) : Base(smem, tid...
function __device__ (line 653) | inline __device__ void reset_read_offset() {
function __device__ (line 685) | inline __device__ Smem_tile_b(void *smem, int tidx) : Base(smem, tidx) {
function __device__ (line 748) | inline __device__ Smem_tile_row_b(void *smem, int tidx) : Base(smem, tid...
function __device__ (line 892) | inline __device__ Smem_tile_b(void *smem, int tidx) : Base(smem, tidx) {
function __device__ (line 912) | inline __device__ Smem_tile_v(void *smem, int tidx) : Base(smem, tidx) {
function __device__ (line 1003) | inline __device__ Smem_tile_o(void *smem, int tidx) {
function store (line 1057) | void store(const Accumulator (&acc)[M][N], int mi) {
function __device__ (line 1129) | inline __device__ Smem_tile_mma(char *smem, int tidx) {
function store (line 1147) | void store(const uint4 (®s)[M][N]) {
function __device__ (line 1177) | inline __device__ Smem_tile_mma_transposed(char *smem, int tidx) : Base(...
function load (line 1189) | void load(Fragment (&frag)[M][N]) {
function __device__ (line 1223) | inline __device__ Smem_tile_mma_epilogue(char *smem, int tidx) : Base(sm...
function store (line 1238) | void store(const Acc (&acc)[M][N]){
function store (line 1272) | void store(const uint4 (®s)[M][N]) {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/softmax.h
function namespace (line 30) | namespace fmha {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/utils.h
function namespace (line 38) | namespace fmha {
function __device__ (line 247) | static inline __device__ uint32_t hadd2(uint32_t a, uint32_t b) {
function __device__ (line 255) | static inline __device__ uint32_t hmin2(uint32_t a, uint32_t b) {
function __device__ (line 263) | static inline __device__ uint32_t hmul2(uint32_t a, uint32_t b) {
function __device__ (line 271) | static inline __device__ uint2 hmul4(uint2 a, uint2 b) {
function __device__ (line 280) | static inline __device__ uint4 hmul8(uint4 a, uint4 b) {
function __device__ (line 291) | static inline __device__ uint4 hmul8(uint32_t a, uint4 b) {
function __device__ (line 317) | static inline __device__ uint32_t habs2(uint32_t x) {
function __device__ (line 332) | static inline __device__ uint16_t clamp_to_zero(uint16_t x) {
function __device__ (line 340) | static inline __device__ uint16_t float_to_half(float f) {
function __device__ (line 348) | static inline __device__ uint32_t float2_to_half2(float a, float b) {
function __device__ (line 362) | static inline __device__ uint32_t float_to_half2(float a) {
function __device__ (line 368) | static inline __device__ uint32_t float2_to_half2(const float2 &f) {
function __device__ (line 374) | static inline __device__ uint2 float4_to_half4(float x, float y, float z...
function __device__ (line 383) | static inline __device__ uint32_t hfma2(uint32_t a, uint32_t b, uint32_t...
function __device__ (line 391) | static inline __device__ uint32_t hfma2_relu(uint32_t a, uint32_t b, uin...
function __device__ (line 403) | static inline __device__ uint32_t h0_h0(uint32_t x) {
function __device__ (line 412) | static inline __device__ float h0_to_float(uint32_t h2) {
function __device__ (line 424) | static inline __device__ uint32_t h1_h1(uint32_t x) {
function __device__ (line 433) | static inline __device__ uint16_t hadd(uint16_t a, uint16_t b) {
function __device__ (line 441) | static inline __device__ uint32_t hadd(uint32_t a, uint32_t b) {
function __device__ (line 447) | static inline __device__ uint2 hadd4(uint2 a, uint2 b) {
function __device__ (line 456) | static inline __device__ uint2 hadd(uint2 a, uint2 b) {
function __device__ (line 462) | static inline __device__ uint4 hadd8(uint4 a, uint4 b) {
function __device__ (line 473) | static inline __device__ uint4 fadd4(uint4 a, uint4 b) {
function __device__ (line 484) | static inline __device__ uint4 hadd(uint4 a, uint4 b) {
function __device__ (line 490) | static inline __device__ float half_to_float(uint16_t h) {
function __device__ (line 498) | static inline __device__ float2 half2_to_float2(uint32_t x) {
function __device__ (line 514) | static inline __device__ uint16_t hfma(uint16_t a, uint16_t b, uint16_t ...
function __device__ (line 522) | static inline __device__ uint16_t hmul(uint16_t a, uint16_t b) {
function __device__ (line 530) | static inline __device__ float sigmoid(float x) {
function __device__ (line 685) | inline __device__ Ldg_functor(Data_type (&fetch)[N], const void* (&ptrs)...
function __device__ (line 690) | inline __device__ void clear(int ii) {
function __device__ (line 695) | inline __device__ void load(int ii, bool p) {
function __device__ (line 847) | inline __device__ void stg(void *ptr, uint8_t val) {
function __device__ (line 853) | inline __device__ void stg(void *ptr, uint16_t val) {
function __device__ (line 859) | inline __device__ void stg(void *ptr, uint32_t val) {
function __device__ (line 865) | inline __device__ void stg(void *ptr, uint2 val) {
function __device__ (line 871) | inline __device__ void stg(void *ptr, uint4 val) {
function __device__ (line 881) | inline __device__ void sts(uint32_t ptr, uint16_t val) {
function __device__ (line 887) | inline __device__ void sts(uint32_t ptr, uint32_t val) {
function __device__ (line 893) | inline __device__ void sts(uint32_t ptr, uint2 val) {
function __device__ (line 903) | inline __device__ void sts(uint32_t ptr, uint4 val) {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload.h
function namespace (line 34) | namespace fmha {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload_nl.h
function namespace (line 34) | namespace fmha {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN.h
function namespace (line 34) | namespace fmha {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN_nl.h
function namespace (line 35) | namespace fmha {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN_reload_v.h
function namespace (line 34) | namespace fmha {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_kernel.h
function namespace (line 39) | namespace fmha {
function __device__ (line 90) | inline __device__ Noloop_traits(const int bidc)
function move_all (line 96) | void move_all(Tiles & ... tiles) const {
function __device__ (line 113) | inline __device__ int offset_loop_count(const int l) {
function __device__ (line 157) | inline __device__ int offset_loop_count(const int l) {
FILE: KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_utils.h
type Data_type (line 53) | enum Data_type { DATA_TYPE_FP16, DATA_TYPE_FP32, DATA_TYPE_INT32, DATA_T...
function set_alpha (line 57) | static inline void set_alpha( uint32_t &alpha, float norm, Data_type dty...
function get_size_in_bytes (line 75) | static inline size_t get_size_in_bytes( size_t n, Data_type dtype ) {
FILE: KoSentenceT5/apex/contrib/csrc/groupbn/batch_norm.h
function class (line 41) | class NhwcBatchNorm {
function createTensorDescriptor (line 193) | void createTensorDescriptor(cudnnTensorDescriptor_t *descriptor) {
function destroyTensorDescriptor (line 199) | void destroyTensorDescriptor(cudnnTensorDescriptor_t descriptor) {
type StorageType (line 223) | typedef uint16_t StorageType;
function _fwdKernelLauncher (line 258) | void _fwdKernelLauncher(cudaStream_t stream, NhwcBatchNormFwdParams params,
function _bwdKernelLauncher (line 338) | void _bwdKernelLauncher(cudaStream_t stream, NhwcBatchNormBwdParams params,
function smem_driven_bwd_occupancy (line 469) | static int smem_driven_bwd_occupancy(int device_id, const int max_cta_pe...
function std (line 478) | const std::vector<size_t> NhwcBatchNorm::numWorkspaceBytes() const {
function _setFwdParams (line 510) | void NhwcBatchNorm::_setFwdParams(NhwcBatchNormFwdParams *params) const {
function _setFwdInferenceParams (line 534) | void NhwcBatchNorm::_setFwdInferenceParams(NhwcBatchNormFwdInferenceParams
function _setBwdParams (line 548) | void NhwcBatchNorm::_setBwdParams(NhwcBatchNormBwdParams *params) const {
function fwdInference (line 569) | void NhwcBatchNorm::fwdInference(cudaStream_t stream, bool use_relu) {
function dim3 (line 612) | dim3 NhwcBatchNorm::calc_fwd_grid(int *loop, const int grid_dim_x) {
function dim3 (line 635) | dim3 NhwcBatchNorm::calc_bwd_grid(int *loop, const int grid_dim_x) {
function fwd (line 658) | void NhwcBatchNorm::fwd(cudaStream_t stream, bool use_relu, void* my_dat...
function dgrad (line 697) | void NhwcBatchNorm::dgrad(cudaStream_t stream, bool use_relu, void* my_d...
FILE: KoSentenceT5/apex/contrib/csrc/groupbn/batch_norm_add_relu.h
function class (line 41) | class NhwcBatchNormAddRelu {
function createTensorDescriptor (line 197) | void createTensorDescriptor(cudnnTensorDescriptor_t *descriptor) {
function destroyTensorDescriptor (line 203) | void destroyTensorDescriptor(cudnnTensorDescriptor_t descriptor) {
type StorageType (line 228) | typedef uint16_t StorageType;
function _fwdKernelLauncher (line 262) | void _fwdKernelLauncher(cudaStream_t stream, NhwcBatchNormFwdParams params,
function _bwdKernelLauncher (line 332) | void _bwdKernelLauncher(cudaStream_t stream, NhwcBatchNormBwdParams params,
function smem_driven_bwd_occupancy (line 409) | static int smem_driven_bwd_occupancy(int device_id, const int max_cta_pe...
function std (line 418) | const std::vector<size_t> NhwcBatchNormAddRelu::numWorkspaceBytes() const {
function _setFwdParams (line 456) | void NhwcBatchNormAddRelu::_setFwdParams(NhwcBatchNormFwdParams *params)...
function _setFwdInferenceParams (line 480) | void NhwcBatchNormAddRelu::_setFwdInferenceParams(NhwcBatchNormFwdInfere...
function _setBwdParams (line 494) | void NhwcBatchNormAddRelu::_setBwdParams(NhwcBatchNormBwdParams *params)...
function fwdInference (line 515) | void NhwcBatchNormAddRelu::fwdInference(cudaStream_t stream) {
function dim3 (line 552) | dim3 NhwcBatchNormAddRelu::calc_fwd_grid(int *loop, const int grid_dim_x) {
function dim3 (line 575) | dim3 NhwcBatchNormAddRelu::calc_bwd_grid(int *loop, const int grid_dim_x) {
function fwd (line 598) | void NhwcBatchNormAddRelu::fwd(cudaStream_t stream, void* my_data, void*...
function dgrad (line 640) | void NhwcBatchNormAddRelu::dgrad(cudaStream_t stream, void* my_data, voi...
FILE: KoSentenceT5/apex/contrib/csrc/groupbn/cuda_utils.h
function namespace (line 5) | namespace at {
FILE: KoSentenceT5/apex/contrib/csrc/groupbn/interface.cpp
function PYBIND11_MODULE (line 154) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h
type T (line 43) | typedef T Type;
type Type (line 51) | typedef int Type;
function DEVICE_FUNCTION (line 247) | DEVICE_FUNCTION void write_to_gmem(float *gmem, int idx, const float (&s...
function DEVICE_FUNCTION (line 253) | DEVICE_FUNCTION void write_to_gmem(float *gmem, int idx, const float (&s...
function DEVICE_FUNCTION (line 259) | DEVICE_FUNCTION void scaled_write_to_gmem(float *gmem, int idx, const fl...
function DEVICE_FUNCTION (line 265) | DEVICE_FUNCTION void write_to_smem(float *smem, int idx, const float (&x...
function DEVICE_FUNCTION (line 271) | DEVICE_FUNCTION void write_to_smem(int *smem, int idx, const int (&x)[1]) {
function DEVICE_FUNCTION (line 277) | DEVICE_FUNCTION void write_to_smem(float *smem, int idx, const float (&x...
function DEVICE_FUNCTION (line 283) | DEVICE_FUNCTION void write_to_smem(int *smem, int idx, const int (&x)[2]) {
function Storage (line 351) | Storage relu(Storage in) {
function parallel_sums (line 544) | void parallel_sums(float *smem, float (&x)[ELEMENTS_PER_LDG], int nhw) {
type ParallelSums (line 637) | struct ParallelSums
type ParallelSums (line 650) | struct ParallelSums
function div_up (line 661) | static inline int div_up(int m, int n) {
function DEVICE_FUNCTION (line 668) | DEVICE_FUNCTION void inter_block_sync(int* gmem_retired_ctas, int expect...
type NhwcBatchNormFwdInferenceParams (line 697) | struct NhwcBatchNormFwdInferenceParams {
type NhwcBatchNormFwdParams (line 799) | struct NhwcBatchNormFwdParams {
type PackedStorage (line 870) | typedef PackedStorage<Storage, ELEMENTS_PER_LDG> PackedStorage_;
type typename (line 872) | typedef typename PackedStorage_::Type PackedStorageType;
type NhwcBatchNormBwdParams (line 1388) | struct NhwcBatchNormBwdParams {
function nhwc_batch_norm_bwd (line 1528) | void nhwc_batch_norm_bwd(NhwcBatchNormBwdParams params) {
function nhwc_batch_norm_bwd_relu (line 1892) | void nhwc_batch_norm_bwd_relu(NhwcBatchNormBwdParams params) {
function nhwc_batch_norm_bwd_add_relu (line 2280) | void nhwc_batch_norm_bwd_add_relu(NhwcBatchNormBwdParams params) {
FILE: KoSentenceT5/apex/contrib/csrc/layer_norm/ln_api.cpp
function ln_fwd (line 15) | std::vector<at::Tensor> ln_fwd(const at::Tensor &x, // BxSxhidden_size
function ln_bwd (line 58) | std::vector<at::Tensor> ln_bwd(const at::Tensor &dw, // BxSxhidden_size
function PYBIND11_MODULE (line 102) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout.cpp
type multihead_attn (line 5) | namespace multihead_attn {
type fused_softmax (line 6) | namespace fused_softmax {
type additive_mask_softmax_dropout (line 7) | namespace additive_mask_softmax_dropout {
function fwd (line 31) | std::vector<torch::Tensor> fwd(
function bwd (line 57) | torch::Tensor bwd(
function PYBIND11_MODULE (line 87) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type encdec (line 5) | namespace encdec {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 43) | std::vector<torch::Tensor> fwd(
function bwd (line 88) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 153) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type encdec_norm_add (line 5) | namespace encdec_norm_add {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 52) | std::vector<torch::Tensor> fwd(
function bwd (line 105) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 194) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/layer_norm.h
function rsqrt (line 230) | float rsqrt(float v) {
function rsqrt (line 233) | double rsqrt(double v) {
function float (line 256) | struct SharedMemory <float>
function double (line 266) | struct SharedMemory <double>
function stream (line 653) | auto stream = at::cuda::getCurrentCUDAStream().stream();
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/masked_softmax_dropout.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type fused_softmax (line 5) | namespace fused_softmax {
type mask_softmax_dropout (line 6) | namespace mask_softmax_dropout {
function fwd (line 31) | std::vector<torch::Tensor> fwd(
function bwd (line 57) | torch::Tensor bwd(
function PYBIND11_MODULE (line 89) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/philox.h
function class (line 4) | class Philox {
function __device__ (line 17) | __device__ inline uint4 operator()() {
function __device__ (line 45) | __device__ inline void incr_n(unsigned long long n) {
function __device__ (line 58) | __device__ inline void incr() {
function mulhilo32 (line 67) | __device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
function __device__ (line 72) | __device__ inline uint4 single_round(uint4 ctr, uint2 key) {
function __device__ (line 87) | __device__ __inline__ float4 uniform4(uint4 x) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type self (line 5) | namespace self {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 39) | std::vector<torch::Tensor> fwd(
function bwd (line 75) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 128) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type self_bias (line 5) | namespace self_bias {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 43) | std::vector<torch::Tensor> fwd(
function bwd (line 82) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 135) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask.cpp
type multihead_attn (line 5) | namespace multihead_attn {
type self_bias_additive_mask (line 6) | namespace self_bias_additive_mask {
type cublas_gemmex (line 7) | namespace cublas_gemmex {
function fwd (line 46) | std::vector<torch::Tensor> fwd(
function bwd (line 86) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 139) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type self_norm_add (line 5) | namespace self_norm_add {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 47) | std::vector<torch::Tensor> fwd(
function bwd (line 93) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 169) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/softmax.h
function acc_t (line 139) | acc_t sum[WARP_BATCH] { 0.0f };
function acc_t (line 363) | acc_t sum[WARP_BATCH] { 0.0f };
function additive_masked_softmax_dropout_warp_forward (line 429) | void additive_masked_softmax_dropout_warp_forward(output_t *dst, uint8_t...
function softmax_warp_backward (line 2244) | void softmax_warp_backward(__half *gradInput, const __half *grad, const ...
function masked_softmax_warp_backward (line 2455) | void masked_softmax_warp_backward(__half *gradInput, const __half *grad,...
FILE: KoSentenceT5/apex/contrib/csrc/multihead_attn/strided_batched_gemm.h
function cublasOperation_t (line 21) | cublasOperation_t convertTransToCublasOperation(char trans) {
function CublasStridedBatchedGemm (line 31) | void CublasStridedBatchedGemm(THCState *state, char transa, char transb,...
type cutlass (line 78) | typedef cutlass::gemm::Gemm<WmmaGemmTraits> Gemm;
function gemm_switch_fp32accum (line 149) | void gemm_switch_fp32accum(THCState *state, char transa, char transb, lo...
function adjustLdLevel3 (line 278) | void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int6...
function HgemmStridedBatched (line 312) | void HgemmStridedBatched(THCState *state, char transa, char transb, long...
FILE: KoSentenceT5/apex/contrib/csrc/optimizers/fused_adam_cuda.cpp
function strided_check_finite (line 20) | void strided_check_finite(
function adam (line 29) | void adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tenso...
function reversible_adam (line 43) | void reversible_adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m...
function maybe_adam_undo (line 57) | void maybe_adam_undo(at::Tensor & overflow_flag, at::Tensor & p, at::Ten...
function maybe_cast (line 69) | void maybe_cast(at::Tensor & overflow_flag, at::Tensor & p_in, at::Tenso...
function PYBIND11_MODULE (line 78) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp
function PYBIND11_MODULE (line 19) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp
function PYBIND11_MODULE (line 17) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp
function PYBIND11_MODULE (line 31) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/transducer/transducer_joint.cpp
function transducer_joint_forward (line 33) | std::vector<torch::Tensor> transducer_joint_forward(
function transducer_joint_backward (line 67) | std::vector<torch::Tensor> transducer_joint_backward(
function PYBIND11_MODULE (line 95) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/transducer/transducer_loss.cpp
function transducer_loss_forward (line 35) | std::vector<torch::Tensor> transducer_loss_forward(
function transducer_loss_backward (line 65) | torch::Tensor transducer_loss_backward(
function PYBIND11_MODULE (line 106) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/csrc/xentropy/interface.cpp
function softmax_xentropy_forward (line 24) | std::vector<at::Tensor> softmax_xentropy_forward(
function softmax_xentropy_backward (line 35) | at::Tensor softmax_xentropy_backward(
function PYBIND11_MODULE (line 49) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSentenceT5/apex/contrib/fmha/fmha.py
class FMHAFun (line 33) | class FMHAFun(torch.autograd.Function):
method forward (line 35) | def forward(ctx, qkv, cu_seqlens, p_dropout, max_s, is_training):
method backward (line 48) | def backward(ctx, dout):
class FMHA (line 58) | class FMHA(torch.nn.Module):
method __init__ (line 60) | def __init__(self, config):
method forward (line 70) | def forward(self, qkv, cu_seqlens, max_s, is_training=True):
FILE: KoSentenceT5/apex/contrib/groupbn/batch_norm.py
class bn_NHWC_impl (line 7) | class bn_NHWC_impl(torch.autograd.Function):
method forward (line 9) | def forward(ctx, x, s, b, rm, riv, mini_m, mini_riv, ret_cta, mom, eps...
method backward (line 32) | def backward(ctx, grad_y):
class bn_addrelu_NHWC_impl (line 53) | class bn_addrelu_NHWC_impl(torch.autograd.Function):
method forward (line 55) | def forward(ctx, x, z, s, b, rm, riv, mini_m, mini_riv, grid_dim_y, re...
method backward (line 78) | def backward(ctx, grad_y):
class BatchNorm2d_NHWC (line 101) | class BatchNorm2d_NHWC(_BatchNorm):
method __init__ (line 103) | def __init__(self, num_features, fuse_relu=False, bn_group=1, max_cta_...
method forward (line 196) | def forward(self, x, z=None):
method __del__ (line 219) | def __del__(self):
FILE: KoSentenceT5/apex/contrib/layer_norm/layer_norm.py
class FastLayerNormFN (line 6) | class FastLayerNormFN(torch.autograd.Function):
method forward (line 8) | def forward(ctx, x, gamma, beta, epsilon):
method backward (line 19) | def backward(ctx, dy):
class FastLayerNorm (line 31) | class FastLayerNorm(torch.nn.Module):
method __init__ (line 32) | def __init__(self, hidden_size, eps=1e-5):
method reset_parameters (line 39) | def reset_parameters(self):
method forward (line 43) | def forward(self, x):
FILE: KoSentenceT5/apex/contrib/multihead_attn/encdec_multihead_attn.py
function jit_dropout_add (line 19) | def jit_dropout_add(x, residual, prob, is_training):
class EncdecMultiheadAttn (line 26) | class EncdecMultiheadAttn(nn.Module):
method __init__ (line 31) | def __init__(self, embed_dim, num_heads, dropout=0., bias=False, inclu...
method reset_parameters (line 79) | def reset_parameters(self):
method forward (line 98) | def forward(self, query, key, value, key_padding_mask=None, need_weigh...
FILE: KoSentenceT5/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
class EncdecAttnFunc (line 5) | class EncdecAttnFunc(torch.autograd.Function):
method forward (line 7) | def forward(ctx, use_time_mask, is_training, heads, scale, inputs_q, i...
method backward (line 135) | def backward(ctx, output_grads):
FILE: KoSentenceT5/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
class FastEncdecAttnFunc (line 5) | class FastEncdecAttnFunc(torch.autograd.Function):
method forward (line 7) | def forward(ctx, use_time_mask, is_training, heads, inputs_q, inputs_k...
method backward (line 50) | def backward(ctx, output_grads):
FILE: KoSentenceT5/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
class FastEncdecAttnNormAddFunc (line 12) | class FastEncdecAttnNormAddFunc(torch.autograd.Function):
method forward (line 14) | def forward(ctx, use_time_mask, is_training, heads, inputs_q, inputs_k...
method backward (line 69) | def backward(ctx, output_grads):
FILE: KoSentenceT5/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
class FastSelfAttnFunc (line 6) | class FastSelfAttnFunc(torch.autograd.Function) :
method forward (line 8) | def forward(ctx, use_time_mask, is_training, heads, inputs, input_weig...
method backward (line 120) | def backward(ctx, output_grads):
FILE: KoSentenceT5/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
class FastSelfAttnNormAddFunc (line 5) | class FastSelfAttnNormAddFunc(torch.autograd.Function):
method forward (line 7) | def forward(ctx, use_time_mask, is_training, heads, inputs, lyr_nrm_ga...
method backward (line 56) | def backward(ctx, output_grads):
FILE: KoSentenceT5/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
class MaskSoftmaxDropout (line 6) | class MaskSoftmaxDropout(torch.autograd.Function) :
method forward (line 8) | def forward(ctx, is_training, heads, inputs, pad_mask, mask_additive, ...
method backward (line 51) | def backward(ctx, output_grads):
FILE: KoSentenceT5/apex/contrib/multihead_attn/self_multihead_attn.py
function jit_dropout_add (line 19) | def jit_dropout_add(x, residual, prob, is_training):
class SelfMultiheadAttn (line 26) | class SelfMultiheadAttn(nn.Module):
method __init__ (line 31) | def __init__(self, embed_dim, num_heads, dropout=0., bias=False, inclu...
method reset_parameters (line 97) | def reset_parameters(self):
method forward (line 124) | def forward(self, query, key, value, key_padding_mask=None, need_weigh...
FILE: KoSentenceT5/apex/contrib/multihead_attn/self_multihead_attn_func.py
class SelfAttnFunc (line 4) | class SelfAttnFunc(torch.autograd.Function):
method forward (line 6) | def forward(ctx, use_time_mask, is_training, heads, scale, inputs,
method backward (line 121) | def backward(ctx, output_grads):
FILE: KoSentenceT5/apex/contrib/optimizers/distributed_fused_adam.py
class DistributedFusedAdam (line 9) | class DistributedFusedAdam(torch.optim.Optimizer):
method __init__ (line 55) | def __init__(self, params,
method _first_step_init (line 128) | def _first_step_init(self):
method _init_everything (line 373) | def _init_everything(self):
method set_last_step (line 378) | def set_last_step(self, last_step):
method _get_flush_block (line 381) | def _get_flush_block(self):
method _pipeline_block_reductions (line 397) | def _pipeline_block_reductions(self, block_id):
method __launch_step_kernel (line 443) | def __launch_step_kernel(self):
method _pipeline_step (line 469) | def _pipeline_step(self):
method _flatten_grad_mt (line 479) | def _flatten_grad_mt(self, scale):
method _do_overlapped_reduction (line 489) | def _do_overlapped_reduction(self, param_i, param_grads_size, param_of...
method set_global_scale (line 504) | def set_global_scale(self, global_scale):
method global_scale (line 510) | def global_scale(self):
method has_overflow (line 514) | def has_overflow(self):
method peek_overflow (line 523) | def peek_overflow(self):
method strided_check_finite (line 529) | def strided_check_finite(self, output_params, stride=1, start=-1, end=...
method L2_grad_norm (line 545) | def L2_grad_norm(self):
method complete_reductions (line 552) | def complete_reductions(self):
method step (line 577) | def step(self, closure=None):
method state_dict (line 598) | def state_dict(self):
method load_state_dict (line 615) | def load_state_dict(self, state_dict):
FILE: KoSentenceT5/apex/contrib/optimizers/distributed_fused_adam_v2.py
class DistributedFusedAdamV2 (line 7) | class DistributedFusedAdamV2(torch.optim.Optimizer):
method __init__ (line 43) | def __init__(self, params,
method set_last_step (line 351) | def set_last_step(self, last_step):
method _get_flush_block (line 354) | def _get_flush_block(self):
method _pipeline_block_reductions (line 370) | def _pipeline_block_reductions(self, block_id):
method __launch_step_kernel (line 406) | def __launch_step_kernel(self, p, p_copy, m, v, g):
method _pipeline_block_step (line 425) | def _pipeline_block_step(self, block_id):
method _pipeline_step (line 445) | def _pipeline_step(self):
method _flatten_grad_mt (line 460) | def _flatten_grad_mt(self, scale):
method _do_overlapped_reduction (line 470) | def _do_overlapped_reduction(self, param_i, param_grads_size, param_of...
method set_global_scale (line 487) | def set_global_scale(self, global_scale):
method global_scale (line 493) | def global_scale(self):
method has_overflow (line 497) | def has_overflow(self):
method peek_overflow (line 506) | def peek_overflow(self):
method strided_check_finite (line 512) | def strided_check_finite(self, output_params, stride=1, start=-1, end=...
method L2_grad_norm (line 528) | def L2_grad_norm(self):
method complete_reductions (line 535) | def complete_reductions(self):
method revert_step (line 560) | def revert_step(self):
method step (line 586) | def step(self, closure=None, skip_overflow_check=False):
FILE: KoSentenceT5/apex/contrib/optimizers/distributed_fused_adam_v3.py
class DistributedFusedAdamV3 (line 7) | class DistributedFusedAdamV3(torch.optim.Optimizer):
method __init__ (line 43) | def __init__(self, params,
method has_overflow (line 196) | def has_overflow(self):
method set_last_step (line 199) | def set_last_step(self, last_step):
method _get_flush_block (line 202) | def _get_flush_block(self):
method __launch_step_kernel (line 218) | def __launch_step_kernel(self, p, p_copy, m, v, g):
method _flatten_grad_mt (line 237) | def _flatten_grad_mt(self, scale):
method _do_overlapped_reduction (line 247) | def _do_overlapped_reduction(self, param_i, param_grads_size, param_of...
method set_global_scale (line 268) | def set_global_scale(self, global_scale):
method global_scale (line 274) | def global_scale(self):
method L2_grad_norm (line 278) | def L2_grad_norm(self):
method complete_reductions (line 282) | def complete_reductions(self):
method step (line 306) | def step(self, closure=None, skip_overflow_check=False):
FILE: KoSentenceT5/apex/contrib/optimizers/distributed_fused_lamb.py
class DistributedFusedLAMB (line 9) | class DistributedFusedLAMB(torch.optim.Optimizer):
class AtomicCounter (line 70) | class AtomicCounter(object):
method __init__ (line 71) | def __init__(self):
method add (line 77) | def add(self, idx):
method __init__ (line 82) | def __init__(self, params,
method _lazy_init_stage1 (line 210) | def _lazy_init_stage1(self):
method _lazy_init_stage2 (line 330) | def _lazy_init_stage2(self):
method set_is_accumulation_step (line 451) | def set_is_accumulation_step(self, is_accumulation_step):
method set_last_step (line 454) | def set_last_step(self, last_step):
method _get_flush_block (line 457) | def _get_flush_block(self):
method _pipeline_block_reductions (line 473) | def _pipeline_block_reductions(self, block_id):
method __compute_contrib_param_norm (line 556) | def __compute_contrib_param_norm(self):
method __compute_contrib_update_norm (line 569) | def __compute_contrib_update_norm(self):
method _pipeline_step (line 577) | def _pipeline_step(self):
method _flatten_grad_mt (line 633) | def _flatten_grad_mt(self, scale):
method _do_overlapped_reduction (line 651) | def _do_overlapped_reduction(self, param_i, param):
method set_global_scale (line 667) | def set_global_scale(self, global_scale):
method global_scale (line 673) | def global_scale(self):
method L2_grad_norm (line 677) | def L2_grad_norm(self):
method complete_reductions (line 681) | def complete_reductions(self):
method step (line 704) | def step(self, closure=None, grad_scaler=None):
method state_dict (line 740) | def state_dict(self):
method load_state_dict (line 757) | def load_state_dict(self, state_dict):
FILE: KoSentenceT5/apex/contrib/optimizers/fp16_optimizer.py
class FP16_Optimizer (line 4) | class FP16_Optimizer(object):
method __init__ (line 25) | def __init__(self,
method zero_grad (line 79) | def zero_grad(self, set_grads_to_None=True):
method step (line 94) | def step(self, closure=None):
method backward (line 132) | def backward(self, loss):
method _update_scale (line 142) | def _update_scale(self, skip):
method _get_state (line 161) | def _get_state(self):
method _set_state (line 164) | def _set_state(self, value):
method _get_param_groups (line 171) | def _get_param_groups(self):
method _set_param_groups (line 174) | def _set_param_groups(self, value):
method state_dict (line 179) | def state_dict(self):
method load_state_dict (line 202) | def load_state_dict(self, state_dict):
FILE: KoSentenceT5/apex/contrib/optimizers/fused_adam.py
class FusedAdam (line 6) | class FusedAdam(torch.optim.Optimizer):
method __init__ (line 38) | def __init__(self, params,
method step (line 64) | def step(self, closure=None, grads=None, output_params=None, scale=1.,...
FILE: KoSentenceT5/apex/contrib/optimizers/fused_lamb.py
class FusedLAMB (line 6) | class FusedLAMB(torch.optim.Optimizer):
method __init__ (line 63) | def __init__(self, params, lr=1e-3, bias_correction=True,
method zero_grad (line 87) | def zero_grad(self):
method step (line 95) | def step(self, closure=None):
FILE: KoSentenceT5/apex/contrib/optimizers/fused_sgd.py
class FusedSGD (line 7) | class FusedSGD(Optimizer):
method __init__ (line 66) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 93) | def __setstate__(self, state):
method get_momentums (line 98) | def get_momentums(self, params):
method step (line 115) | def step(self, closure=None, grads=None, output_params=None, scale=1.,...
FILE: KoSentenceT5/apex/contrib/sparsity/asp.py
function eligible_modules (line 12) | def eligible_modules(model, whitelist_layer_types, allowed_layer_names, ...
class ASP (line 21) | class ASP:
method init_model_for_pruning (line 29) | def init_model_for_pruning(cls, model, mask_calculator="m4n2_1d",
method init_optimizer_for_pruning (line 127) | def init_optimizer_for_pruning(cls, optimizer):
method compute_sparse_masks (line 155) | def compute_sparse_masks(cls):
method restore_pruned_weights (line 176) | def restore_pruned_weights(cls):
method is_sparsity_enabled (line 191) | def is_sparsity_enabled(cls):
method prune_trained_model (line 212) | def prune_trained_model(cls, model, optimizer):
FILE: KoSentenceT5/apex/contrib/sparsity/sparse_masklib.py
function fill (line 9) | def fill(x):
function reshape_1d (line 13) | def reshape_1d(matrix, m):
function compute_valid_1d_patterns (line 25) | def compute_valid_1d_patterns(m,n):
function mn_1d_best (line 37) | def mn_1d_best(matrix, m, n):
function m4n2_1d (line 49) | def m4n2_1d(mat, density):
function mn_2d_greedy (line 67) | def mn_2d_greedy(matrix, m, n):
function m4n2_2d_greedy (line 98) | def m4n2_2d_greedy(mat, density):
function compute_valid_2d_patterns (line 103) | def compute_valid_2d_patterns(m,n):
function mn_2d_best (line 122) | def mn_2d_best(matrix, m, n):
function m4n2_2d_best (line 140) | def m4n2_2d_best(mat, density):
function create_mask (line 145) | def create_mask(tensor, pattern="m4n2_1d", density=0.5):
FILE: KoSentenceT5/apex/contrib/sparsity/test/checkpointing_test_part1.py
function build_model (line 7) | def build_model(args):
function train_step (line 21) | def train_step(args, model, optimizer, input_batch, target_batch, step):
function train_loop (line 31) | def train_loop(args, model, optimizer, step, num_steps):
function main (line 38) | def main(args):
class Args (line 76) | class Args:
FILE: KoSentenceT5/apex/contrib/sparsity/test/checkpointing_test_part2.py
function build_model (line 7) | def build_model(args):
function train_step (line 21) | def train_step(args, model, optimizer, input_batch, target_batch, step):
function train_loop (line 31) | def train_loop(args, model, optimizer, step, num_steps):
function main (line 38) | def main(step, args, model_state_dict, optimizer_state_dict):
class Args (line 61) | class Args:
FILE: KoSentenceT5/apex/contrib/sparsity/test/checkpointing_test_reference.py
function build_model (line 11) | def build_model(args):
function train_step (line 25) | def train_step(args, model, optimizer, input_batch, target_batch, step):
function train_loop (line 35) | def train_loop(args, model, optimizer, step, num_steps):
function main (line 42) | def main(args):
class Args (line 79) | class Args:
FILE: KoSentenceT5/apex/contrib/sparsity/test/toy_problem.py
function build_model (line 7) | def build_model(args):
function train_step (line 21) | def train_step(args, model, optimizer, input_batch, target_batch, step):
function train_loop (line 31) | def train_loop(args, model, optimizer, step, num_steps):
function main (line 38) | def main(args):
class Args (line 75) | class Args:
FILE: KoSentenceT5/apex/contrib/test/fmha/test_fmha.py
function py_mha (line 37) | def py_mha(qkv, amask, b, s, h, d):
class TestFMHA (line 52) | class TestFMHA(unittest.TestCase):
method run_test (line 54) | def run_test(self, s, b):
method test_128 (line 106) | def test_128(self):
method test_256 (line 109) | def test_256(self):
method test_384 (line 112) | def test_384(self):
method test_512 (line 115) | def test_512(self):
FILE: KoSentenceT5/apex/contrib/test/layer_norm/test_fast_layer_norm.py
class GPUTimer (line 12) | class GPUTimer:
method __init__ (line 13) | def __init__(self, stream):
method start (line 17) | def start(self):
method stop (line 19) | def stop(self):
method sync (line 21) | def sync(self):
method millis (line 23) | def millis(self):
function size_in_bytes (line 26) | def size_in_bytes(t):
function abs_err (line 28) | def abs_err(x, y):
class TestFastLayerNorm (line 35) | class TestFastLayerNorm(unittest.TestCase):
method setUp (line 37) | def setUp(self, seed=1234):
method test_ln_fp32 (line 42) | def test_ln_fp32(self):
method test_ln_fp16 (line 44) | def test_ln_fp16(self):
method run_test_layer_norm (line 47) | def run_test_layer_norm(self, dtype, atol, rtol=1e-5):
method test_performance (line 94) | def test_performance(self):
FILE: KoSentenceT5/apex/contrib/test/multihead_attn/test_encdec_multihead_attn.py
class EncdecMultiheadAttnTest (line 7) | class EncdecMultiheadAttnTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_encdec_multihead_attn (line 49) | def test_encdec_multihead_attn(self) :
method test_encdec_multihead_attn_time_mask (line 76) | def test_encdec_multihead_attn_time_mask(self) :
method test_encdec_multihead_attn_pad_mask (line 105) | def test_encdec_multihead_attn_pad_mask(self) :
FILE: KoSentenceT5/apex/contrib/test/multihead_attn/test_encdec_multihead_attn_norm_add.py
class EncdecMultiheadAttnNormAddTest (line 7) | class EncdecMultiheadAttnNormAddTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_encdec_multihead_attn_norm_add (line 49) | def test_encdec_multihead_attn_norm_add(self) :
FILE: KoSentenceT5/apex/contrib/test/multihead_attn/test_fast_self_multihead_attn_bias.py
class SelfMultiheadAttnTest (line 7) | class SelfMultiheadAttnTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_self_multihead_attn_additive_mask (line 48) | def test_self_multihead_attn_additive_mask(self) :
FILE: KoSentenceT5/apex/contrib/test/multihead_attn/test_mha_fused_softmax.py
class FusedSoftmaxTest (line 6) | class FusedSoftmaxTest(unittest.TestCase):
method setUp (line 7) | def setUp(self, seed=1234):
method test_fused_softmax (line 24) | def test_fused_softmax(self) :
FILE: KoSentenceT5/apex/contrib/test/multihead_attn/test_self_multihead_attn.py
class SelfMultiheadAttnTest (line 7) | class SelfMultiheadAttnTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_self_multihead_attn (line 45) | def test_self_multihead_attn(self) :
method test_self_multihead_attn_time_mask (line 71) | def test_self_multihead_attn_time_mask(self) :
method test_self_multihead_attn_pad_mask (line 100) | def test_self_multihead_attn_pad_mask(self) :
FILE: KoSentenceT5/apex/contrib/test/multihead_attn/test_self_multihead_attn_norm_add.py
class SelfMultiheadAttnNormAddTest (line 7) | class SelfMultiheadAttnNormAddTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_self_multihead_attn_norm_add (line 45) | def test_self_multihead_attn_norm_add(self) :
FILE: KoSentenceT5/apex/contrib/test/test_label_smoothing.py
function label_smoothing_raw (line 10) | def label_smoothing_raw(x, target, padding_idx, smoothing):
function label_smoothing_opt_1 (line 20) | def label_smoothing_opt_1(x, target, padding_idx, smoothing):
class LabelSmoothingTest (line 30) | class LabelSmoothingTest(unittest.TestCase):
method setUp (line 31) | def setUp(self, seed=1234):
method gen_test_inputs (line 40) | def gen_test_inputs(self, N, T, H, smoothing, padding_idx):
method print_max_diff_elem (line 50) | def print_max_diff_elem(self, ref, tst):
method test_label_smoothing_function (line 57) | def test_label_smoothing_function(self):
method test_label_smoothing_perf (line 91) | def test_label_smoothing_perf(self):
FILE: KoSentenceT5/apex/contrib/test/transducer/test_transducer_joint.py
class TransducerJointTest (line 6) | class TransducerJointTest(unittest.TestCase):
method setUp (line 7) | def setUp(self, seed=1234):
method gen_input (line 11) | def gen_input(self, for_vector_kernel):
method _pack (line 41) | def _pack(self, x, f_len, g_len):
method _unpack (line 53) | def _unpack(self, x, f_len, g_len):
method run_transducer_joint (line 67) | def run_transducer_joint(self, for_vector_kernel, pack_output, relu, d...
method test_transducer_joint (line 118) | def test_transducer_joint(self):
method test_transducer_joint_vec (line 121) | def test_transducer_joint_vec(self):
method test_transducer_joint_pack (line 124) | def test_transducer_joint_pack(self):
method test_transducer_joint_vec_pack (line 127) | def test_transducer_joint_vec_pack(self):
method test_transducer_joint_relu (line 130) | def test_transducer_joint_relu(self):
method test_transducer_joint_vec_relu (line 133) | def test_transducer_joint_vec_relu(self):
method test_transducer_joint_pack_relu (line 136) | def test_transducer_joint_pack_relu(self):
method test_transducer_joint_vec_pack_relu (line 139) | def test_transducer_joint_vec_pack_relu(self):
method test_transducer_joint_relu_dropout (line 142) | def test_transducer_joint_relu_dropout(self):
method test_transducer_joint_vec_relu_dropout (line 145) | def test_transducer_joint_vec_relu_dropout(self):
method test_transducer_joint_pack_relu_dropout (line 148) | def test_transducer_joint_pack_relu_dropout(self):
method test_transducer_joint_vec_pack_relu_dropout (line 151) | def test_transducer_joint_vec_pack_relu_dropout(self):
FILE: KoSentenceT5/apex/contrib/test/transducer/test_transducer_loss.py
class TransducerLossTest (line 6) | class TransducerLossTest(unittest.TestCase):
method setUp (line 7) | def setUp(self, seed=1234):
method gen_input (line 11) | def gen_input(self, scalar_t, for_vector_kernel):
method _pack (line 41) | def _pack(self, x):
method _unpack (line 52) | def _unpack(self, x):
method run_transducer_loss (line 64) | def run_transducer_loss(self, scalar_t, fuse_softmax_backward, packed_...
method test_transducer_loss_fp32 (line 90) | def test_transducer_loss_fp32(self):
method test_transducer_loss_fp16 (line 98) | def test_transducer_loss_fp16(self):
method test_transducer_loss_fp16_backward_fusion (line 106) | def test_transducer_loss_fp16_backward_fusion(self):
method test_transducer_loss_fp16_backward_fusion_packed (line 114) | def test_transducer_loss_fp16_backward_fusion_packed(self):
method test_transducer_loss_fp16_backward_fusion_packed_vec (line 122) | def test_transducer_loss_fp16_backward_fusion_packed_vec(self):
FILE: KoSentenceT5/apex/contrib/test/transducer/transducer_ref.py
function transducer_loss_reference (line 5) | def transducer_loss_reference(x, label, f_len, y_len, blank_idx, loss_gr...
function transducer_joint_reference (line 79) | def transducer_joint_reference(f, g, h_grad, f_len, g_len, pack_output, ...
FILE: KoSentenceT5/apex/contrib/transducer/transducer.py
class TransducerJoint (line 5) | class TransducerJoint(torch.nn.Module):
method __init__ (line 27) | def __init__(self, pack_output=False, relu=False, dropout=False, opt=1...
method forward (line 43) | def forward(self, f, g, f_len, g_len, batch_offset=None, packed_batch=0):
class TransducerLoss (line 68) | class TransducerLoss(torch.nn.Module):
method __init__ (line 81) | def __init__(self, fuse_softmax_backward=True, opt=1, packed_input=Fal...
method forward (line 89) | def forward(self, x, label, f_len, y_len, blank_idx, batch_offset=None...
class TransducerLossFunc (line 127) | class TransducerLossFunc(torch.autograd.Function):
method forward (line 129) | def forward(ctx, x, label, f_len, y_len, batch_offset, max_f_len, blan...
method backward (line 149) | def backward(ctx, loss_grad):
class TransducerJointFunc (line 158) | class TransducerJointFunc(torch.autograd.Function):
method forward (line 160) | def forward(ctx, f, g, f_len, g_len, pack_output, relu, dropout, batch...
method backward (line 180) | def backward(ctx, loss_grad):
FILE: KoSentenceT5/apex/contrib/xentropy/softmax_xentropy.py
class SoftmaxCrossEntropyLoss (line 4) | class SoftmaxCrossEntropyLoss(torch.autograd.Function):
method forward (line 6) | def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to...
method backward (line 18) | def backward(ctx, grad_loss):
FILE: KoSentenceT5/apex/fp16_utils/fp16_optimizer.py
class FP16_Optimizer (line 13) | class FP16_Optimizer(object):
method __init__ (line 14) | def __init__(self,
method maybe_print (line 110) | def maybe_print(self, msg):
method __getstate__ (line 114) | def __getstate__(self):
method __setstate__ (line 117) | def __setstate__(self, state):
method zero_grad (line 120) | def zero_grad(self, set_grads_to_None=False):
method _master_params_to_model_params (line 160) | def _master_params_to_model_params(self):
method clip_master_grads (line 185) | def clip_master_grads(self, max_norm, norm_type=2):
method state_dict (line 209) | def state_dict(self):
method load_state_dict (line 230) | def load_state_dict(self, state_dict):
method step (line 272) | def step(self, closure=None): # could add clip option.
method _step_with_closure (line 334) | def _step_with_closure(self, closure):
method backward (line 373) | def backward(self, loss, update_master_grads=True, retain_graph=False):
method update_master_grads (line 436) | def update_master_grads(self):
method inspect_master_grad_data (line 493) | def inspect_master_grad_data(self):
method _get_loss_scale (line 528) | def _get_loss_scale(self):
method _set_loss_scale (line 531) | def _set_loss_scale(self, value):
method _get_state (line 537) | def _get_state(self):
method _set_state (line 540) | def _set_state(self, value):
method _get_param_groups (line 547) | def _get_param_groups(self):
method _set_param_groups (line 550) | def _set_param_groups(self, value):
FILE: KoSentenceT5/apex/fp16_utils/fp16util.py
class tofp16 (line 7) | class tofp16(nn.Module):
method __init__ (line 15) | def __init__(self):
method forward (line 18) | def forward(self, input):
function BN_convert_float (line 22) | def BN_convert_float(module):
function network_to_half (line 35) | def network_to_half(network):
function convert_module (line 44) | def convert_module(module, dtype):
function convert_network (line 60) | def convert_network(network, dtype):
class FP16Model (line 73) | class FP16Model(nn.Module):
method __init__ (line 78) | def __init__(self, network):
method forward (line 82) | def forward(self, *inputs):
function backwards_debug_hook (line 87) | def backwards_debug_hook(grad):
function prep_param_lists (line 90) | def prep_param_lists(model, flat_master=False):
function model_grads_to_master_grads (line 136) | def model_grads_to_master_grads(model_params, master_params, flat_master...
function master_params_to_model_params (line 158) | def master_params_to_model_params(model_params, master_params, flat_mast...
function to_python_float (line 176) | def to_python_float(t):
FILE: KoSentenceT5/apex/fp16_utils/loss_scaler.py
function to_python_float (line 4) | def to_python_float(t):
class LossScaler (line 10) | class LossScaler:
method __init__ (line 22) | def __init__(self, scale=1):
method has_overflow (line 26) | def has_overflow(self, params):
method _has_inf_or_nan (line 30) | def _has_inf_or_nan(x):
method update_scale (line 33) | def update_scale(self, overflow):
method loss_scale (line 37) | def loss_scale(self):
method scale_gradient (line 40) | def scale_gradient(self, module, grad_in, grad_out):
method backward (line 43) | def backward(self, loss, retain_graph=False):
class DynamicLossScaler (line 47) | class DynamicLossScaler:
method __init__ (line 73) | def __init__(self,
method has_overflow (line 84) | def has_overflow(self, params):
method _has_inf_or_nan (line 92) | def _has_inf_or_nan(x):
method update_scale (line 113) | def update_scale(self, overflow):
method loss_scale (line 124) | def loss_scale(self):
method scale_gradient (line 127) | def scale_gradient(self, module, grad_in, grad_out):
method backward (line 130) | def backward(self, loss, retain_graph=False):
FILE: KoSentenceT5/apex/mlp/mlp.py
class MlpFunction (line 8) | class MlpFunction(torch.autograd.Function):
method forward (line 10) | def forward(ctx, bias, activation, *args):
method backward (line 19) | def backward(ctx, grad_o):
class MLP (line 26) | class MLP(torch.nn.Module):
method __init__ (line 34) | def __init__(self, mlp_sizes, bias=True, activation='relu'):
method reset_parameters (line 64) | def reset_parameters(self):
method forward (line 74) | def forward(self, input):
method extra_repr (line 77) | def extra_repr(self):
FILE: KoSentenceT5/apex/multi_tensor_apply/multi_tensor_apply.py
class MultiTensorApply (line 3) | class MultiTensorApply(object):
method __init__ (line 7) | def __init__(self, chunk_size):
method check_avail (line 16) | def check_avail(self):
method __call__ (line 24) | def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
FILE: KoSentenceT5/apex/normalization/fused_layer_norm.py
class FusedLayerNormAffineFunction (line 12) | class FusedLayerNormAffineFunction(torch.autograd.Function):
method forward (line 15) | def forward(ctx, input, weight, bias, normalized_shape, eps):
method backward (line 30) | def backward(ctx, grad_output):
class FusedLayerNormFunction (line 39) | class FusedLayerNormFunction(torch.autograd.Function):
method forward (line 42) | def forward(ctx, input, normalized_shape, eps):
method backward (line 55) | def backward(ctx, grad_output):
function fused_layer_norm_affine (line 64) | def fused_layer_norm_affine(input, normalized_shape, weight, bias, eps=1...
function fused_layer_norm (line 67) | def fused_layer_norm(input, normalized_shape, eps=1e-6):
class FusedLayerNorm (line 70) | class FusedLayerNorm(torch.nn.Module):
method __init__ (line 129) | def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
method reset_parameters (line 148) | def reset_parameters(self):
method forward (line 153) | def forward(self, input):
method extra_repr (line 163) | def extra_repr(self):
FILE: KoSentenceT5/apex/optimizers/fused_adagrad.py
class FusedAdagrad (line 5) | class FusedAdagrad(torch.optim.Optimizer):
method __init__ (line 43) | def __init__(self, params, lr=1e-2, eps=1e-10,
method zero_grad (line 59) | def zero_grad(self):
method step (line 67) | def step(self, closure=None):
FILE: KoSentenceT5/apex/optimizers/fused_adam.py
class FusedAdam (line 4) | class FusedAdam(torch.optim.Optimizer):
method __init__ (line 63) | def __init__(self, params, lr=1e-3, bias_correction=True,
method zero_grad (line 82) | def zero_grad(self):
method step (line 90) | def step(self, closure=None, grads=None, output_params=None, scale=Non...
FILE: KoSentenceT5/apex/optimizers/fused_lamb.py
class FusedLAMB (line 4) | class FusedLAMB(torch.optim.Optimizer):
method __init__ (line 63) | def __init__(self, params, lr=1e-3, bias_correction=True,
method zero_grad (line 88) | def zero_grad(self):
method step (line 96) | def step(self, closure=None):
FILE: KoSentenceT5/apex/optimizers/fused_novograd.py
class FusedNovoGrad (line 4) | class FusedNovoGrad(torch.optim.Optimizer):
method __init__ (line 67) | def __init__(self, params, lr=1e-3, bias_correction=True,
method zero_grad (line 92) | def zero_grad(self):
method load_state_dict (line 100) | def load_state_dict(self, state_dict):
method step (line 108) | def step(self, closure=None):
FILE: KoSentenceT5/apex/optimizers/fused_sgd.py
class FusedSGD (line 6) | class FusedSGD(Optimizer):
method __init__ (line 76) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 108) | def __setstate__(self, state):
method zero_grad (line 113) | def zero_grad(self):
method get_momentums (line 121) | def get_momentums(self, params):
method step (line 138) | def step(self, closure=None):
FILE: KoSentenceT5/apex/parallel/LARC.py
class LARC (line 5) | class LARC(object):
method __init__ (line 39) | def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1...
method __getstate__ (line 45) | def __getstate__(self):
method __setstate__ (line 48) | def __setstate__(self, state):
method state (line 52) | def state(self):
method __repr__ (line 55) | def __repr__(self):
method param_groups (line 59) | def param_groups(self):
method param_groups (line 63) | def param_groups(self, value):
method state_dict (line 66) | def state_dict(self):
method load_state_dict (line 69) | def load_state_dict(self, state_dict):
method zero_grad (line 72) | def zero_grad(self):
method add_param_group (line 75) | def add_param_group(self, param_group):
method step (line 78) | def step(self):
FILE: KoSentenceT5/apex/parallel/__init__.py
function convert_syncbn_model (line 21) | def convert_syncbn_model(module, process_group=None, channel_last=False):
function create_syncbn_process_group (line 58) | def create_syncbn_process_group(group_size):
FILE: KoSentenceT5/apex/parallel/distributed.py
function import_flatten_impl (line 13) | def import_flatten_impl():
function flatten (line 25) | def flatten(bucket):
function unflatten (line 30) | def unflatten(coalesced, bucket):
function apply_flat_dist_call (line 36) | def apply_flat_dist_call(bucket, call, extra_args=None):
function split_half_float_double (line 51) | def split_half_float_double(tensors):
function split_by_type (line 60) | def split_by_type(tensors):
function flat_dist_call (line 70) | def flat_dist_call(tensors, call, extra_args=None):
function extract_tensors (line 78) | def extract_tensors(maybe_tensor, tensor_list):
class Reducer (line 89) | class Reducer(object):
method __init__ (line 111) | def __init__(self, module_or_grads_list):
method reduce (line 121) | def reduce(self):
class DistributedDataParallel (line 129) | class DistributedDataParallel(Module):
method __init__ (line 162) | def __init__(self,
method __setstate__ (line 256) | def __setstate__(self, state):
method __getstate__ (line 268) | def __getstate__(self):
method enable_allreduce (line 275) | def enable_allreduce(self):
method disable_allreduce (line 278) | def disable_allreduce(self):
method sync_bucket_structure (line 283) | def sync_bucket_structure(self):
method create_hooks (line 319) | def create_hooks(self):
method _stream_this_bucket (line 411) | def _stream_this_bucket(self, bucket_idx):
method _event_this_bucket (line 418) | def _event_this_bucket(self, bucket_idx):
method allreduce_bucket (line 425) | def allreduce_bucket(self, bucket, bucket_idx, force_default_stream):
method allreduce_maybe_retain (line 478) | def allreduce_maybe_retain(self, bucket, bucket_idx, force_default_str...
method allreduce_fallback (line 491) | def allreduce_fallback(self):
method comm_ready_buckets (line 513) | def comm_ready_buckets(self, param):
method forward (line 559) | def forward(self, *inputs, **kwargs):
FILE: KoSentenceT5/apex/parallel/multiproc.py
function docstring_hack (line 5) | def docstring_hack():
FILE: KoSentenceT5/apex/parallel/optimized_sync_batchnorm.py
class SyncBatchNorm (line 9) | class SyncBatchNorm(_BatchNorm):
method __init__ (line 58) | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, ...
method _specify_process_group (line 64) | def _specify_process_group(self, process_group):
method _specify_channel_last (line 67) | def _specify_channel_last(self, channel_last):
method forward (line 70) | def forward(self, input, z = None):
FILE: KoSentenceT5/apex/parallel/optimized_sync_batchnorm_kernel.py
class SyncBatchnormFunction (line 7) | class SyncBatchnormFunction(Function):
method forward (line 10) | def forward(ctx, input, z, weight, bias, running_mean, running_varianc...
method backward (line 75) | def backward(ctx, grad_output):
FILE: KoSentenceT5/apex/parallel/sync_batchnorm.py
class SyncBatchNorm (line 9) | class SyncBatchNorm(_BatchNorm):
method __init__ (line 51) | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, ...
method _specify_process_group (line 65) | def _specify_process_group(self, process_group):
method forward (line 68) | def forward(self, input):
FILE: KoSentenceT5/apex/parallel/sync_batchnorm_kernel.py
class SyncBatchnormFunction (line 7) | class SyncBatchnormFunction(Function):
method forward (line 10) | def forward(ctx, input, weight, bias, running_mean, running_variance, ...
method backward (line 33) | def backward(ctx, grad_output):
FILE: KoSentenceT5/apex/pyprof/examples/custom_func_module/custom_function.py
class Foo (line 9) | class Foo(torch.autograd.Function):
method forward (line 11) | def forward(ctx, in1, in2):
method backward (line 16) | def backward(ctx, grad):
FILE: KoSentenceT5/apex/pyprof/examples/custom_func_module/custom_module.py
class Foo (line 8) | class Foo(torch.nn.Module):
method __init__ (line 9) | def __init__(self, size):
method forward (line 14) | def forward(self, input):
FILE: KoSentenceT5/apex/pyprof/examples/imagenet/imagenet.py
function parseArgs (line 17) | def parseArgs():
function main (line 89) | def main():
FILE: KoSentenceT5/apex/pyprof/examples/jit/jit_script_function.py
function foo (line 11) | def foo(x, y):
FILE: KoSentenceT5/apex/pyprof/examples/jit/jit_script_method.py
class Foo (line 7) | class Foo(torch.jit.ScriptModule):
method __init__ (line 8) | def __init__(self, size):
method forward (line 14) | def forward(self, input):
FILE: KoSentenceT5/apex/pyprof/examples/jit/jit_trace_function.py
function foo (line 7) | def foo(x, y):
FILE: KoSentenceT5/apex/pyprof/examples/jit/jit_trace_method.py
class Foo (line 7) | class Foo(torch.nn.Module):
method __init__ (line 8) | def __init__(self, size):
method forward (line 13) | def forward(self, input):
FILE: KoSentenceT5/apex/pyprof/examples/lenet.py
class LeNet5 (line 12) | class LeNet5(nn.Module):
method __init__ (line 13) | def __init__(self):
method forward (line 24) | def forward(self, x):
method num_flat_features (line 35) | def num_flat_features(self, x):
FILE: KoSentenceT5/apex/pyprof/examples/user_annotation/resnet.py
function conv3x3 (line 15) | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
function conv1x1 (line 20) | def conv1x1(in_planes, out_planes, stride=1):
class Bottleneck (line 24) | class Bottleneck(nn.Module):
method __init__ (line 28) | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
method forward (line 48) | def forward(self, x):
class ResNet (line 102) | class ResNet(nn.Module):
method __init__ (line 104) | def __init__(self, block, layers, num_classes=1000,
method _make_layer (line 134) | def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
method forward (line 158) | def forward(self, x):
function resnet50 (line 193) | def resnet50():
FILE: KoSentenceT5/apex/pyprof/nvtx/nvmarker.py
function isfunc (line 27) | def isfunc(mod, f):
function traceMarker (line 46) | def traceMarker(stack):
function modMarker (line 56) | def modMarker(mod, fn_name, args):
function add_wrapper (line 67) | def add_wrapper(mod, fn_name):
function argMarker (line 110) | def argMarker(mod, op, args, kwargs):
function patchClass (line 201) | def patchClass(cls):
function init (line 206) | def init():
FILE: KoSentenceT5/apex/pyprof/parse/db.py
class DB (line 3) | class DB(object):
method __init__ (line 9) | def __init__(self, dbFile):
method select (line 21) | def select(self, cmd):
method insert (line 36) | def insert(self, cmd, data):
method execute (line 46) | def execute(self, cmd):
method commit (line 56) | def commit(self):
method close (line 59) | def close(self):
FILE: KoSentenceT5/apex/pyprof/parse/kernel.py
function demangle (line 5) | def demangle(name):
function encode_object_id (line 11) | def encode_object_id(pid, tid):
function getShortName (line 20) | def getShortName(name):
class Kernel (line 33) | class Kernel(object):
method __init__ (line 41) | def __init__(self):
method setKernelInfo (line 77) | def setKernelInfo(self, info):
method setKernelName (line 93) | def setKernelName(self, name):
method setRunTimeInfo (line 98) | def setRunTimeInfo(self, info):
method setMarkerInfo (line 107) | def setMarkerInfo(self, info):
method setDirection (line 111) | def setDirection(self):
method setOp (line 123) | def setOp(self):
method print (line 180) | def print(self):
FILE: KoSentenceT5/apex/pyprof/parse/nvvp.py
class NVVP (line 3) | class NVVP(object):
method __init__ (line 14) | def __init__(self, db):
method getProfileStart (line 18) | def getProfileStart(self):
method getString (line 36) | def getString(self, id_):
method createMarkerTable (line 45) | def createMarkerTable(self):
method getCPUInfo (line 65) | def getCPUInfo(self, corrId):
method getKernelInfo (line 91) | def getKernelInfo(self):
method getMarkerInfo (line 99) | def getMarkerInfo(self, objId, startTime, endTime):
FILE: KoSentenceT5/apex/pyprof/parse/parse.py
function parseArgs (line 15) | def parseArgs():
function main (line 25) | def main():
FILE: KoSentenceT5/apex/pyprof/prof/activation.py
class Activation (line 5) | class Activation(OperatorLayerBase):
method __init__ (line 12) | def __init__(self, d):
method params (line 35) | def params(self):
method flops (line 39) | def flops(self):
method bytes (line 48) | def bytes(self):
method tc (line 58) | def tc(self):
method op (line 61) | def op(self):
method mod (line 64) | def mod(self):
FILE: KoSentenceT5/apex/pyprof/prof/base.py
class OperatorLayerBase (line 3) | class OperatorLayerBase(ABC):
method tc (line 10) | def tc(self):
method params (line 18) | def params(self):
method flops (line 25) | def flops(self):
method bytes (line 32) | def bytes(self):
method mod (line 36) | def mod(self):
method op (line 43) | def op(self):
FILE: KoSentenceT5/apex/pyprof/prof/blas.py
class Addmm (line 8) | class Addmm(OperatorLayerBase):
method __init__ (line 10) | def __init__(self, d):
method tc (line 63) | def tc(self):
method bytes (line 69) | def bytes(self):
method flops (line 73) | def flops(self):
method op (line 76) | def op(self):
method mod (line 79) | def mod(self):
method params (line 82) | def params(self):
class Bmm (line 86) | class Bmm(OperatorLayerBase):
method __init__ (line 88) | def __init__(self, d):
method tc (line 123) | def tc(self):
method params (line 129) | def params(self):
method flops (line 134) | def flops(self):
method bytes (line 137) | def bytes(self):
method op (line 141) | def op(self):
method mod (line 144) | def mod(self):
class Matmul (line 147) | class Matmul(OperatorLayerBase):
method __init__ (line 152) | def __init__(self, d):
method params (line 252) | def params(self):
method tc (line 255) | def tc(self):
method bytes (line 264) | def bytes(self):
method flops (line 272) | def flops(self):
method op (line 279) | def op(self):
method mod (line 282) | def mod(self):
class Mm (line 285) | class Mm(OperatorLayerBase):
method __init__ (line 287) | def __init__(self, d):
method params (line 319) | def params(self):
method tc (line 323) | def tc(self):
method bytes (line 329) | def bytes(self):
method flops (line 333) | def flops(self):
method op (line 336) | def op(self):
method mod (line 339) | def mod(self):
FILE: KoSentenceT5/apex/pyprof/prof/conv.py
class Conv (line 5) | class Conv(OperatorLayerBase):
method __init__ (line 26) | def __init__(self, d):
method params (line 180) | def params(self):
method conv_bytes_flops (line 184) | def conv_bytes_flops(self, N, C, H, W, K, P, Q, R, S, g, t):
method bytes_flops (line 190) | def bytes_flops(self):
method bytes (line 218) | def bytes(self):
method flops (line 222) | def flops(self):
method tc (line 226) | def tc(self):
method op (line 232) | def op(self):
method mod (line 235) | def mod(self):
FILE: KoSentenceT5/apex/pyprof/prof/convert.py
class Convert (line 5) | class Convert(OperatorLayerBase):
method __init__ (line 11) | def __init__(self, d):
method params (line 41) | def params(self):
method op (line 45) | def op(self):
method mod (line 48) | def mod(self):
method tc (line 51) | def tc(self):
method elems (line 54) | def elems(self):
method flops (line 57) | def flops(self):
method bytes (line 60) | def bytes(self):
FILE: KoSentenceT5/apex/pyprof/prof/data.py
class Data (line 3) | class Data(object):
method __init__ (line 7) | def __init__(self, kernel):
method setParams (line 41) | def setParams(self, params):
FILE: KoSentenceT5/apex/pyprof/prof/dropout.py
class Dropout (line 5) | class Dropout(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method params (line 28) | def params(self):
method op (line 32) | def op(self):
method mod (line 35) | def mod(self):
method tc (line 38) | def tc(self):
method elems (line 41) | def elems(self):
method bytes (line 44) | def bytes(self):
method flops (line 48) | def flops(self):
FILE: KoSentenceT5/apex/pyprof/prof/embedding.py
class Embedding (line 5) | class Embedding(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method params (line 33) | def params(self):
method op (line 37) | def op(self):
method mod (line 40) | def mod(self):
method tc (line 43) | def tc(self):
method bytes (line 46) | def bytes(self):
method flops (line 69) | def flops(self):
FILE: KoSentenceT5/apex/pyprof/prof/index_slice_join_mutate.py
class Cat (line 6) | class Cat(OperatorLayerBase):
method __init__ (line 8) | def __init__(self, d):
method params (line 34) | def params(self):
method flops (line 38) | def flops(self):
method tc (line 41) | def tc(self):
method op (line 44) | def op(self):
method mod (line 47) | def mod(self):
method bytes (line 50) | def bytes(self):
class Reshape (line 56) | class Reshape(OperatorLayerBase):
method __init__ (line 58) | def __init__(self, d):
method params (line 82) | def params(self):
method flops (line 86) | def flops(self):
method tc (line 89) | def tc(self):
method op (line 92) | def op(self):
method mod (line 95) | def mod(self):
method bytes (line 98) | def bytes(self):
class Gather (line 101) | class Gather(OperatorLayerBase):
method __init__ (line 103) | def __init__(self, d):
method params (line 132) | def params(self):
method flops (line 136) | def flops(self):
method tc (line 139) | def tc(self):
method op (line 142) | def op(self):
method mod (line 145) | def mod(self):
method bytes (line 148) | def bytes(self):
class MaskedScatter (line 151) | class MaskedScatter(OperatorLayerBase):
method __init__ (line 153) | def __init__(self, d):
method params (line 178) | def params(self):
method flops (line 182) | def flops(self):
method tc (line 185) | def tc(self):
method op (line 188) | def op(self):
method mod (line 191) | def mod(self):
method bytes (line 194) | def bytes(self):
class Nonzero (line 207) | class Nonzero(OperatorLayerBase):
method __init__ (line 209) | def __init__(self, d):
method params (line 229) | def params(self):
method flops (line 233) | def flops(self):
method tc (line 236) | def tc(self):
method op (line 239) | def op(self):
method mod (line 242) | def mod(self):
method bytes (line 245) | def bytes(self):
class IndexSelect (line 260) | class IndexSelect(OperatorLayerBase):
method __init__ (line 262) | def __init__(self, d):
method params (line 311) | def params(self):
method tc (line 315) | def tc(self):
method op (line 318) | def op(self):
method mod (line 321) | def mod(self):
method flops (line 324) | def flops(self):
method bytes (line 327) | def bytes(self):
class MaskedSelect (line 343) | class MaskedSelect(OperatorLayerBase):
method __init__ (line 345) | def __init__(self, d):
method params (line 393) | def params(self):
method tc (line 397) | def tc(self):
method op (line 400) | def op(self):
method mod (line 403) | def mod(self):
method bytes (line 406) | def bytes(self):
method flops (line 418) | def flops(self):
FILE: KoSentenceT5/apex/pyprof/prof/linear.py
class Linear (line 5) | class Linear(OperatorLayerBase):
method setXWBMNK (line 17) | def setXWBMNK(self, args):
method tc (line 63) | def tc(self):
method __init__ (line 69) | def __init__(self, d):
method params (line 118) | def params(self):
method op (line 145) | def op(self):
method bytesFlops (line 148) | def bytesFlops(self):
method bytes (line 179) | def bytes(self):
method flops (line 183) | def flops(self):
method mod (line 187) | def mod(self):
FILE: KoSentenceT5/apex/pyprof/prof/loss.py
class MSELoss (line 7) | class MSELoss(OperatorLayerBase):
method __init__ (line 9) | def __init__(self, d):
method params (line 51) | def params(self):
method elems (line 55) | def elems(self):
method bytes (line 71) | def bytes(self):
method flops (line 74) | def flops(self):
method tc (line 77) | def tc(self):
method op (line 80) | def op(self):
method mod (line 83) | def mod(self):
FILE: KoSentenceT5/apex/pyprof/prof/misc.py
class Foo (line 5) | class Foo(OperatorLayerBase):
method __init__ (line 9) | def __init__(self, d):
method params (line 31) | def params(self):
method tc (line 35) | def tc(self):
method op (line 38) | def op(self):
method mod (line 41) | def mod(self):
method flops (line 44) | def flops(self):
method bytes (line 47) | def bytes(self):
class Copy (line 50) | class Copy(OperatorLayerBase):
method __init__ (line 52) | def __init__(self, d):
method params (line 75) | def params(self):
method tc (line 80) | def tc(self):
method op (line 83) | def op(self):
method mod (line 86) | def mod(self):
method flops (line 89) | def flops(self):
method elems (line 92) | def elems(self):
method bytes (line 95) | def bytes(self):
class Clone (line 98) | class Clone(OperatorLayerBase):
method __init__ (line 100) | def __init__(self, d):
method params (line 118) | def params(self):
method flops (line 122) | def flops(self):
method tc (line 125) | def tc(self):
method op (line 128) | def op(self):
method mod (line 131) | def mod(self):
method elems (line 134) | def elems(self):
method bytes (line 137) | def bytes(self):
class Contiguous (line 140) | class Contiguous(OperatorLayerBase):
method __init__ (line 142) | def __init__(self, d):
method params (line 160) | def params(self):
method flops (line 164) | def flops(self):
method bytes (line 167) | def bytes(self):
method tc (line 170) | def tc(self):
method op (line 173) | def op(self):
method mod (line 176) | def mod(self):
class Any (line 179) | class Any(OperatorLayerBase):
method __init__ (line 181) | def __init__(self, d):
method params (line 202) | def params(self):
method op (line 206) | def op(self):
method mod (line 209) | def mod(self):
method tc (line 212) | def tc(self):
method flops (line 215) | def flops(self):
method bytes (line 218) | def bytes(self):
FILE: KoSentenceT5/apex/pyprof/prof/normalization.py
class BatchNorm (line 5) | class BatchNorm(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method params (line 27) | def params(self):
method tc (line 31) | def tc(self):
method op (line 34) | def op(self):
method mod (line 37) | def mod(self):
method elems (line 40) | def elems(self):
method flops (line 43) | def flops(self):
method bytes (line 47) | def bytes(self):
FILE: KoSentenceT5/apex/pyprof/prof/optim.py
class Adam (line 7) | class Adam(OperatorLayerBase):
method __init__ (line 9) | def __init__(self, d):
method params (line 31) | def params(self):
method flops (line 35) | def flops(self):
method bytes (line 38) | def bytes(self):
method tc (line 58) | def tc(self):
method op (line 61) | def op(self):
method mod (line 64) | def mod(self):
FILE: KoSentenceT5/apex/pyprof/prof/output.py
class Output (line 3) | class Output():
method __init__ (line 33) | def __init__(self, args):
method foo (line 77) | def foo(self, cadena, pformat):
method header (line 99) | def header(self):
method data (line 107) | def data(self, a):
FILE: KoSentenceT5/apex/pyprof/prof/pointwise.py
class Pointwise (line 6) | class Pointwise(OperatorLayerBase):
method foo (line 26) | def foo(d):
method __init__ (line 29) | def __init__(self, d):
method params (line 84) | def params(self):
method tc (line 88) | def tc(self):
method op (line 91) | def op(self):
method mod (line 94) | def mod(self):
method elems (line 97) | def elems(self):
method bytes (line 138) | def bytes(self):
method flops (line 141) | def flops(self):
FILE: KoSentenceT5/apex/pyprof/prof/pooling.py
class MaxPool2d (line 7) | class MaxPool2d(object):
method parse (line 9) | def parse(marker):
FILE: KoSentenceT5/apex/pyprof/prof/prof.py
function findFpropKernel (line 39) | def findFpropKernel(seq):
function foo (line 56) | def foo(mod, op, d):
function main (line 171) | def main():
FILE: KoSentenceT5/apex/pyprof/prof/randomSample.py
class RandPerm (line 5) | class RandPerm(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method params (line 25) | def params(self):
method tc (line 29) | def tc(self):
method op (line 32) | def op(self):
method mod (line 35) | def mod(self):
method bytes (line 38) | def bytes(self):
method flops (line 41) | def flops(self):
FILE: KoSentenceT5/apex/pyprof/prof/recurrentCell.py
function hasTileSize (line 5) | def hasTileSize(name):
function ctaTile (line 11) | def ctaTile(name):
class RNNCell (line 21) | class RNNCell(OperatorLayerBase):
method __init__ (line 26) | def __init__(self, d):
method params (line 73) | def params(self):
method tc (line 83) | def tc(self):
method op (line 89) | def op(self):
method mod (line 92) | def mod(self):
method bytes (line 95) | def bytes(self):
method flops (line 105) | def flops(self):
method bar (line 115) | def bar(self):
FILE: KoSentenceT5/apex/pyprof/prof/reduction.py
class Mean (line 5) | class Mean(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method params (line 32) | def params(self):
method tc (line 36) | def tc(self):
method op (line 39) | def op(self):
method mod (line 42) | def mod(self):
method elems (line 45) | def elems(self):
method bytes (line 48) | def bytes(self):
method flops (line 54) | def flops(self):
class Sum (line 60) | class Sum(OperatorLayerBase):
method __init__ (line 62) | def __init__(self, d):
method params (line 86) | def params(self):
method tc (line 90) | def tc(self):
method op (line 93) | def op(self):
method mod (line 96) | def mod(self):
method elems (line 99) | def elems(self):
method flops (line 102) | def flops(self):
method bytes (line 106) | def bytes(self):
class Norm (line 109) | class Norm(OperatorLayerBase):
method __init__ (line 111) | def __init__(self, d):
method params (line 129) | def params(self):
method elems (line 133) | def elems(self):
method bytes (line 136) | def bytes(self):
method flops (line 139) | def flops(self):
method tc (line 143) | def tc(self):
method op (line 146) | def op(self):
method mod (line 149) | def mod(self):
FILE: KoSentenceT5/apex/pyprof/prof/softmax.py
class Softmax (line 5) | class Softmax(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method op (line 31) | def op(self):
method mod (line 34) | def mod(self):
method tc (line 37) | def tc(self):
method params (line 40) | def params(self):
method elems (line 44) | def elems(self):
method flops (line 47) | def flops(self):
method bytes (line 52) | def bytes(self):
class LogSoftmax (line 57) | class LogSoftmax(OperatorLayerBase):
method __init__ (line 59) | def __init__(self, d):
method op (line 91) | def op(self):
method mod (line 94) | def mod(self):
method tc (line 97) | def tc(self):
method params (line 100) | def params(self):
method elems (line 104) | def elems(self):
method flops (line 107) | def flops(self):
method bytes (line 112) | def bytes(self):
FILE: KoSentenceT5/apex/pyprof/prof/usage.py
function parseArgs (line 4) | def parseArgs():
FILE: KoSentenceT5/apex/pyprof/prof/utility.py
class Utility (line 3) | class Utility(object):
method numElems (line 6) | def numElems(shape):
method typeToBytes (line 11) | def typeToBytes(t):
method typeToString (line 23) | def typeToString(t):
method hasNVTX (line 45) | def hasNVTX(marker):
method isscalar (line 59) | def isscalar(t):
FILE: KoSentenceT5/apex/reparameterization/__init__.py
function apply_weight_norm (line 4) | def apply_weight_norm(module, name='', dim=0, hook_child=True):
function remove_weight_norm (line 50) | def remove_weight_norm(module, name='', remove_all=False):
function apply_reparameterization (line 64) | def apply_reparameterization(module, reparameterization=None, name='', d...
function remove_reparameterization (line 96) | def remove_reparameterization(module, reparameterization=Reparameterizat...
FILE: KoSentenceT5/apex/reparameterization/reparameterization.py
class Reparameterization (line 4) | class Reparameterization(object):
method __init__ (line 19) | def __init__(self, name, dim, module, retain_forward=True):
method compute_weight (line 28) | def compute_weight(self, module=None, name=None):
method reparameterize (line 40) | def reparameterize(self, name, weight, dim):
method apply (line 57) | def apply(module, name, dim, reparameterization=None, hook_child=True):
method get_module_and_name (line 105) | def get_module_and_name(module, name):
method get_params (line 123) | def get_params(self, module):
method remove (line 127) | def remove(self, module):
method __call__ (line 139) | def __call__(self, module, inputs):
method backward_hook (line 147) | def backward_hook(self, module, grad_input, grad_output):
FILE: KoSentenceT5/apex/reparameterization/weight_norm.py
function _norm (line 8) | def _norm(p, dim):
class WeightNorm (line 22) | class WeightNorm(Reparameterization):
method compute_weight (line 39) | def compute_weight(self, module=None, name=None):
method reparameterize (line 62) | def reparameterize(self, name, weight, dim):
FILE: KoSentenceT5/data/dataloader.py
class ModelDataLoader (line 10) | class ModelDataLoader(Dataset):
method __init__ (line 11) | def __init__(self, file_path, args, metric, tokenizer, type_):
method load_data (line 57) | def load_data(self, type):
method data2tensor (line 70) | def data2tensor(self, line, type):
method __getitem__ (line 133) | def __getitem__(self, index):
method __len__ (line 175) | def __len__(self):
function get_loader (line 183) | def get_loader(args, metric):
FILE: KoSentenceT5/main.py
function main (line 5) | def main(args, logger) -> None:
FILE: KoSentenceT5/model/loss.py
class Loss (line 12) | class Loss():
method __init__ (line 14) | def __init__(self, args):
method train_loss_fct (line 19) | def train_loss_fct(self, config, inputs, a, p, n):
method evaluation_during_training (line 31) | def evaluation_during_training(self, embeddings1, embeddings2, labels,...
FILE: KoSentenceT5/model/setting.py
class Arguments (line 8) | class Arguments():
method __init__ (line 10) | def __init__(self):
method add_type_of_processing (line 13) | def add_type_of_processing(self):
method add_hyper_parameters (line 21) | def add_hyper_parameters(self):
method add_data_parameters (line 35) | def add_data_parameters(self):
method print_args (line 45) | def print_args(self, args):
method add_argument (line 51) | def add_argument(self, *args, **kw_args):
method parse (line 54) | def parse(self):
class Setting (line 61) | class Setting():
method set_logger (line 63) | def set_logger(self):
method set_seed (line 77) | def set_seed(self, args):
method run (line 91) | def run(self):
FILE: KoSentenceT5/model/simcse/kost5.py
class KoSentenceT5 (line 5) | class KoSentenceT5(nn.Module):
method __init__ (line 6) | def __init__(self, model):
method forward (line 10) | def forward(self, config, inputs, mode):
method encode (line 44) | def encode(self, inputs, device):
FILE: KoSentenceT5/model/simcse/processor.py
class Processor (line 19) | class Processor():
method __init__ (line 21) | def __init__(self, args):
method run (line 33) | def run(self, inputs, indicator=None, type=None):
method progress (line 52) | def progress(self, loss):
method progress_validation (line 56) | def progress_validation(self, score):
method return_value (line 60) | def return_value(self):
method get_object (line 66) | def get_object(self, tokenizer, model):
method get_scheduler (line 81) | def get_scheduler(self, optim, train_loader):
method model_setting (line 89) | def model_setting(self):
method train (line 125) | def train(self, epoch):
method valid (line 155) | def valid(self):
method test (line 179) | def test(self):
FILE: KoSentenceT5/model/utils.py
class Metric (line 10) | class Metric():
method __init__ (line 12) | def __init__(self, args):
method get_lr (line 15) | def get_lr(self, optimizer):
method count_parameters (line 18) | def count_parameters(self, model):
method cal_acc (line 21) | def cal_acc(self, yhat, y):
method cal_time (line 28) | def cal_time(self, start_time, end_time):
method cal_dev_score (line 35) | def cal_dev_score(self, score, indicator):
method update_indicator (line 51) | def update_indicator(self, indicator, score):
method draw_graph (line 70) | def draw_graph(self, cp):
method performance_check (line 74) | def performance_check(self, cp, config):
method print_size_of_model (line 80) | def print_size_of_model(self, model):
method move2device (line 85) | def move2device(self, sample, device):
method save_model (line 106) | def save_model(self, config, cp, pco):
function pytorch_cos_sim (line 131) | def pytorch_cos_sim(a, b):
FILE: KoSimCSE/SemanticSearch.py
function main (line 6) | def main():
FILE: KoSimCSE/apex/RNN/RNNBackend.py
function is_iterable (line 10) | def is_iterable(maybe_iterable):
function flatten_list (line 14) | def flatten_list(tens_list):
class bidirectionalRNN (line 25) | class bidirectionalRNN(nn.Module):
method __init__ (line 29) | def __init__(self, inputRNN, num_layers=1, dropout = 0):
method forward (line 37) | def forward(self, input, collect_hidden=False):
method reset_parameters (line 52) | def reset_parameters(self):
method init_hidden (line 59) | def init_hidden(self, bsz):
method detach_hidden (line 66) | def detach_hidden(self):
method reset_hidden (line 73) | def reset_hidden(self, bsz):
method init_inference (line 80) | def init_inference(self, bsz):
class stackedRNN (line 90) | class stackedRNN(nn.Module):
method __init__ (line 94) | def __init__(self, inputRNN, num_layers=1, dropout=0):
method forward (line 122) | def forward(self, input, collect_hidden=False, reverse=False):
method reset_parameters (line 197) | def reset_parameters(self):
method init_hidden (line 204) | def init_hidden(self, bsz):
method detach_hidden (line 211) | def detach_hidden(self):
method reset_hidden (line 218) | def reset_hidden(self, bsz):
method init_inference (line 225) | def init_inference(self, bsz):
class RNNCell (line 232) | class RNNCell(nn.Module):
method __init__ (line 242) | def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_h...
method new_like (line 274) | def new_like(self, new_input_size=None):
method reset_parameters (line 291) | def reset_parameters(self, gain=1):
method init_hidden (line 309) | def init_hidden(self, bsz):
method reset_hidden (line 330) | def reset_hidden(self, bsz):
method detach_hidden (line 338) | def detach_hidden(self):
method forward (line 348) | def forward(self, input):
FILE: KoSimCSE/apex/RNN/cells.py
class mLSTMRNNCell (line 12) | class mLSTMRNNCell(RNNCell):
method __init__ (line 17) | def __init__(self, input_size, hidden_size, bias = False, output_size ...
method forward (line 26) | def forward(self, input):
method new_like (line 45) | def new_like(self, new_input_size=None):
function mLSTMCell (line 55) | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=N...
FILE: KoSimCSE/apex/RNN/models.py
function toRNNBackend (line 8) | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
function LSTM (line 19) | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
function GRU (line 26) | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=Fals...
function ReLU (line 33) | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
function Tanh (line 40) | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=Fal...
function mLSTM (line 47) | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=Fa...
FILE: KoSimCSE/apex/amp/_amp_state.py
class AmpState (line 18) | class AmpState(object):
method __init__ (line 19) | def __init__(self):
function warn_or_err (line 29) | def warn_or_err(msg):
function maybe_print (line 39) | def maybe_print(msg, rank0=False):
function master_params (line 60) | def master_params(optimizer):
FILE: KoSimCSE/apex/amp/_initialize.py
function to_type (line 21) | def to_type(dtype, t):
function applier (line 39) | def applier(value, fn):
function check_models (line 64) | def check_models(models):
function check_params_fp32 (line 79) | def check_params_fp32(models):
function check_optimizers (line 119) | def check_optimizers(optimizers):
class O2StateDictHook (line 133) | class O2StateDictHook(object):
method __init__ (line 134) | def __init__(self, fn):
method __call__ (line 137) | def __call__(self, module, state_dict, prefix, local_metadata):
function _initialize (line 145) | def _initialize(models, optimizers, properties, num_losses=1, cast_model...
FILE: KoSimCSE/apex/amp/_process_optimizer.py
class AmpOptimizerState (line 9) | class AmpOptimizerState(object):
method __init__ (line 10) | def __init__(self):
function _master_params_to_model_params (line 14) | def _master_params_to_model_params(self):
function lazy_init_with_master_weights (line 28) | def lazy_init_with_master_weights(self):
function post_backward_models_are_masters (line 93) | def post_backward_models_are_masters(scaler, params, stashed_grads, scal...
function prepare_backward_with_master_weights (line 142) | def prepare_backward_with_master_weights(self):
function post_backward_with_master_weights (line 161) | def post_backward_with_master_weights(self, scaler):
function lazy_init_no_master_weights (line 205) | def lazy_init_no_master_weights(self):
function prepare_backward_no_master_weights (line 224) | def prepare_backward_no_master_weights(self):
function post_backward_no_master_weights (line 240) | def post_backward_no_master_weights(self, scaler):
function prepare_backward_with_master_weights_FusedSGD (line 258) | def prepare_backward_with_master_weights_FusedSGD(self):
function post_backward_with_master_weights_FusedSGD (line 277) | def post_backward_with_master_weights_FusedSGD(self, scaler):
function prepare_backward_no_master_weights_FusedSGD (line 305) | def prepare_backward_no_master_weights_FusedSGD(self):
function post_backward_no_master_weights_FusedSGD (line 309) | def post_backward_no_master_weights_FusedSGD(self, scaler):
function _amp_lazy_init (line 313) | def _amp_lazy_init(self):
function _process_optimizer (line 321) | def _process_optimizer(optimizer, properties):
FILE: KoSimCSE/apex/amp/amp.py
function _decorator_helper (line 18) | def _decorator_helper(orig_fn, cast_fn, wrap_fn):
function half_function (line 30) | def half_function(fn):
function float_function (line 35) | def float_function(fn):
function promote_function (line 40) | def promote_function(fn):
function register_half_function (line 46) | def register_half_function(module, name):
function register_float_function (line 53) | def register_float_function(module, name):
function register_promote_function (line 60) | def register_promote_function(module, name):
function init (line 68) | def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbos...
FILE: KoSimCSE/apex/amp/compat.py
function variable_is_tensor (line 4) | def variable_is_tensor():
function tensor_is_variable (line 8) | def tensor_is_variable():
function tensor_is_float_tensor (line 13) | def tensor_is_float_tensor():
function is_tensor_like (line 19) | def is_tensor_like(x):
function is_floating_point (line 24) | def is_floating_point(x):
function scalar_python_val (line 35) | def scalar_python_val(x):
function filter_attrs (line 45) | def filter_attrs(module, attrs):
FILE: KoSimCSE/apex/amp/frontend.py
class Properties (line 7) | class Properties(object):
method __init__ (line 13) | def __init__(self):
method _update_options_dict (line 33) | def _update_options_dict(self, new_options):
method __getattr__ (line 43) | def __getattr__(self, name):
method __setattr__ (line 51) | def __setattr__(self, name, value):
class O3 (line 102) | class O3:
method __call__ (line 111) | def __call__(self, properties):
class O2 (line 124) | class O2:
method __call__ (line 134) | def __call__(self, properties):
class O1 (line 147) | class O1:
method __call__ (line 156) | def __call__(self, properties):
class O0 (line 169) | class O0:
method __call__ (line 175) | def __call__(self, properties):
function initialize (line 195) | def initialize(
function state_dict (line 361) | def state_dict(destination=None):
function load_state_dict (line 373) | def load_state_dict(state_dict):
FILE: KoSimCSE/apex/amp/handle.py
function scale_loss (line 17) | def scale_loss(loss,
function disable_casts (line 164) | def disable_casts():
class AmpHandle (line 170) | class AmpHandle(object):
method __init__ (line 171) | def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=...
method is_active (line 179) | def is_active(self):
method _disable_casts (line 183) | def _disable_casts(self):
method wrap_optimizer (line 188) | def wrap_optimizer(self, optimizer, num_loss=1):
method scale_loss (line 193) | def scale_loss(self, loss, optimizer):
method _clear_cache (line 226) | def _clear_cache(self):
method _save_func (line 230) | def _save_func(self, mod, fn, func):
method _deactivate (line 233) | def _deactivate(self):
method has_cache (line 239) | def has_cache(self):
method cache (line 243) | def cache(self):
method remove_cache (line 246) | def remove_cache(self, param):
method verbose (line 251) | def verbose(self):
class NoOpHandle (line 254) | class NoOpHandle(object):
method is_active (line 255) | def is_active(self):
method _disable_casts (line 259) | def _disable_casts(self):
method wrap_optimizer (line 262) | def wrap_optimizer(self, optimizer, num_loss=1):
method scale_loss (line 266) | def scale_loss(self, loss, optimizer):
method has_cache (line 270) | def has_cache(self):
method verbose (line 274) | def verbose(self):
method _clear_cache (line 277) | def _clear_cache(self):
method _deactivate (line 280) | def _deactivate(self):
FILE: KoSimCSE/apex/amp/opt.py
class OptimWrapper (line 9) | class OptimWrapper(object):
method __init__ (line 10) | def __init__(self, optimizer, amp_handle, num_loss):
method scale_loss (line 19) | def scale_loss(self, loss):
method _cur_loss_scaler (line 55) | def _cur_loss_scaler(self):
method step (line 59) | def step(self, closure=None):
method __getattr__ (line 80) | def __getattr__(self, attr):
method __getstate__ (line 84) | def __getstate__(self):
method __setstate__ (line 87) | def __setstate__(self):
method __repr__ (line 90) | def __repr__(self):
method state_dict (line 93) | def state_dict(self):
method load_state_dict (line 96) | def load_state_dict(self, state_dict):
method zero_grad (line 99) | def zero_grad(self):
method add_param_group (line 102) | def add_param_group(self, param_group):
FILE: KoSimCSE/apex/amp/rnn_compat.py
function _gen_VF_wrapper (line 7) | def _gen_VF_wrapper(name):
class VariableFunctionsShim (line 17) | class VariableFunctionsShim(object):
method __init__ (line 18) | def __init__(self):
function has_old_rnns (line 24) | def has_old_rnns():
function whitelist_rnn_cells (line 31) | def whitelist_rnn_cells(handle, verbose):
FILE: KoSimCSE/apex/amp/scaler.py
function scale_check_overflow_python (line 6) | def scale_check_overflow_python(model_grad, master_grad, scale, check_ov...
function axpby_check_overflow_python (line 19) | def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a...
class LossScaler (line 33) | class LossScaler(object):
method __init__ (line 38) | def __init__(self,
method loss_scale (line 73) | def loss_scale(self):
method unscale_python (line 76) | def unscale_python(self, model_grads, master_grads, scale):
method unscale (line 94) | def unscale(self, model_grads, master_grads, unused_scale, models_are_...
method unscale_with_stashed_python (line 126) | def unscale_with_stashed_python(self,
method unscale_with_stashed (line 152) | def unscale_with_stashed(self,
method clear_overflow_state (line 191) | def clear_overflow_state(self):
method update_scale (line 197) | def update_scale(self):
FILE: KoSimCSE/apex/amp/utils.py
function is_cuda_enabled (line 8) | def is_cuda_enabled():
function get_cuda_version (line 11) | def get_cuda_version():
function is_fp_tensor (line 14) | def is_fp_tensor(x):
function is_nested (line 23) | def is_nested(x):
function should_cache (line 26) | def should_cache(x):
function collect_fp_tensor_types (line 36) | def collect_fp_tensor_types(args, kwargs):
function type_string (line 51) | def type_string(x):
function maybe_half (line 54) | def maybe_half(x, name='', verbose=False):
function maybe_float (line 65) | def maybe_float(x, name='', verbose=False):
function casted_args (line 77) | def casted_args(cast_fn, args, kwargs):
function cached_cast (line 90) | def cached_cast(cast_fn, x, cache):
function verbosify (line 124) | def verbosify(cast_fn, fn_name, verbose):
function as_inplace (line 130) | def as_inplace(fns):
function has_func (line 134) | def has_func(mod, fn):
function get_func (line 140) | def get_func(mod, fn):
function set_func (line 146) | def set_func(mod, fn, new_fn):
function set_func_save (line 152) | def set_func_save(handle, mod, fn, new_fn):
function synthesize_flattened_rnn_weights (line 171) | def synthesize_flattened_rnn_weights(fp32_weights,
function new_synthesize_flattened_rnn_weights (line 194) | def new_synthesize_flattened_rnn_weights(fp32_weights,
FILE: KoSimCSE/apex/amp/wrap.py
function make_cast_wrapper (line 10) | def make_cast_wrapper(orig_fn, cast_fn, handle,
function cached_cast (line 31) | def cached_cast(mod, fn, cast_fn, handle,
function make_promote_wrapper (line 44) | def make_promote_wrapper(orig_fn, cast_fn, handle=None):
function promote (line 65) | def promote(mod, fn, handle, verbose=False):
function sequence_promote (line 71) | def sequence_promote(mod, fn, handle, verbose=False):
function promote_match_arg0 (line 92) | def promote_match_arg0(mod, fn, handle, verbose=False):
function err_if_any_half (line 114) | def err_if_any_half(mod, fn, handle, custom_err_msg=None):
function err_if_arg0_half (line 132) | def err_if_arg0_half(mod, fn, handle, verbose=False):
function rnn_cast (line 157) | def rnn_cast(backend, fn, handle, verbose=False):
function new_rnn_cast (line 222) | def new_rnn_cast(fn, handle, verbose=False):
function disable_casts (line 267) | def disable_casts(mod, fn, handle):
FILE: KoSimCSE/apex/contrib/bottleneck/bottleneck.py
function kaiming_uniform_ (line 5) | def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_rel...
class FrozenBatchNorm2d (line 9) | class FrozenBatchNorm2d(torch.nn.Module):
method __init__ (line 13) | def __init__(self, n):
method get_scale_bias (line 20) | def get_scale_bias(self, nhwc=False):
method forward (line 31) | def forward(self, x):
function drelu_dscale1 (line 37) | def drelu_dscale1(grad_o, output, scale1):
function drelu_dscale2 (line 44) | def drelu_dscale2(grad_o, output, scale1, scale2):
class BottleneckFunction (line 51) | class BottleneckFunction(torch.autograd.Function):
method forward (line 53) | def forward(ctx, nhwc, stride_1x1, scale, bias, x, *conv):
method backward (line 75) | def backward(ctx, grad_o):
function conv3x3 (line 102) | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
function conv1x1 (line 107) | def conv1x1(in_planes, out_planes, stride=1):
class Bottleneck (line 111) | class Bottleneck(torch.nn.Module):
method __init__ (line 119) | def __init__(self, in_channels, bottleneck_channels, out_channels, str...
method forward (line 174) | def forward(self, x):
FILE: KoSimCSE/apex/contrib/csrc/bottleneck/bottleneck.cpp
function checkCudnnError (line 31) | int checkCudnnError(cudnnStatus_t code, const char* expr, const char* fi...
function checkError (line 42) | void checkError(cudaError_t code, char const * func, const char *file, c...
function generateStrides (line 55) | void generateStrides(const int64_t* dimA, int64_t* strideA, int nbDims, ...
function getFwdConvDilatedFilterDim (line 75) | int getFwdConvDilatedFilterDim(int filterDim, int dilation) {
function getFwdConvPaddedImageDim (line 79) | int getFwdConvPaddedImageDim(int tensorDim, int pad) {
function getFwdConvOutputDim (line 83) | int getFwdConvOutputDim(
function common_conv_descriptors (line 111) | common_conv_descriptors
function common_convbias_descriptors (line 173) | common_convbias_descriptors
function dconv_descriptors (line 294) | dconv_descriptors
function getConvFusionString (line 377) | std::string getConvFusionString(int64_t* x_dim_padded,
function run_conv_scale_bias_add_activation (line 469) | void
function run_conv_scale_bias (line 630) | void
function run_dconv_drelu_dscale (line 759) | void
function run_dconv (line 886) | void
function run_dconv_add (line 992) | void
function bottleneck_forward (line 1104) | std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int strid...
function bottleneck_backward (line 1287) | std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stri...
function PYBIND11_MODULE (line 1609) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/fmha/fmha_api.cpp
function set_params (line 33) | void set_params(Fused_multihead_attention_fprop_params ¶ms,
function mha_fwd (line 86) | std::vector<at::Tensor>
function mha_bwd (line 182) | std::vector<at::Tensor>
function mha_fwd_nl (line 262) | std::vector<at::Tensor> mha_fwd_nl(const at::Tensor &qkv, // tot...
function mha_bwd_nl (line 342) | std::vector<at::Tensor> mha_bwd_nl(const at::Tensor &dout, // tot...
function PYBIND11_MODULE (line 426) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha.h
type Qkv_params (line 46) | struct Qkv_params {
function Qkv_params (line 59) | struct Fused_multihead_attention_fprop_params : public Qkv_params {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha/gemm.h
function namespace (line 34) | namespace fmha {
type Fragment_accumulator (line 145) | struct Fragment_accumulator
function add (line 152) | void add(const Other_fragment_ &other) {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha/gmem_tile.h
function namespace (line 30) | namespace fmha {
function __device__ (line 112) | inline __device__ void store(const uint4 (&data)[LDGS]) {
function __device__ (line 123) | inline __device__ void move() {
function __device__ (line 201) | inline __device__ void store(const uint4 (&src)[STGS_PER_LOOP], int mi) {
function __device__ (line 222) | inline __device__ void move() {
function __device__ (line 273) | __device__ Gmem_tile_mma_sd(void *ptr, const Params ¶ms, const int t...
function __device__ (line 288) | inline __device__ void store(const Type &data, const int mi, const int n...
function __device__ (line 300) | inline __device__ void move() {
function Base (line 311) | struct Gmem_tile_mma_s : public Base {
function Base (line 404) | struct Gmem_tile_dq : public Base {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha/mask.h
function namespace (line 30) | namespace fmha {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha/smem_tile.h
function namespace (line 33) | namespace fmha {
function __device__ (line 396) | inline __device__ Smem_tile_row_a(void *smem, int tidx) : Base(smem, tid...
function __device__ (line 462) | inline __device__ void reset_read_offset() {
function __device__ (line 494) | inline __device__ Smem_tile_a(void *smem, int tidx) : Base(smem, tidx) {
function __device__ (line 581) | inline __device__ Smem_tile_col_b(void *smem, int tidx) : Base(smem, tid...
function __device__ (line 653) | inline __device__ void reset_read_offset() {
function __device__ (line 685) | inline __device__ Smem_tile_b(void *smem, int tidx) : Base(smem, tidx) {
function __device__ (line 748) | inline __device__ Smem_tile_row_b(void *smem, int tidx) : Base(smem, tid...
function __device__ (line 892) | inline __device__ Smem_tile_b(void *smem, int tidx) : Base(smem, tidx) {
function __device__ (line 912) | inline __device__ Smem_tile_v(void *smem, int tidx) : Base(smem, tidx) {
function __device__ (line 1003) | inline __device__ Smem_tile_o(void *smem, int tidx) {
function store (line 1057) | void store(const Accumulator (&acc)[M][N], int mi) {
function __device__ (line 1129) | inline __device__ Smem_tile_mma(char *smem, int tidx) {
function store (line 1147) | void store(const uint4 (®s)[M][N]) {
function __device__ (line 1177) | inline __device__ Smem_tile_mma_transposed(char *smem, int tidx) : Base(...
function load (line 1189) | void load(Fragment (&frag)[M][N]) {
function __device__ (line 1223) | inline __device__ Smem_tile_mma_epilogue(char *smem, int tidx) : Base(sm...
function store (line 1238) | void store(const Acc (&acc)[M][N]){
function store (line 1272) | void store(const uint4 (®s)[M][N]) {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha/softmax.h
function namespace (line 30) | namespace fmha {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha/utils.h
function namespace (line 38) | namespace fmha {
function __device__ (line 247) | static inline __device__ uint32_t hadd2(uint32_t a, uint32_t b) {
function __device__ (line 255) | static inline __device__ uint32_t hmin2(uint32_t a, uint32_t b) {
function __device__ (line 263) | static inline __device__ uint32_t hmul2(uint32_t a, uint32_t b) {
function __device__ (line 271) | static inline __device__ uint2 hmul4(uint2 a, uint2 b) {
function __device__ (line 280) | static inline __device__ uint4 hmul8(uint4 a, uint4 b) {
function __device__ (line 291) | static inline __device__ uint4 hmul8(uint32_t a, uint4 b) {
function __device__ (line 317) | static inline __device__ uint32_t habs2(uint32_t x) {
function __device__ (line 332) | static inline __device__ uint16_t clamp_to_zero(uint16_t x) {
function __device__ (line 340) | static inline __device__ uint16_t float_to_half(float f) {
function __device__ (line 348) | static inline __device__ uint32_t float2_to_half2(float a, float b) {
function __device__ (line 362) | static inline __device__ uint32_t float_to_half2(float a) {
function __device__ (line 368) | static inline __device__ uint32_t float2_to_half2(const float2 &f) {
function __device__ (line 374) | static inline __device__ uint2 float4_to_half4(float x, float y, float z...
function __device__ (line 383) | static inline __device__ uint32_t hfma2(uint32_t a, uint32_t b, uint32_t...
function __device__ (line 391) | static inline __device__ uint32_t hfma2_relu(uint32_t a, uint32_t b, uin...
function __device__ (line 403) | static inline __device__ uint32_t h0_h0(uint32_t x) {
function __device__ (line 412) | static inline __device__ float h0_to_float(uint32_t h2) {
function __device__ (line 424) | static inline __device__ uint32_t h1_h1(uint32_t x) {
function __device__ (line 433) | static inline __device__ uint16_t hadd(uint16_t a, uint16_t b) {
function __device__ (line 441) | static inline __device__ uint32_t hadd(uint32_t a, uint32_t b) {
function __device__ (line 447) | static inline __device__ uint2 hadd4(uint2 a, uint2 b) {
function __device__ (line 456) | static inline __device__ uint2 hadd(uint2 a, uint2 b) {
function __device__ (line 462) | static inline __device__ uint4 hadd8(uint4 a, uint4 b) {
function __device__ (line 473) | static inline __device__ uint4 fadd4(uint4 a, uint4 b) {
function __device__ (line 484) | static inline __device__ uint4 hadd(uint4 a, uint4 b) {
function __device__ (line 490) | static inline __device__ float half_to_float(uint16_t h) {
function __device__ (line 498) | static inline __device__ float2 half2_to_float2(uint32_t x) {
function __device__ (line 514) | static inline __device__ uint16_t hfma(uint16_t a, uint16_t b, uint16_t ...
function __device__ (line 522) | static inline __device__ uint16_t hmul(uint16_t a, uint16_t b) {
function __device__ (line 530) | static inline __device__ float sigmoid(float x) {
function __device__ (line 685) | inline __device__ Ldg_functor(Data_type (&fetch)[N], const void* (&ptrs)...
function __device__ (line 690) | inline __device__ void clear(int ii) {
function __device__ (line 695) | inline __device__ void load(int ii, bool p) {
function __device__ (line 847) | inline __device__ void stg(void *ptr, uint8_t val) {
function __device__ (line 853) | inline __device__ void stg(void *ptr, uint16_t val) {
function __device__ (line 859) | inline __device__ void stg(void *ptr, uint32_t val) {
function __device__ (line 865) | inline __device__ void stg(void *ptr, uint2 val) {
function __device__ (line 871) | inline __device__ void stg(void *ptr, uint4 val) {
function __device__ (line 881) | inline __device__ void sts(uint32_t ptr, uint16_t val) {
function __device__ (line 887) | inline __device__ void sts(uint32_t ptr, uint32_t val) {
function __device__ (line 893) | inline __device__ void sts(uint32_t ptr, uint2 val) {
function __device__ (line 903) | inline __device__ void sts(uint32_t ptr, uint4 val) {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload.h
function namespace (line 34) | namespace fmha {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload_nl.h
function namespace (line 34) | namespace fmha {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN.h
function namespace (line 34) | namespace fmha {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN_nl.h
function namespace (line 35) | namespace fmha {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN_reload_v.h
function namespace (line 34) | namespace fmha {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha_kernel.h
function namespace (line 39) | namespace fmha {
function __device__ (line 90) | inline __device__ Noloop_traits(const int bidc)
function move_all (line 96) | void move_all(Tiles & ... tiles) const {
function __device__ (line 113) | inline __device__ int offset_loop_count(const int l) {
function __device__ (line 157) | inline __device__ int offset_loop_count(const int l) {
FILE: KoSimCSE/apex/contrib/csrc/fmha/src/fmha_utils.h
type Data_type (line 53) | enum Data_type { DATA_TYPE_FP16, DATA_TYPE_FP32, DATA_TYPE_INT32, DATA_T...
function set_alpha (line 57) | static inline void set_alpha( uint32_t &alpha, float norm, Data_type dty...
function get_size_in_bytes (line 75) | static inline size_t get_size_in_bytes( size_t n, Data_type dtype ) {
FILE: KoSimCSE/apex/contrib/csrc/groupbn/batch_norm.h
function class (line 41) | class NhwcBatchNorm {
function createTensorDescriptor (line 193) | void createTensorDescriptor(cudnnTensorDescriptor_t *descriptor) {
function destroyTensorDescriptor (line 199) | void destroyTensorDescriptor(cudnnTensorDescriptor_t descriptor) {
type StorageType (line 223) | typedef uint16_t StorageType;
function _fwdKernelLauncher (line 258) | void _fwdKernelLauncher(cudaStream_t stream, NhwcBatchNormFwdParams params,
function _bwdKernelLauncher (line 338) | void _bwdKernelLauncher(cudaStream_t stream, NhwcBatchNormBwdParams params,
function smem_driven_bwd_occupancy (line 469) | static int smem_driven_bwd_occupancy(int device_id, const int max_cta_pe...
function std (line 478) | const std::vector<size_t> NhwcBatchNorm::numWorkspaceBytes() const {
function _setFwdParams (line 510) | void NhwcBatchNorm::_setFwdParams(NhwcBatchNormFwdParams *params) const {
function _setFwdInferenceParams (line 534) | void NhwcBatchNorm::_setFwdInferenceParams(NhwcBatchNormFwdInferenceParams
function _setBwdParams (line 548) | void NhwcBatchNorm::_setBwdParams(NhwcBatchNormBwdParams *params) const {
function fwdInference (line 569) | void NhwcBatchNorm::fwdInference(cudaStream_t stream, bool use_relu) {
function dim3 (line 612) | dim3 NhwcBatchNorm::calc_fwd_grid(int *loop, const int grid_dim_x) {
function dim3 (line 635) | dim3 NhwcBatchNorm::calc_bwd_grid(int *loop, const int grid_dim_x) {
function fwd (line 658) | void NhwcBatchNorm::fwd(cudaStream_t stream, bool use_relu, void* my_dat...
function dgrad (line 697) | void NhwcBatchNorm::dgrad(cudaStream_t stream, bool use_relu, void* my_d...
FILE: KoSimCSE/apex/contrib/csrc/groupbn/batch_norm_add_relu.h
function class (line 41) | class NhwcBatchNormAddRelu {
function createTensorDescriptor (line 197) | void createTensorDescriptor(cudnnTensorDescriptor_t *descriptor) {
function destroyTensorDescriptor (line 203) | void destroyTensorDescriptor(cudnnTensorDescriptor_t descriptor) {
type StorageType (line 228) | typedef uint16_t StorageType;
function _fwdKernelLauncher (line 262) | void _fwdKernelLauncher(cudaStream_t stream, NhwcBatchNormFwdParams params,
function _bwdKernelLauncher (line 332) | void _bwdKernelLauncher(cudaStream_t stream, NhwcBatchNormBwdParams params,
function smem_driven_bwd_occupancy (line 409) | static int smem_driven_bwd_occupancy(int device_id, const int max_cta_pe...
function std (line 418) | const std::vector<size_t> NhwcBatchNormAddRelu::numWorkspaceBytes() const {
function _setFwdParams (line 456) | void NhwcBatchNormAddRelu::_setFwdParams(NhwcBatchNormFwdParams *params)...
function _setFwdInferenceParams (line 480) | void NhwcBatchNormAddRelu::_setFwdInferenceParams(NhwcBatchNormFwdInfere...
function _setBwdParams (line 494) | void NhwcBatchNormAddRelu::_setBwdParams(NhwcBatchNormBwdParams *params)...
function fwdInference (line 515) | void NhwcBatchNormAddRelu::fwdInference(cudaStream_t stream) {
function dim3 (line 552) | dim3 NhwcBatchNormAddRelu::calc_fwd_grid(int *loop, const int grid_dim_x) {
function dim3 (line 575) | dim3 NhwcBatchNormAddRelu::calc_bwd_grid(int *loop, const int grid_dim_x) {
function fwd (line 598) | void NhwcBatchNormAddRelu::fwd(cudaStream_t stream, void* my_data, void*...
function dgrad (line 640) | void NhwcBatchNormAddRelu::dgrad(cudaStream_t stream, void* my_data, voi...
FILE: KoSimCSE/apex/contrib/csrc/groupbn/cuda_utils.h
function namespace (line 5) | namespace at {
FILE: KoSimCSE/apex/contrib/csrc/groupbn/interface.cpp
function PYBIND11_MODULE (line 154) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h
type T (line 43) | typedef T Type;
type Type (line 51) | typedef int Type;
function DEVICE_FUNCTION (line 247) | DEVICE_FUNCTION void write_to_gmem(float *gmem, int idx, const float (&s...
function DEVICE_FUNCTION (line 253) | DEVICE_FUNCTION void write_to_gmem(float *gmem, int idx, const float (&s...
function DEVICE_FUNCTION (line 259) | DEVICE_FUNCTION void scaled_write_to_gmem(float *gmem, int idx, const fl...
function DEVICE_FUNCTION (line 265) | DEVICE_FUNCTION void write_to_smem(float *smem, int idx, const float (&x...
function DEVICE_FUNCTION (line 271) | DEVICE_FUNCTION void write_to_smem(int *smem, int idx, const int (&x)[1]) {
function DEVICE_FUNCTION (line 277) | DEVICE_FUNCTION void write_to_smem(float *smem, int idx, const float (&x...
function DEVICE_FUNCTION (line 283) | DEVICE_FUNCTION void write_to_smem(int *smem, int idx, const int (&x)[2]) {
function Storage (line 351) | Storage relu(Storage in) {
function parallel_sums (line 544) | void parallel_sums(float *smem, float (&x)[ELEMENTS_PER_LDG], int nhw) {
type ParallelSums (line 637) | struct ParallelSums
type ParallelSums (line 650) | struct ParallelSums
function div_up (line 661) | static inline int div_up(int m, int n) {
function DEVICE_FUNCTION (line 668) | DEVICE_FUNCTION void inter_block_sync(int* gmem_retired_ctas, int expect...
type NhwcBatchNormFwdInferenceParams (line 697) | struct NhwcBatchNormFwdInferenceParams {
type NhwcBatchNormFwdParams (line 799) | struct NhwcBatchNormFwdParams {
type PackedStorage (line 870) | typedef PackedStorage<Storage, ELEMENTS_PER_LDG> PackedStorage_;
type typename (line 872) | typedef typename PackedStorage_::Type PackedStorageType;
type NhwcBatchNormBwdParams (line 1388) | struct NhwcBatchNormBwdParams {
function nhwc_batch_norm_bwd (line 1528) | void nhwc_batch_norm_bwd(NhwcBatchNormBwdParams params) {
function nhwc_batch_norm_bwd_relu (line 1892) | void nhwc_batch_norm_bwd_relu(NhwcBatchNormBwdParams params) {
function nhwc_batch_norm_bwd_add_relu (line 2280) | void nhwc_batch_norm_bwd_add_relu(NhwcBatchNormBwdParams params) {
FILE: KoSimCSE/apex/contrib/csrc/layer_norm/ln_api.cpp
function ln_fwd (line 15) | std::vector<at::Tensor> ln_fwd(const at::Tensor &x, // BxSxhidden_size
function ln_bwd (line 58) | std::vector<at::Tensor> ln_bwd(const at::Tensor &dw, // BxSxhidden_size
function PYBIND11_MODULE (line 102) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout.cpp
type multihead_attn (line 5) | namespace multihead_attn {
type fused_softmax (line 6) | namespace fused_softmax {
type additive_mask_softmax_dropout (line 7) | namespace additive_mask_softmax_dropout {
function fwd (line 31) | std::vector<torch::Tensor> fwd(
function bwd (line 57) | torch::Tensor bwd(
function PYBIND11_MODULE (line 87) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type encdec (line 5) | namespace encdec {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 43) | std::vector<torch::Tensor> fwd(
function bwd (line 88) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 153) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type encdec_norm_add (line 5) | namespace encdec_norm_add {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 52) | std::vector<torch::Tensor> fwd(
function bwd (line 105) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 194) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/layer_norm.h
function rsqrt (line 230) | float rsqrt(float v) {
function rsqrt (line 233) | double rsqrt(double v) {
function float (line 256) | struct SharedMemory <float>
function double (line 266) | struct SharedMemory <double>
function stream (line 653) | auto stream = at::cuda::getCurrentCUDAStream().stream();
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/masked_softmax_dropout.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type fused_softmax (line 5) | namespace fused_softmax {
type mask_softmax_dropout (line 6) | namespace mask_softmax_dropout {
function fwd (line 31) | std::vector<torch::Tensor> fwd(
function bwd (line 57) | torch::Tensor bwd(
function PYBIND11_MODULE (line 89) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/philox.h
function class (line 4) | class Philox {
function __device__ (line 17) | __device__ inline uint4 operator()() {
function __device__ (line 45) | __device__ inline void incr_n(unsigned long long n) {
function __device__ (line 58) | __device__ inline void incr() {
function mulhilo32 (line 67) | __device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
function __device__ (line 72) | __device__ inline uint4 single_round(uint4 ctr, uint2 key) {
function __device__ (line 87) | __device__ __inline__ float4 uniform4(uint4 x) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type self (line 5) | namespace self {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 39) | std::vector<torch::Tensor> fwd(
function bwd (line 75) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 128) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type self_bias (line 5) | namespace self_bias {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 43) | std::vector<torch::Tensor> fwd(
function bwd (line 82) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 135) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask.cpp
type multihead_attn (line 5) | namespace multihead_attn {
type self_bias_additive_mask (line 6) | namespace self_bias_additive_mask {
type cublas_gemmex (line 7) | namespace cublas_gemmex {
function fwd (line 46) | std::vector<torch::Tensor> fwd(
function bwd (line 86) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 139) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp
type multihead_attn (line 4) | namespace multihead_attn {
type self_norm_add (line 5) | namespace self_norm_add {
type cublas_gemmex (line 6) | namespace cublas_gemmex {
function fwd (line 47) | std::vector<torch::Tensor> fwd(
function bwd (line 93) | std::vector<torch::Tensor> bwd(
function PYBIND11_MODULE (line 169) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/softmax.h
function acc_t (line 139) | acc_t sum[WARP_BATCH] { 0.0f };
function acc_t (line 363) | acc_t sum[WARP_BATCH] { 0.0f };
function additive_masked_softmax_dropout_warp_forward (line 429) | void additive_masked_softmax_dropout_warp_forward(output_t *dst, uint8_t...
function softmax_warp_backward (line 2244) | void softmax_warp_backward(__half *gradInput, const __half *grad, const ...
function masked_softmax_warp_backward (line 2455) | void masked_softmax_warp_backward(__half *gradInput, const __half *grad,...
FILE: KoSimCSE/apex/contrib/csrc/multihead_attn/strided_batched_gemm.h
function cublasOperation_t (line 21) | cublasOperation_t convertTransToCublasOperation(char trans) {
function CublasStridedBatchedGemm (line 31) | void CublasStridedBatchedGemm(THCState *state, char transa, char transb,...
type cutlass (line 78) | typedef cutlass::gemm::Gemm<WmmaGemmTraits> Gemm;
function gemm_switch_fp32accum (line 149) | void gemm_switch_fp32accum(THCState *state, char transa, char transb, lo...
function adjustLdLevel3 (line 278) | void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int6...
function HgemmStridedBatched (line 312) | void HgemmStridedBatched(THCState *state, char transa, char transb, long...
FILE: KoSimCSE/apex/contrib/csrc/optimizers/fused_adam_cuda.cpp
function strided_check_finite (line 20) | void strided_check_finite(
function adam (line 29) | void adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tenso...
function reversible_adam (line 43) | void reversible_adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m...
function maybe_adam_undo (line 57) | void maybe_adam_undo(at::Tensor & overflow_flag, at::Tensor & p, at::Ten...
function maybe_cast (line 69) | void maybe_cast(at::Tensor & overflow_flag, at::Tensor & p_in, at::Tenso...
function PYBIND11_MODULE (line 78) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp
function PYBIND11_MODULE (line 19) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp
function PYBIND11_MODULE (line 17) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp
function PYBIND11_MODULE (line 31) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/transducer/transducer_joint.cpp
function transducer_joint_forward (line 33) | std::vector<torch::Tensor> transducer_joint_forward(
function transducer_joint_backward (line 67) | std::vector<torch::Tensor> transducer_joint_backward(
function PYBIND11_MODULE (line 95) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/transducer/transducer_loss.cpp
function transducer_loss_forward (line 35) | std::vector<torch::Tensor> transducer_loss_forward(
function transducer_loss_backward (line 65) | torch::Tensor transducer_loss_backward(
function PYBIND11_MODULE (line 106) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/csrc/xentropy/interface.cpp
function softmax_xentropy_forward (line 24) | std::vector<at::Tensor> softmax_xentropy_forward(
function softmax_xentropy_backward (line 35) | at::Tensor softmax_xentropy_backward(
function PYBIND11_MODULE (line 49) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: KoSimCSE/apex/contrib/fmha/fmha.py
class FMHAFun (line 33) | class FMHAFun(torch.autograd.Function):
method forward (line 35) | def forward(ctx, qkv, cu_seqlens, p_dropout, max_s, is_training):
method backward (line 48) | def backward(ctx, dout):
class FMHA (line 58) | class FMHA(torch.nn.Module):
method __init__ (line 60) | def __init__(self, config):
method forward (line 70) | def forward(self, qkv, cu_seqlens, max_s, is_training=True):
FILE: KoSimCSE/apex/contrib/groupbn/batch_norm.py
class bn_NHWC_impl (line 7) | class bn_NHWC_impl(torch.autograd.Function):
method forward (line 9) | def forward(ctx, x, s, b, rm, riv, mini_m, mini_riv, ret_cta, mom, eps...
method backward (line 32) | def backward(ctx, grad_y):
class bn_addrelu_NHWC_impl (line 53) | class bn_addrelu_NHWC_impl(torch.autograd.Function):
method forward (line 55) | def forward(ctx, x, z, s, b, rm, riv, mini_m, mini_riv, grid_dim_y, re...
method backward (line 78) | def backward(ctx, grad_y):
class BatchNorm2d_NHWC (line 101) | class BatchNorm2d_NHWC(_BatchNorm):
method __init__ (line 103) | def __init__(self, num_features, fuse_relu=False, bn_group=1, max_cta_...
method forward (line 196) | def forward(self, x, z=None):
method __del__ (line 219) | def __del__(self):
FILE: KoSimCSE/apex/contrib/layer_norm/layer_norm.py
class FastLayerNormFN (line 6) | class FastLayerNormFN(torch.autograd.Function):
method forward (line 8) | def forward(ctx, x, gamma, beta, epsilon):
method backward (line 19) | def backward(ctx, dy):
class FastLayerNorm (line 31) | class FastLayerNorm(torch.nn.Module):
method __init__ (line 32) | def __init__(self, hidden_size, eps=1e-5):
method reset_parameters (line 39) | def reset_parameters(self):
method forward (line 43) | def forward(self, x):
FILE: KoSimCSE/apex/contrib/multihead_attn/encdec_multihead_attn.py
function jit_dropout_add (line 19) | def jit_dropout_add(x, residual, prob, is_training):
class EncdecMultiheadAttn (line 26) | class EncdecMultiheadAttn(nn.Module):
method __init__ (line 31) | def __init__(self, embed_dim, num_heads, dropout=0., bias=False, inclu...
method reset_parameters (line 79) | def reset_parameters(self):
method forward (line 98) | def forward(self, query, key, value, key_padding_mask=None, need_weigh...
FILE: KoSimCSE/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
class EncdecAttnFunc (line 5) | class EncdecAttnFunc(torch.autograd.Function):
method forward (line 7) | def forward(ctx, use_time_mask, is_training, heads, scale, inputs_q, i...
method backward (line 135) | def backward(ctx, output_grads):
FILE: KoSimCSE/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
class FastEncdecAttnFunc (line 5) | class FastEncdecAttnFunc(torch.autograd.Function):
method forward (line 7) | def forward(ctx, use_time_mask, is_training, heads, inputs_q, inputs_k...
method backward (line 50) | def backward(ctx, output_grads):
FILE: KoSimCSE/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
class FastEncdecAttnNormAddFunc (line 12) | class FastEncdecAttnNormAddFunc(torch.autograd.Function):
method forward (line 14) | def forward(ctx, use_time_mask, is_training, heads, inputs_q, inputs_k...
method backward (line 69) | def backward(ctx, output_grads):
FILE: KoSimCSE/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
class FastSelfAttnFunc (line 6) | class FastSelfAttnFunc(torch.autograd.Function) :
method forward (line 8) | def forward(ctx, use_time_mask, is_training, heads, inputs, input_weig...
method backward (line 120) | def backward(ctx, output_grads):
FILE: KoSimCSE/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
class FastSelfAttnNormAddFunc (line 5) | class FastSelfAttnNormAddFunc(torch.autograd.Function):
method forward (line 7) | def forward(ctx, use_time_mask, is_training, heads, inputs, lyr_nrm_ga...
method backward (line 56) | def backward(ctx, output_grads):
FILE: KoSimCSE/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
class MaskSoftmaxDropout (line 6) | class MaskSoftmaxDropout(torch.autograd.Function) :
method forward (line 8) | def forward(ctx, is_training, heads, inputs, pad_mask, mask_additive, ...
method backward (line 51) | def backward(ctx, output_grads):
FILE: KoSimCSE/apex/contrib/multihead_attn/self_multihead_attn.py
function jit_dropout_add (line 19) | def jit_dropout_add(x, residual, prob, is_training):
class SelfMultiheadAttn (line 26) | class SelfMultiheadAttn(nn.Module):
method __init__ (line 31) | def __init__(self, embed_dim, num_heads, dropout=0., bias=False, inclu...
method reset_parameters (line 97) | def reset_parameters(self):
method forward (line 124) | def forward(self, query, key, value, key_padding_mask=None, need_weigh...
FILE: KoSimCSE/apex/contrib/multihead_attn/self_multihead_attn_func.py
class SelfAttnFunc (line 4) | class SelfAttnFunc(torch.autograd.Function):
method forward (line 6) | def forward(ctx, use_time_mask, is_training, heads, scale, inputs,
method backward (line 121) | def backward(ctx, output_grads):
FILE: KoSimCSE/apex/contrib/optimizers/distributed_fused_adam.py
class DistributedFusedAdam (line 9) | class DistributedFusedAdam(torch.optim.Optimizer):
method __init__ (line 55) | def __init__(self, params,
method _first_step_init (line 128) | def _first_step_init(self):
method _init_everything (line 373) | def _init_everything(self):
method set_last_step (line 378) | def set_last_step(self, last_step):
method _get_flush_block (line 381) | def _get_flush_block(self):
method _pipeline_block_reductions (line 397) | def _pipeline_block_reductions(self, block_id):
method __launch_step_kernel (line 443) | def __launch_step_kernel(self):
method _pipeline_step (line 469) | def _pipeline_step(self):
method _flatten_grad_mt (line 479) | def _flatten_grad_mt(self, scale):
method _do_overlapped_reduction (line 489) | def _do_overlapped_reduction(self, param_i, param_grads_size, param_of...
method set_global_scale (line 504) | def set_global_scale(self, global_scale):
method global_scale (line 510) | def global_scale(self):
method has_overflow (line 514) | def has_overflow(self):
method peek_overflow (line 523) | def peek_overflow(self):
method strided_check_finite (line 529) | def strided_check_finite(self, output_params, stride=1, start=-1, end=...
method L2_grad_norm (line 545) | def L2_grad_norm(self):
method complete_reductions (line 552) | def complete_reductions(self):
method step (line 577) | def step(self, closure=None):
method state_dict (line 598) | def state_dict(self):
method load_state_dict (line 615) | def load_state_dict(self, state_dict):
FILE: KoSimCSE/apex/contrib/optimizers/distributed_fused_adam_v2.py
class DistributedFusedAdamV2 (line 7) | class DistributedFusedAdamV2(torch.optim.Optimizer):
method __init__ (line 43) | def __init__(self, params,
method set_last_step (line 351) | def set_last_step(self, last_step):
method _get_flush_block (line 354) | def _get_flush_block(self):
method _pipeline_block_reductions (line 370) | def _pipeline_block_reductions(self, block_id):
method __launch_step_kernel (line 406) | def __launch_step_kernel(self, p, p_copy, m, v, g):
method _pipeline_block_step (line 425) | def _pipeline_block_step(self, block_id):
method _pipeline_step (line 445) | def _pipeline_step(self):
method _flatten_grad_mt (line 460) | def _flatten_grad_mt(self, scale):
method _do_overlapped_reduction (line 470) | def _do_overlapped_reduction(self, param_i, param_grads_size, param_of...
method set_global_scale (line 487) | def set_global_scale(self, global_scale):
method global_scale (line 493) | def global_scale(self):
method has_overflow (line 497) | def has_overflow(self):
method peek_overflow (line 506) | def peek_overflow(self):
method strided_check_finite (line 512) | def strided_check_finite(self, output_params, stride=1, start=-1, end=...
method L2_grad_norm (line 528) | def L2_grad_norm(self):
method complete_reductions (line 535) | def complete_reductions(self):
method revert_step (line 560) | def revert_step(self):
method step (line 586) | def step(self, closure=None, skip_overflow_check=False):
FILE: KoSimCSE/apex/contrib/optimizers/distributed_fused_adam_v3.py
class DistributedFusedAdamV3 (line 7) | class DistributedFusedAdamV3(torch.optim.Optimizer):
method __init__ (line 43) | def __init__(self, params,
method has_overflow (line 196) | def has_overflow(self):
method set_last_step (line 199) | def set_last_step(self, last_step):
method _get_flush_block (line 202) | def _get_flush_block(self):
method __launch_step_kernel (line 218) | def __launch_step_kernel(self, p, p_copy, m, v, g):
method _flatten_grad_mt (line 237) | def _flatten_grad_mt(self, scale):
method _do_overlapped_reduction (line 247) | def _do_overlapped_reduction(self, param_i, param_grads_size, param_of...
method set_global_scale (line 268) | def set_global_scale(self, global_scale):
method global_scale (line 274) | def global_scale(self):
method L2_grad_norm (line 278) | def L2_grad_norm(self):
method complete_reductions (line 282) | def complete_reductions(self):
method step (line 306) | def step(self, closure=None, skip_overflow_check=False):
FILE: KoSimCSE/apex/contrib/optimizers/distributed_fused_lamb.py
class DistributedFusedLAMB (line 9) | class DistributedFusedLAMB(torch.optim.Optimizer):
class AtomicCounter (line 70) | class AtomicCounter(object):
method __init__ (line 71) | def __init__(self):
method add (line 77) | def add(self, idx):
method __init__ (line 82) | def __init__(self, params,
method _lazy_init_stage1 (line 210) | def _lazy_init_stage1(self):
method _lazy_init_stage2 (line 330) | def _lazy_init_stage2(self):
method set_is_accumulation_step (line 451) | def set_is_accumulation_step(self, is_accumulation_step):
method set_last_step (line 454) | def set_last_step(self, last_step):
method _get_flush_block (line 457) | def _get_flush_block(self):
method _pipeline_block_reductions (line 473) | def _pipeline_block_reductions(self, block_id):
method __compute_contrib_param_norm (line 556) | def __compute_contrib_param_norm(self):
method __compute_contrib_update_norm (line 569) | def __compute_contrib_update_norm(self):
method _pipeline_step (line 577) | def _pipeline_step(self):
method _flatten_grad_mt (line 633) | def _flatten_grad_mt(self, scale):
method _do_overlapped_reduction (line 651) | def _do_overlapped_reduction(self, param_i, param):
method set_global_scale (line 667) | def set_global_scale(self, global_scale):
method global_scale (line 673) | def global_scale(self):
method L2_grad_norm (line 677) | def L2_grad_norm(self):
method complete_reductions (line 681) | def complete_reductions(self):
method step (line 704) | def step(self, closure=None, grad_scaler=None):
method state_dict (line 740) | def state_dict(self):
method load_state_dict (line 757) | def load_state_dict(self, state_dict):
FILE: KoSimCSE/apex/contrib/optimizers/fp16_optimizer.py
class FP16_Optimizer (line 4) | class FP16_Optimizer(object):
method __init__ (line 25) | def __init__(self,
method zero_grad (line 79) | def zero_grad(self, set_grads_to_None=True):
method step (line 94) | def step(self, closure=None):
method backward (line 132) | def backward(self, loss):
method _update_scale (line 142) | def _update_scale(self, skip):
method _get_state (line 161) | def _get_state(self):
method _set_state (line 164) | def _set_state(self, value):
method _get_param_groups (line 171) | def _get_param_groups(self):
method _set_param_groups (line 174) | def _set_param_groups(self, value):
method state_dict (line 179) | def state_dict(self):
method load_state_dict (line 202) | def load_state_dict(self, state_dict):
FILE: KoSimCSE/apex/contrib/optimizers/fused_adam.py
class FusedAdam (line 6) | class FusedAdam(torch.optim.Optimizer):
method __init__ (line 38) | def __init__(self, params,
method step (line 64) | def step(self, closure=None, grads=None, output_params=None, scale=1.,...
FILE: KoSimCSE/apex/contrib/optimizers/fused_lamb.py
class FusedLAMB (line 6) | class FusedLAMB(torch.optim.Optimizer):
method __init__ (line 63) | def __init__(self, params, lr=1e-3, bias_correction=True,
method zero_grad (line 87) | def zero_grad(self):
method step (line 95) | def step(self, closure=None):
FILE: KoSimCSE/apex/contrib/optimizers/fused_sgd.py
class FusedSGD (line 7) | class FusedSGD(Optimizer):
method __init__ (line 66) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 93) | def __setstate__(self, state):
method get_momentums (line 98) | def get_momentums(self, params):
method step (line 115) | def step(self, closure=None, grads=None, output_params=None, scale=1.,...
FILE: KoSimCSE/apex/contrib/sparsity/asp.py
function eligible_modules (line 12) | def eligible_modules(model, whitelist_layer_types, allowed_layer_names, ...
class ASP (line 21) | class ASP:
method init_model_for_pruning (line 29) | def init_model_for_pruning(cls, model, mask_calculator="m4n2_1d",
method init_optimizer_for_pruning (line 127) | def init_optimizer_for_pruning(cls, optimizer):
method compute_sparse_masks (line 155) | def compute_sparse_masks(cls):
method restore_pruned_weights (line 176) | def restore_pruned_weights(cls):
method is_sparsity_enabled (line 191) | def is_sparsity_enabled(cls):
method prune_trained_model (line 212) | def prune_trained_model(cls, model, optimizer):
FILE: KoSimCSE/apex/contrib/sparsity/sparse_masklib.py
function fill (line 9) | def fill(x):
function reshape_1d (line 13) | def reshape_1d(matrix, m):
function compute_valid_1d_patterns (line 25) | def compute_valid_1d_patterns(m,n):
function mn_1d_best (line 37) | def mn_1d_best(matrix, m, n):
function m4n2_1d (line 49) | def m4n2_1d(mat, density):
function mn_2d_greedy (line 67) | def mn_2d_greedy(matrix, m, n):
function m4n2_2d_greedy (line 98) | def m4n2_2d_greedy(mat, density):
function compute_valid_2d_patterns (line 103) | def compute_valid_2d_patterns(m,n):
function mn_2d_best (line 122) | def mn_2d_best(matrix, m, n):
function m4n2_2d_best (line 140) | def m4n2_2d_best(mat, density):
function create_mask (line 145) | def create_mask(tensor, pattern="m4n2_1d", density=0.5):
FILE: KoSimCSE/apex/contrib/sparsity/test/checkpointing_test_part1.py
function build_model (line 7) | def build_model(args):
function train_step (line 21) | def train_step(args, model, optimizer, input_batch, target_batch, step):
function train_loop (line 31) | def train_loop(args, model, optimizer, step, num_steps):
function main (line 38) | def main(args):
class Args (line 76) | class Args:
FILE: KoSimCSE/apex/contrib/sparsity/test/checkpointing_test_part2.py
function build_model (line 7) | def build_model(args):
function train_step (line 21) | def train_step(args, model, optimizer, input_batch, target_batch, step):
function train_loop (line 31) | def train_loop(args, model, optimizer, step, num_steps):
function main (line 38) | def main(step, args, model_state_dict, optimizer_state_dict):
class Args (line 61) | class Args:
FILE: KoSimCSE/apex/contrib/sparsity/test/checkpointing_test_reference.py
function build_model (line 11) | def build_model(args):
function train_step (line 25) | def train_step(args, model, optimizer, input_batch, target_batch, step):
function train_loop (line 35) | def train_loop(args, model, optimizer, step, num_steps):
function main (line 42) | def main(args):
class Args (line 79) | class Args:
FILE: KoSimCSE/apex/contrib/sparsity/test/toy_problem.py
function build_model (line 7) | def build_model(args):
function train_step (line 21) | def train_step(args, model, optimizer, input_batch, target_batch, step):
function train_loop (line 31) | def train_loop(args, model, optimizer, step, num_steps):
function main (line 38) | def main(args):
class Args (line 75) | class Args:
FILE: KoSimCSE/apex/contrib/test/fmha/test_fmha.py
function py_mha (line 37) | def py_mha(qkv, amask, b, s, h, d):
class TestFMHA (line 52) | class TestFMHA(unittest.TestCase):
method run_test (line 54) | def run_test(self, s, b):
method test_128 (line 106) | def test_128(self):
method test_256 (line 109) | def test_256(self):
method test_384 (line 112) | def test_384(self):
method test_512 (line 115) | def test_512(self):
FILE: KoSimCSE/apex/contrib/test/layer_norm/test_fast_layer_norm.py
class GPUTimer (line 12) | class GPUTimer:
method __init__ (line 13) | def __init__(self, stream):
method start (line 17) | def start(self):
method stop (line 19) | def stop(self):
method sync (line 21) | def sync(self):
method millis (line 23) | def millis(self):
function size_in_bytes (line 26) | def size_in_bytes(t):
function abs_err (line 28) | def abs_err(x, y):
class TestFastLayerNorm (line 35) | class TestFastLayerNorm(unittest.TestCase):
method setUp (line 37) | def setUp(self, seed=1234):
method test_ln_fp32 (line 42) | def test_ln_fp32(self):
method test_ln_fp16 (line 44) | def test_ln_fp16(self):
method run_test_layer_norm (line 47) | def run_test_layer_norm(self, dtype, atol, rtol=1e-5):
method test_performance (line 94) | def test_performance(self):
FILE: KoSimCSE/apex/contrib/test/multihead_attn/test_encdec_multihead_attn.py
class EncdecMultiheadAttnTest (line 7) | class EncdecMultiheadAttnTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_encdec_multihead_attn (line 49) | def test_encdec_multihead_attn(self) :
method test_encdec_multihead_attn_time_mask (line 76) | def test_encdec_multihead_attn_time_mask(self) :
method test_encdec_multihead_attn_pad_mask (line 105) | def test_encdec_multihead_attn_pad_mask(self) :
FILE: KoSimCSE/apex/contrib/test/multihead_attn/test_encdec_multihead_attn_norm_add.py
class EncdecMultiheadAttnNormAddTest (line 7) | class EncdecMultiheadAttnNormAddTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_encdec_multihead_attn_norm_add (line 49) | def test_encdec_multihead_attn_norm_add(self) :
FILE: KoSimCSE/apex/contrib/test/multihead_attn/test_fast_self_multihead_attn_bias.py
class SelfMultiheadAttnTest (line 7) | class SelfMultiheadAttnTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_self_multihead_attn_additive_mask (line 48) | def test_self_multihead_attn_additive_mask(self) :
FILE: KoSimCSE/apex/contrib/test/multihead_attn/test_mha_fused_softmax.py
class FusedSoftmaxTest (line 6) | class FusedSoftmaxTest(unittest.TestCase):
method setUp (line 7) | def setUp(self, seed=1234):
method test_fused_softmax (line 24) | def test_fused_softmax(self) :
FILE: KoSimCSE/apex/contrib/test/multihead_attn/test_self_multihead_attn.py
class SelfMultiheadAttnTest (line 7) | class SelfMultiheadAttnTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_self_multihead_attn (line 45) | def test_self_multihead_attn(self) :
method test_self_multihead_attn_time_mask (line 71) | def test_self_multihead_attn_time_mask(self) :
method test_self_multihead_attn_pad_mask (line 100) | def test_self_multihead_attn_pad_mask(self) :
FILE: KoSimCSE/apex/contrib/test/multihead_attn/test_self_multihead_attn_norm_add.py
class SelfMultiheadAttnNormAddTest (line 7) | class SelfMultiheadAttnNormAddTest(unittest.TestCase):
method setUp (line 8) | def setUp(self, seed=1234):
method test_self_multihead_attn_norm_add (line 45) | def test_self_multihead_attn_norm_add(self) :
FILE: KoSimCSE/apex/contrib/test/test_label_smoothing.py
function label_smoothing_raw (line 10) | def label_smoothing_raw(x, target, padding_idx, smoothing):
function label_smoothing_opt_1 (line 20) | def label_smoothing_opt_1(x, target, padding_idx, smoothing):
class LabelSmoothingTest (line 30) | class LabelSmoothingTest(unittest.TestCase):
method setUp (line 31) | def setUp(self, seed=1234):
method gen_test_inputs (line 40) | def gen_test_inputs(self, N, T, H, smoothing, padding_idx):
method print_max_diff_elem (line 50) | def print_max_diff_elem(self, ref, tst):
method test_label_smoothing_function (line 57) | def test_label_smoothing_function(self):
method test_label_smoothing_perf (line 91) | def test_label_smoothing_perf(self):
FILE: KoSimCSE/apex/contrib/test/transducer/test_transducer_joint.py
class TransducerJointTest (line 6) | class TransducerJointTest(unittest.TestCase):
method setUp (line 7) | def setUp(self, seed=1234):
method gen_input (line 11) | def gen_input(self, for_vector_kernel):
method _pack (line 41) | def _pack(self, x, f_len, g_len):
method _unpack (line 53) | def _unpack(self, x, f_len, g_len):
method run_transducer_joint (line 67) | def run_transducer_joint(self, for_vector_kernel, pack_output, relu, d...
method test_transducer_joint (line 118) | def test_transducer_joint(self):
method test_transducer_joint_vec (line 121) | def test_transducer_joint_vec(self):
method test_transducer_joint_pack (line 124) | def test_transducer_joint_pack(self):
method test_transducer_joint_vec_pack (line 127) | def test_transducer_joint_vec_pack(self):
method test_transducer_joint_relu (line 130) | def test_transducer_joint_relu(self):
method test_transducer_joint_vec_relu (line 133) | def test_transducer_joint_vec_relu(self):
method test_transducer_joint_pack_relu (line 136) | def test_transducer_joint_pack_relu(self):
method test_transducer_joint_vec_pack_relu (line 139) | def test_transducer_joint_vec_pack_relu(self):
method test_transducer_joint_relu_dropout (line 142) | def test_transducer_joint_relu_dropout(self):
method test_transducer_joint_vec_relu_dropout (line 145) | def test_transducer_joint_vec_relu_dropout(self):
method test_transducer_joint_pack_relu_dropout (line 148) | def test_transducer_joint_pack_relu_dropout(self):
method test_transducer_joint_vec_pack_relu_dropout (line 151) | def test_transducer_joint_vec_pack_relu_dropout(self):
FILE: KoSimCSE/apex/contrib/test/transducer/test_transducer_loss.py
class TransducerLossTest (line 6) | class TransducerLossTest(unittest.TestCase):
method setUp (line 7) | def setUp(self, seed=1234):
method gen_input (line 11) | def gen_input(self, scalar_t, for_vector_kernel):
method _pack (line 41) | def _pack(self, x):
method _unpack (line 52) | def _unpack(self, x):
method run_transducer_loss (line 64) | def run_transducer_loss(self, scalar_t, fuse_softmax_backward, packed_...
method test_transducer_loss_fp32 (line 90) | def test_transducer_loss_fp32(self):
method test_transducer_loss_fp16 (line 98) | def test_transducer_loss_fp16(self):
method test_transducer_loss_fp16_backward_fusion (line 106) | def test_transducer_loss_fp16_backward_fusion(self):
method test_transducer_loss_fp16_backward_fusion_packed (line 114) | def test_transducer_loss_fp16_backward_fusion_packed(self):
method test_transducer_loss_fp16_backward_fusion_packed_vec (line 122) | def test_transducer_loss_fp16_backward_fusion_packed_vec(self):
FILE: KoSimCSE/apex/contrib/test/transducer/transducer_ref.py
function transducer_loss_reference (line 5) | def transducer_loss_reference(x, label, f_len, y_len, blank_idx, loss_gr...
function transducer_joint_reference (line 79) | def transducer_joint_reference(f, g, h_grad, f_len, g_len, pack_output, ...
FILE: KoSimCSE/apex/contrib/transducer/transducer.py
class TransducerJoint (line 5) | class TransducerJoint(torch.nn.Module):
method __init__ (line 27) | def __init__(self, pack_output=False, relu=False, dropout=False, opt=1...
method forward (line 43) | def forward(self, f, g, f_len, g_len, batch_offset=None, packed_batch=0):
class TransducerLoss (line 68) | class TransducerLoss(torch.nn.Module):
method __init__ (line 81) | def __init__(self, fuse_softmax_backward=True, opt=1, packed_input=Fal...
method forward (line 89) | def forward(self, x, label, f_len, y_len, blank_idx, batch_offset=None...
class TransducerLossFunc (line 127) | class TransducerLossFunc(torch.autograd.Function):
method forward (line 129) | def forward(ctx, x, label, f_len, y_len, batch_offset, max_f_len, blan...
method backward (line 149) | def backward(ctx, loss_grad):
class TransducerJointFunc (line 158) | class TransducerJointFunc(torch.autograd.Function):
method forward (line 160) | def forward(ctx, f, g, f_len, g_len, pack_output, relu, dropout, batch...
method backward (line 180) | def backward(ctx, loss_grad):
FILE: KoSimCSE/apex/contrib/xentropy/softmax_xentropy.py
class SoftmaxCrossEntropyLoss (line 4) | class SoftmaxCrossEntropyLoss(torch.autograd.Function):
method forward (line 6) | def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to...
method backward (line 18) | def backward(ctx, grad_loss):
FILE: KoSimCSE/apex/fp16_utils/fp16_optimizer.py
class FP16_Optimizer (line 13) | class FP16_Optimizer(object):
method __init__ (line 14) | def __init__(self,
method maybe_print (line 110) | def maybe_print(self, msg):
method __getstate__ (line 114) | def __getstate__(self):
method __setstate__ (line 117) | def __setstate__(self, state):
method zero_grad (line 120) | def zero_grad(self, set_grads_to_None=False):
method _master_params_to_model_params (line 160) | def _master_params_to_model_params(self):
method clip_master_grads (line 185) | def clip_master_grads(self, max_norm, norm_type=2):
method state_dict (line 209) | def state_dict(self):
method load_state_dict (line 230) | def load_state_dict(self, state_dict):
method step (line 272) | def step(self, closure=None): # could add clip option.
method _step_with_closure (line 334) | def _step_with_closure(self, closure):
method backward (line 373) | def backward(self, loss, update_master_grads=True, retain_graph=False):
method update_master_grads (line 436) | def update_master_grads(self):
method inspect_master_grad_data (line 493) | def inspect_master_grad_data(self):
method _get_loss_scale (line 528) | def _get_loss_scale(self):
method _set_loss_scale (line 531) | def _set_loss_scale(self, value):
method _get_state (line 537) | def _get_state(self):
method _set_state (line 540) | def _set_state(self, value):
method _get_param_groups (line 547) | def _get_param_groups(self):
method _set_param_groups (line 550) | def _set_param_groups(self, value):
FILE: KoSimCSE/apex/fp16_utils/fp16util.py
class tofp16 (line 7) | class tofp16(nn.Module):
method __init__ (line 15) | def __init__(self):
method forward (line 18) | def forward(self, input):
function BN_convert_float (line 22) | def BN_convert_float(module):
function network_to_half (line 35) | def network_to_half(network):
function convert_module (line 44) | def convert_module(module, dtype):
function convert_network (line 60) | def convert_network(network, dtype):
class FP16Model (line 73) | class FP16Model(nn.Module):
method __init__ (line 78) | def __init__(self, network):
method forward (line 82) | def forward(self, *inputs):
function backwards_debug_hook (line 87) | def backwards_debug_hook(grad):
function prep_param_lists (line 90) | def prep_param_lists(model, flat_master=False):
function model_grads_to_master_grads (line 136) | def model_grads_to_master_grads(model_params, master_params, flat_master...
function master_params_to_model_params (line 158) | def master_params_to_model_params(model_params, master_params, flat_mast...
function to_python_float (line 176) | def to_python_float(t):
FILE: KoSimCSE/apex/fp16_utils/loss_scaler.py
function to_python_float (line 4) | def to_python_float(t):
class LossScaler (line 10) | class LossScaler:
method __init__ (line 22) | def __init__(self, scale=1):
method has_overflow (line 26) | def has_overflow(self, params):
method _has_inf_or_nan (line 30) | def _has_inf_or_nan(x):
method update_scale (line 33) | def update_scale(self, overflow):
method loss_scale (line 37) | def loss_scale(self):
method scale_gradient (line 40) | def scale_gradient(self, module, grad_in, grad_out):
method backward (line 43) | def backward(self, loss, retain_graph=False):
class DynamicLossScaler (line 47) | class DynamicLossScaler:
method __init__ (line 73) | def __init__(self,
method has_overflow (line 84) | def has_overflow(self, params):
method _has_inf_or_nan (line 92) | def _has_inf_or_nan(x):
method update_scale (line 113) | def update_scale(self, overflow):
method loss_scale (line 124) | def loss_scale(self):
method scale_gradient (line 127) | def scale_gradient(self, module, grad_in, grad_out):
method backward (line 130) | def backward(self, loss, retain_graph=False):
FILE: KoSimCSE/apex/mlp/mlp.py
class MlpFunction (line 8) | class MlpFunction(torch.autograd.Function):
method forward (line 10) | def forward(ctx, bias, activation, *args):
method backward (line 19) | def backward(ctx, grad_o):
class MLP (line 26) | class MLP(torch.nn.Module):
method __init__ (line 34) | def __init__(self, mlp_sizes, bias=True, activation='relu'):
method reset_parameters (line 64) | def reset_parameters(self):
method forward (line 74) | def forward(self, input):
method extra_repr (line 77) | def extra_repr(self):
FILE: KoSimCSE/apex/multi_tensor_apply/multi_tensor_apply.py
class MultiTensorApply (line 3) | class MultiTensorApply(object):
method __init__ (line 7) | def __init__(self, chunk_size):
method check_avail (line 16) | def check_avail(self):
method __call__ (line 24) | def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
FILE: KoSimCSE/apex/normalization/fused_layer_norm.py
class FusedLayerNormAffineFunction (line 12) | class FusedLayerNormAffineFunction(torch.autograd.Function):
method forward (line 15) | def forward(ctx, input, weight, bias, normalized_shape, eps):
method backward (line 30) | def backward(ctx, grad_output):
class FusedLayerNormFunction (line 39) | class FusedLayerNormFunction(torch.autograd.Function):
method forward (line 42) | def forward(ctx, input, normalized_shape, eps):
method backward (line 55) | def backward(ctx, grad_output):
function fused_layer_norm_affine (line 64) | def fused_layer_norm_affine(input, normalized_shape, weight, bias, eps=1...
function fused_layer_norm (line 67) | def fused_layer_norm(input, normalized_shape, eps=1e-6):
class FusedLayerNorm (line 70) | class FusedLayerNorm(torch.nn.Module):
method __init__ (line 129) | def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
method reset_parameters (line 148) | def reset_parameters(self):
method forward (line 153) | def forward(self, input):
method extra_repr (line 163) | def extra_repr(self):
FILE: KoSimCSE/apex/optimizers/fused_adagrad.py
class FusedAdagrad (line 5) | class FusedAdagrad(torch.optim.Optimizer):
method __init__ (line 43) | def __init__(self, params, lr=1e-2, eps=1e-10,
method zero_grad (line 59) | def zero_grad(self):
method step (line 67) | def step(self, closure=None):
FILE: KoSimCSE/apex/optimizers/fused_adam.py
class FusedAdam (line 4) | class FusedAdam(torch.optim.Optimizer):
method __init__ (line 63) | def __init__(self, params, lr=1e-3, bias_correction=True,
method zero_grad (line 82) | def zero_grad(self):
method step (line 90) | def step(self, closure=None, grads=None, output_params=None, scale=Non...
FILE: KoSimCSE/apex/optimizers/fused_lamb.py
class FusedLAMB (line 4) | class FusedLAMB(torch.optim.Optimizer):
method __init__ (line 63) | def __init__(self, params, lr=1e-3, bias_correction=True,
method zero_grad (line 88) | def zero_grad(self):
method step (line 96) | def step(self, closure=None):
FILE: KoSimCSE/apex/optimizers/fused_novograd.py
class FusedNovoGrad (line 4) | class FusedNovoGrad(torch.optim.Optimizer):
method __init__ (line 67) | def __init__(self, params, lr=1e-3, bias_correction=True,
method zero_grad (line 92) | def zero_grad(self):
method load_state_dict (line 100) | def load_state_dict(self, state_dict):
method step (line 108) | def step(self, closure=None):
FILE: KoSimCSE/apex/optimizers/fused_sgd.py
class FusedSGD (line 6) | class FusedSGD(Optimizer):
method __init__ (line 76) | def __init__(self, params, lr=required, momentum=0, dampening=0,
method __setstate__ (line 108) | def __setstate__(self, state):
method zero_grad (line 113) | def zero_grad(self):
method get_momentums (line 121) | def get_momentums(self, params):
method step (line 138) | def step(self, closure=None):
FILE: KoSimCSE/apex/parallel/LARC.py
class LARC (line 5) | class LARC(object):
method __init__ (line 39) | def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1...
method __getstate__ (line 45) | def __getstate__(self):
method __setstate__ (line 48) | def __setstate__(self, state):
method state (line 52) | def state(self):
method __repr__ (line 55) | def __repr__(self):
method param_groups (line 59) | def param_groups(self):
method param_groups (line 63) | def param_groups(self, value):
method state_dict (line 66) | def state_dict(self):
method load_state_dict (line 69) | def load_state_dict(self, state_dict):
method zero_grad (line 72) | def zero_grad(self):
method add_param_group (line 75) | def add_param_group(self, param_group):
method step (line 78) | def step(self):
FILE: KoSimCSE/apex/parallel/__init__.py
function convert_syncbn_model (line 21) | def convert_syncbn_model(module, process_group=None, channel_last=False):
function create_syncbn_process_group (line 58) | def create_syncbn_process_group(group_size):
FILE: KoSimCSE/apex/parallel/distributed.py
function import_flatten_impl (line 13) | def import_flatten_impl():
function flatten (line 25) | def flatten(bucket):
function unflatten (line 30) | def unflatten(coalesced, bucket):
function apply_flat_dist_call (line 36) | def apply_flat_dist_call(bucket, call, extra_args=None):
function split_half_float_double (line 51) | def split_half_float_double(tensors):
function split_by_type (line 60) | def split_by_type(tensors):
function flat_dist_call (line 70) | def flat_dist_call(tensors, call, extra_args=None):
function extract_tensors (line 78) | def extract_tensors(maybe_tensor, tensor_list):
class Reducer (line 89) | class Reducer(object):
method __init__ (line 111) | def __init__(self, module_or_grads_list):
method reduce (line 121) | def reduce(self):
class DistributedDataParallel (line 129) | class DistributedDataParallel(Module):
method __init__ (line 162) | def __init__(self,
method __setstate__ (line 256) | def __setstate__(self, state):
method __getstate__ (line 268) | def __getstate__(self):
method enable_allreduce (line 275) | def enable_allreduce(self):
method disable_allreduce (line 278) | def disable_allreduce(self):
method sync_bucket_structure (line 283) | def sync_bucket_structure(self):
method create_hooks (line 319) | def create_hooks(self):
method _stream_this_bucket (line 411) | def _stream_this_bucket(self, bucket_idx):
method _event_this_bucket (line 418) | def _event_this_bucket(self, bucket_idx):
method allreduce_bucket (line 425) | def allreduce_bucket(self, bucket, bucket_idx, force_default_stream):
method allreduce_maybe_retain (line 478) | def allreduce_maybe_retain(self, bucket, bucket_idx, force_default_str...
method allreduce_fallback (line 491) | def allreduce_fallback(self):
method comm_ready_buckets (line 513) | def comm_ready_buckets(self, param):
method forward (line 559) | def forward(self, *inputs, **kwargs):
FILE: KoSimCSE/apex/parallel/multiproc.py
function docstring_hack (line 5) | def docstring_hack():
FILE: KoSimCSE/apex/parallel/optimized_sync_batchnorm.py
class SyncBatchNorm (line 9) | class SyncBatchNorm(_BatchNorm):
method __init__ (line 58) | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, ...
method _specify_process_group (line 64) | def _specify_process_group(self, process_group):
method _specify_channel_last (line 67) | def _specify_channel_last(self, channel_last):
method forward (line 70) | def forward(self, input, z = None):
FILE: KoSimCSE/apex/parallel/optimized_sync_batchnorm_kernel.py
class SyncBatchnormFunction (line 7) | class SyncBatchnormFunction(Function):
method forward (line 10) | def forward(ctx, input, z, weight, bias, running_mean, running_varianc...
method backward (line 75) | def backward(ctx, grad_output):
FILE: KoSimCSE/apex/parallel/sync_batchnorm.py
class SyncBatchNorm (line 9) | class SyncBatchNorm(_BatchNorm):
method __init__ (line 51) | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, ...
method _specify_process_group (line 65) | def _specify_process_group(self, process_group):
method forward (line 68) | def forward(self, input):
FILE: KoSimCSE/apex/parallel/sync_batchnorm_kernel.py
class SyncBatchnormFunction (line 7) | class SyncBatchnormFunction(Function):
method forward (line 10) | def forward(ctx, input, weight, bias, running_mean, running_variance, ...
method backward (line 33) | def backward(ctx, grad_output):
FILE: KoSimCSE/apex/pyprof/examples/custom_func_module/custom_function.py
class Foo (line 9) | class Foo(torch.autograd.Function):
method forward (line 11) | def forward(ctx, in1, in2):
method backward (line 16) | def backward(ctx, grad):
FILE: KoSimCSE/apex/pyprof/examples/custom_func_module/custom_module.py
class Foo (line 8) | class Foo(torch.nn.Module):
method __init__ (line 9) | def __init__(self, size):
method forward (line 14) | def forward(self, input):
FILE: KoSimCSE/apex/pyprof/examples/imagenet/imagenet.py
function parseArgs (line 17) | def parseArgs():
function main (line 89) | def main():
FILE: KoSimCSE/apex/pyprof/examples/jit/jit_script_function.py
function foo (line 11) | def foo(x, y):
FILE: KoSimCSE/apex/pyprof/examples/jit/jit_script_method.py
class Foo (line 7) | class Foo(torch.jit.ScriptModule):
method __init__ (line 8) | def __init__(self, size):
method forward (line 14) | def forward(self, input):
FILE: KoSimCSE/apex/pyprof/examples/jit/jit_trace_function.py
function foo (line 7) | def foo(x, y):
FILE: KoSimCSE/apex/pyprof/examples/jit/jit_trace_method.py
class Foo (line 7) | class Foo(torch.nn.Module):
method __init__ (line 8) | def __init__(self, size):
method forward (line 13) | def forward(self, input):
FILE: KoSimCSE/apex/pyprof/examples/lenet.py
class LeNet5 (line 12) | class LeNet5(nn.Module):
method __init__ (line 13) | def __init__(self):
method forward (line 24) | def forward(self, x):
method num_flat_features (line 35) | def num_flat_features(self, x):
FILE: KoSimCSE/apex/pyprof/examples/user_annotation/resnet.py
function conv3x3 (line 15) | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
function conv1x1 (line 20) | def conv1x1(in_planes, out_planes, stride=1):
class Bottleneck (line 24) | class Bottleneck(nn.Module):
method __init__ (line 28) | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
method forward (line 48) | def forward(self, x):
class ResNet (line 102) | class ResNet(nn.Module):
method __init__ (line 104) | def __init__(self, block, layers, num_classes=1000,
method _make_layer (line 134) | def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
method forward (line 158) | def forward(self, x):
function resnet50 (line 193) | def resnet50():
FILE: KoSimCSE/apex/pyprof/nvtx/nvmarker.py
function isfunc (line 27) | def isfunc(mod, f):
function traceMarker (line 46) | def traceMarker(stack):
function modMarker (line 56) | def modMarker(mod, fn_name, args):
function add_wrapper (line 67) | def add_wrapper(mod, fn_name):
function argMarker (line 110) | def argMarker(mod, op, args, kwargs):
function patchClass (line 201) | def patchClass(cls):
function init (line 206) | def init():
FILE: KoSimCSE/apex/pyprof/parse/db.py
class DB (line 3) | class DB(object):
method __init__ (line 9) | def __init__(self, dbFile):
method select (line 21) | def select(self, cmd):
method insert (line 36) | def insert(self, cmd, data):
method execute (line 46) | def execute(self, cmd):
method commit (line 56) | def commit(self):
method close (line 59) | def close(self):
FILE: KoSimCSE/apex/pyprof/parse/kernel.py
function demangle (line 5) | def demangle(name):
function encode_object_id (line 11) | def encode_object_id(pid, tid):
function getShortName (line 20) | def getShortName(name):
class Kernel (line 33) | class Kernel(object):
method __init__ (line 41) | def __init__(self):
method setKernelInfo (line 77) | def setKernelInfo(self, info):
method setKernelName (line 93) | def setKernelName(self, name):
method setRunTimeInfo (line 98) | def setRunTimeInfo(self, info):
method setMarkerInfo (line 107) | def setMarkerInfo(self, info):
method setDirection (line 111) | def setDirection(self):
method setOp (line 123) | def setOp(self):
method print (line 180) | def print(self):
FILE: KoSimCSE/apex/pyprof/parse/nvvp.py
class NVVP (line 3) | class NVVP(object):
method __init__ (line 14) | def __init__(self, db):
method getProfileStart (line 18) | def getProfileStart(self):
method getString (line 36) | def getString(self, id_):
method createMarkerTable (line 45) | def createMarkerTable(self):
method getCPUInfo (line 65) | def getCPUInfo(self, corrId):
method getKernelInfo (line 91) | def getKernelInfo(self):
method getMarkerInfo (line 99) | def getMarkerInfo(self, objId, startTime, endTime):
FILE: KoSimCSE/apex/pyprof/parse/parse.py
function parseArgs (line 15) | def parseArgs():
function main (line 25) | def main():
FILE: KoSimCSE/apex/pyprof/prof/activation.py
class Activation (line 5) | class Activation(OperatorLayerBase):
method __init__ (line 12) | def __init__(self, d):
method params (line 35) | def params(self):
method flops (line 39) | def flops(self):
method bytes (line 48) | def bytes(self):
method tc (line 58) | def tc(self):
method op (line 61) | def op(self):
method mod (line 64) | def mod(self):
FILE: KoSimCSE/apex/pyprof/prof/base.py
class OperatorLayerBase (line 3) | class OperatorLayerBase(ABC):
method tc (line 10) | def tc(self):
method params (line 18) | def params(self):
method flops (line 25) | def flops(self):
method bytes (line 32) | def bytes(self):
method mod (line 36) | def mod(self):
method op (line 43) | def op(self):
FILE: KoSimCSE/apex/pyprof/prof/blas.py
class Addmm (line 8) | class Addmm(OperatorLayerBase):
method __init__ (line 10) | def __init__(self, d):
method tc (line 63) | def tc(self):
method bytes (line 69) | def bytes(self):
method flops (line 73) | def flops(self):
method op (line 76) | def op(self):
method mod (line 79) | def mod(self):
method params (line 82) | def params(self):
class Bmm (line 86) | class Bmm(OperatorLayerBase):
method __init__ (line 88) | def __init__(self, d):
method tc (line 123) | def tc(self):
method params (line 129) | def params(self):
method flops (line 134) | def flops(self):
method bytes (line 137) | def bytes(self):
method op (line 141) | def op(self):
method mod (line 144) | def mod(self):
class Matmul (line 147) | class Matmul(OperatorLayerBase):
method __init__ (line 152) | def __init__(self, d):
method params (line 252) | def params(self):
method tc (line 255) | def tc(self):
method bytes (line 264) | def bytes(self):
method flops (line 272) | def flops(self):
method op (line 279) | def op(self):
method mod (line 282) | def mod(self):
class Mm (line 285) | class Mm(OperatorLayerBase):
method __init__ (line 287) | def __init__(self, d):
method params (line 319) | def params(self):
method tc (line 323) | def tc(self):
method bytes (line 329) | def bytes(self):
method flops (line 333) | def flops(self):
method op (line 336) | def op(self):
method mod (line 339) | def mod(self):
FILE: KoSimCSE/apex/pyprof/prof/conv.py
class Conv (line 5) | class Conv(OperatorLayerBase):
method __init__ (line 26) | def __init__(self, d):
method params (line 180) | def params(self):
method conv_bytes_flops (line 184) | def conv_bytes_flops(self, N, C, H, W, K, P, Q, R, S, g, t):
method bytes_flops (line 190) | def bytes_flops(self):
method bytes (line 218) | def bytes(self):
method flops (line 222) | def flops(self):
method tc (line 226) | def tc(self):
method op (line 232) | def op(self):
method mod (line 235) | def mod(self):
FILE: KoSimCSE/apex/pyprof/prof/convert.py
class Convert (line 5) | class Convert(OperatorLayerBase):
method __init__ (line 11) | def __init__(self, d):
method params (line 41) | def params(self):
method op (line 45) | def op(self):
method mod (line 48) | def mod(self):
method tc (line 51) | def tc(self):
method elems (line 54) | def elems(self):
method flops (line 57) | def flops(self):
method bytes (line 60) | def bytes(self):
FILE: KoSimCSE/apex/pyprof/prof/data.py
class Data (line 3) | class Data(object):
method __init__ (line 7) | def __init__(self, kernel):
method setParams (line 41) | def setParams(self, params):
FILE: KoSimCSE/apex/pyprof/prof/dropout.py
class Dropout (line 5) | class Dropout(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method params (line 28) | def params(self):
method op (line 32) | def op(self):
method mod (line 35) | def mod(self):
method tc (line 38) | def tc(self):
method elems (line 41) | def elems(self):
method bytes (line 44) | def bytes(self):
method flops (line 48) | def flops(self):
FILE: KoSimCSE/apex/pyprof/prof/embedding.py
class Embedding (line 5) | class Embedding(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method params (line 33) | def params(self):
method op (line 37) | def op(self):
method mod (line 40) | def mod(self):
method tc (line 43) | def tc(self):
method bytes (line 46) | def bytes(self):
method flops (line 69) | def flops(self):
FILE: KoSimCSE/apex/pyprof/prof/index_slice_join_mutate.py
class Cat (line 6) | class Cat(OperatorLayerBase):
method __init__ (line 8) | def __init__(self, d):
method params (line 34) | def params(self):
method flops (line 38) | def flops(self):
method tc (line 41) | def tc(self):
method op (line 44) | def op(self):
method mod (line 47) | def mod(self):
method bytes (line 50) | def bytes(self):
class Reshape (line 56) | class Reshape(OperatorLayerBase):
method __init__ (line 58) | def __init__(self, d):
method params (line 82) | def params(self):
method flops (line 86) | def flops(self):
method tc (line 89) | def tc(self):
method op (line 92) | def op(self):
method mod (line 95) | def mod(self):
method bytes (line 98) | def bytes(self):
class Gather (line 101) | class Gather(OperatorLayerBase):
method __init__ (line 103) | def __init__(self, d):
method params (line 132) | def params(self):
method flops (line 136) | def flops(self):
method tc (line 139) | def tc(self):
method op (line 142) | def op(self):
method mod (line 145) | def mod(self):
method bytes (line 148) | def bytes(self):
class MaskedScatter (line 151) | class MaskedScatter(OperatorLayerBase):
method __init__ (line 153) | def __init__(self, d):
method params (line 178) | def params(self):
method flops (line 182) | def flops(self):
method tc (line 185) | def tc(self):
method op (line 188) | def op(self):
method mod (line 191) | def mod(self):
method bytes (line 194) | def bytes(self):
class Nonzero (line 207) | class Nonzero(OperatorLayerBase):
method __init__ (line 209) | def __init__(self, d):
method params (line 229) | def params(self):
method flops (line 233) | def flops(self):
method tc (line 236) | def tc(self):
method op (line 239) | def op(self):
method mod (line 242) | def mod(self):
method bytes (line 245) | def bytes(self):
class IndexSelect (line 260) | class IndexSelect(OperatorLayerBase):
method __init__ (line 262) | def __init__(self, d):
method params (line 311) | def params(self):
method tc (line 315) | def tc(self):
method op (line 318) | def op(self):
method mod (line 321) | def mod(self):
method flops (line 324) | def flops(self):
method bytes (line 327) | def bytes(self):
class MaskedSelect (line 343) | class MaskedSelect(OperatorLayerBase):
method __init__ (line 345) | def __init__(self, d):
method params (line 393) | def params(self):
method tc (line 397) | def tc(self):
method op (line 400) | def op(self):
method mod (line 403) | def mod(self):
method bytes (line 406) | def bytes(self):
method flops (line 418) | def flops(self):
FILE: KoSimCSE/apex/pyprof/prof/linear.py
class Linear (line 5) | class Linear(OperatorLayerBase):
method setXWBMNK (line 17) | def setXWBMNK(self, args):
method tc (line 63) | def tc(self):
method __init__ (line 69) | def __init__(self, d):
method params (line 118) | def params(self):
method op (line 145) | def op(self):
method bytesFlops (line 148) | def bytesFlops(self):
method bytes (line 179) | def bytes(self):
method flops (line 183) | def flops(self):
method mod (line 187) | def mod(self):
FILE: KoSimCSE/apex/pyprof/prof/loss.py
class MSELoss (line 7) | class MSELoss(OperatorLayerBase):
method __init__ (line 9) | def __init__(self, d):
method params (line 51) | def params(self):
method elems (line 55) | def elems(self):
method bytes (line 71) | def bytes(self):
method flops (line 74) | def flops(self):
method tc (line 77) | def tc(self):
method op (line 80) | def op(self):
method mod (line 83) | def mod(self):
FILE: KoSimCSE/apex/pyprof/prof/misc.py
class Foo (line 5) | class Foo(OperatorLayerBase):
method __init__ (line 9) | def __init__(self, d):
method params (line 31) | def params(self):
method tc (line 35) | def tc(self):
method op (line 38) | def op(self):
method mod (line 41) | def mod(self):
method flops (line 44) | def flops(self):
method bytes (line 47) | def bytes(self):
class Copy (line 50) | class Copy(OperatorLayerBase):
method __init__ (line 52) | def __init__(self, d):
method params (line 75) | def params(self):
method tc (line 80) | def tc(self):
method op (line 83) | def op(self):
method mod (line 86) | def mod(self):
method flops (line 89) | def flops(self):
method elems (line 92) | def elems(self):
method bytes (line 95) | def bytes(self):
class Clone (line 98) | class Clone(OperatorLayerBase):
method __init__ (line 100) | def __init__(self, d):
method params (line 118) | def params(self):
method flops (line 122) | def flops(self):
method tc (line 125) | def tc(self):
method op (line 128) | def op(self):
method mod (line 131) | def mod(self):
method elems (line 134) | def elems(self):
method bytes (line 137) | def bytes(self):
class Contiguous (line 140) | class Contiguous(OperatorLayerBase):
method __init__ (line 142) | def __init__(self, d):
method params (line 160) | def params(self):
method flops (line 164) | def flops(self):
method bytes (line 167) | def bytes(self):
method tc (line 170) | def tc(self):
method op (line 173) | def op(self):
method mod (line 176) | def mod(self):
class Any (line 179) | class Any(OperatorLayerBase):
method __init__ (line 181) | def __init__(self, d):
method params (line 202) | def params(self):
method op (line 206) | def op(self):
method mod (line 209) | def mod(self):
method tc (line 212) | def tc(self):
method flops (line 215) | def flops(self):
method bytes (line 218) | def bytes(self):
FILE: KoSimCSE/apex/pyprof/prof/normalization.py
class BatchNorm (line 5) | class BatchNorm(OperatorLayerBase):
method __init__ (line 7) | def __init__(self, d):
method params (line 27) | def params(self):
method tc (line 31) | def tc(self):
method op (line 34) | def op(self):
method mod (line 37) | def mod(self):
method elems (line 40) | def elems(self):
Condensed preview — 522 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (4,346K chars).
[
{
"path": "KoSBERT/Clustering.py",
"chars": 1197,
"preview": "from sentence_transformers import SentenceTransformer, util\nimport numpy as np\n\nmodel_path = '../Checkpoint/KoSBERT/kosb"
},
{
"path": "KoSBERT/README.md",
"chars": 5157,
"preview": "# KoSentenceBERT\n[[Github]](https://github.com/UKPLab/sentence-transformers) Official implementation of SBERT. <br>\nKore"
},
{
"path": "KoSBERT/SemanticSearch.py",
"chars": 1395,
"preview": "from sentence_transformers import SentenceTransformer, util\nimport numpy as np\n\nmodel_path = '../Checkpoint/KoSBERT/kosb"
},
{
"path": "KoSBERT/con_training_sts.py",
"chars": 3361,
"preview": "from torch.utils.data import DataLoader\nimport math\nfrom sentence_transformers import SentenceTransformer, SentencesDat"
},
{
"path": "KoSBERT/output/empty.txt",
"chars": 1,
"preview": "."
},
{
"path": "KoSBERT/run_example.sh",
"chars": 1137,
"preview": "#!/bin/bash\n\n# bert-base\necho \"First Step Training NLI Dataset (BERT-BASE)\"\nCUDA_VISIBLE_DEVICES=0 python training_nli.p"
},
{
"path": "KoSBERT/training_nli.py",
"chars": 3975,
"preview": "from torch.utils.data import DataLoader\nimport math\nfrom sentence_transformers import models, losses\nfrom sentence_trans"
},
{
"path": "KoSentenceT5/README.md",
"chars": 1009,
"preview": "# KoSentenceT5\nKoSentenceT5 : Korean Sentence Embeddings using T5. <br>\n> **Warning** <br>\n> This repository uses ETRI-T"
},
{
"path": "KoSentenceT5/apex/RNN/README.md",
"chars": 22,
"preview": "Under construction...\n"
},
{
"path": "KoSentenceT5/apex/RNN/RNNBackend.py",
"chars": 11578,
"preview": "import torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\n\nimport torch.nn.functional as F\n\nimport math\n\n\nd"
},
{
"path": "KoSentenceT5/apex/RNN/__init__.py",
"chars": 71,
"preview": "from .models import LSTM, GRU, ReLU, Tanh, mLSTM\n\n__all__ = ['models']\n"
},
{
"path": "KoSentenceT5/apex/RNN/cells.py",
"chars": 2550,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom .RNNBackend import RNNCell\n\nfrom torch.nn._func"
},
{
"path": "KoSentenceT5/apex/RNN/models.py",
"chars": 2137,
"preview": "import torch\n\nfrom torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell\n\nfrom .RNNBackend import b"
},
{
"path": "KoSentenceT5/apex/__init__.py",
"chars": 851,
"preview": "# May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch"
},
{
"path": "KoSentenceT5/apex/amp/README.md",
"chars": 2107,
"preview": "# amp: Automatic Mixed Precision\n\n## Annotating User Functions\n\nNearly all PyTorch user code needs nothing more than the"
},
{
"path": "KoSentenceT5/apex/amp/__init__.py",
"chars": 310,
"preview": "from .amp import init, half_function, float_function, promote_function,\\\n register_half_function, register_float_func"
},
{
"path": "KoSentenceT5/apex/amp/__version__.py",
"chars": 62,
"preview": "VERSION = (0, 1, 0)\n__version__ = '.'.join(map(str, VERSION))\n"
},
{
"path": "KoSentenceT5/apex/amp/_amp_state.py",
"chars": 2008,
"preview": "# This is a \"header object\" that allows different amp modules to communicate.\n# I'm a C++ guy, not a python guy. I deci"
},
{
"path": "KoSentenceT5/apex/amp/_initialize.py",
"chars": 11606,
"preview": "import torch\nfrom torch._six import string_classes\nimport functools\nimport numpy as np\nimport sys\nfrom types import Meth"
},
{
"path": "KoSentenceT5/apex/amp/_process_optimizer.py",
"chars": 20747,
"preview": "import types\nfrom ..fp16_utils import master_params_to_model_params\nfrom ..multi_tensor_apply import multi_tensor_applie"
},
{
"path": "KoSentenceT5/apex/amp/amp.py",
"chars": 7266,
"preview": "from . import compat, rnn_compat, utils, wrap\nfrom .handle import AmpHandle, NoOpHandle\nfrom .lists import functional_ov"
},
{
"path": "KoSentenceT5/apex/amp/compat.py",
"chars": 1393,
"preview": "import torch\n\n# True for post-0.4, when Variables/Tensors merged.\ndef variable_is_tensor():\n v = torch.autograd.Varia"
},
{
"path": "KoSentenceT5/apex/amp/frontend.py",
"chars": 21267,
"preview": "import torch\nfrom ._initialize import _initialize\nfrom ._amp_state import _amp_state, warn_or_err, maybe_print\nfrom coll"
},
{
"path": "KoSentenceT5/apex/amp/handle.py",
"chars": 12066,
"preview": "import contextlib\nimport warnings\nimport sys\nimport torch\n\nfrom . import utils\nfrom .opt import OptimWrapper\nfrom .scale"
},
{
"path": "KoSentenceT5/apex/amp/lists/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "KoSentenceT5/apex/amp/lists/functional_overrides.py",
"chars": 2248,
"preview": "\n# TODO: think about the following two. They do weird things.\n# - torch.nn.utils.clip_grad (but it should always be fp32"
},
{
"path": "KoSentenceT5/apex/amp/lists/tensor_overrides.py",
"chars": 1402,
"preview": "from .. import compat\nfrom . import torch_overrides\n\nimport importlib\n\nimport torch\n\n# if compat.variable_is_tensor() an"
},
{
"path": "KoSentenceT5/apex/amp/lists/torch_overrides.py",
"chars": 2082,
"preview": "import torch\n\nfrom .. import utils\n\nMODULE = torch\n\nFP16_FUNCS = [\n # Low level functions wrapped by torch.nn layers."
},
{
"path": "KoSentenceT5/apex/amp/opt.py",
"chars": 3446,
"preview": "import contextlib\nimport warnings\n\nfrom .scaler import LossScaler, master_params\nfrom ._amp_state import maybe_print\n\nim"
},
{
"path": "KoSentenceT5/apex/amp/rnn_compat.py",
"chars": 1995,
"preview": "from . import utils, wrap\n\nimport torch\n_VF = torch._C._VariableFunctions\nRNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'l"
},
{
"path": "KoSentenceT5/apex/amp/scaler.py",
"chars": 10494,
"preview": "import torch\nfrom ..multi_tensor_apply import multi_tensor_applier\nfrom ._amp_state import _amp_state, master_params, ma"
},
{
"path": "KoSentenceT5/apex/amp/utils.py",
"chars": 7222,
"preview": "from . import compat\n\nimport functools\nimport itertools\n\nimport torch\n\ndef is_cuda_enabled():\n return torch.version.c"
},
{
"path": "KoSentenceT5/apex/amp/wrap.py",
"chars": 11242,
"preview": "from . import compat\nfrom . import utils\nfrom ._amp_state import _amp_state\nfrom . import rnn_compat\n\nimport functools\n\n"
},
{
"path": "KoSentenceT5/apex/contrib/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "KoSentenceT5/apex/contrib/bottleneck/__init__.py",
"chars": 35,
"preview": "from .bottleneck import Bottleneck\n"
},
{
"path": "KoSentenceT5/apex/contrib/bottleneck/bottleneck.py",
"chars": 7851,
"preview": "import torch\nfrom torch import nn\nimport fast_bottleneck\n\ndef kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity="
},
{
"path": "KoSentenceT5/apex/contrib/bottleneck/test.py",
"chars": 3070,
"preview": "import torch\nfrom bottleneck import Bottleneck\ntorch.manual_seed(23337)\n\n# use True to print layerwise sum for all outpu"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/bottleneck/bottleneck.cpp",
"chars": 67827,
"preview": "#include <ATen/ATen.h>\n#include <ATen/cudnn/Handle.h> // for getcudnnhandle\n#include <torch/extension.h>\n#include <torc"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/fmha_api.cpp",
"chars": 15179,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/gemm.h",
"chars": 11812,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/gmem_tile.h",
"chars": 16854,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/kernel_traits.h",
"chars": 4890,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/mask.h",
"chars": 3172,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/smem_tile.h",
"chars": 52449,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/softmax.h",
"chars": 18865,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha/utils.h",
"chars": 32232,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha.h",
"chars": 5134,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu",
"chars": 3431,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu",
"chars": 3431,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu",
"chars": 3431,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu",
"chars": 5400,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload.h",
"chars": 22254,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload_nl.h",
"chars": 23103,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu",
"chars": 3182,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu",
"chars": 3182,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu",
"chars": 3092,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_512_64_kernel.sm80.cu",
"chars": 5280,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN.h",
"chars": 13090,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN_nl.h",
"chars": 13127,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN_reload_v.h",
"chars": 12841,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_kernel.h",
"chars": 5651,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_noloop_reduce.cu",
"chars": 6462,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/fmha/src/fmha_utils.h",
"chars": 4560,
"preview": "/******************************************************************************\n * Copyright (c) 2011-2021, NVIDIA CORPO"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/groupbn/batch_norm.cu",
"chars": 11458,
"preview": "#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <THC/THCNumerics.cuh>\n\n#include \"THC/THC.h\"\n\n#include"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/groupbn/batch_norm.h",
"chars": 28975,
"preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements. See the NOT"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/groupbn/batch_norm_add_relu.cu",
"chars": 12090,
"preview": "#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <THC/THCNumerics.cuh>\n\n#include \"THC/THC.h\"\n\n#include"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/groupbn/batch_norm_add_relu.h",
"chars": 26879,
"preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements. See the NOT"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/groupbn/cuda_utils.h",
"chars": 288,
"preview": "#include <ATen/cuda/CUDAContext.h>\n#ifndef CUDA_UTILS_H\n#define CUDA_UTILS_H\n\nnamespace at {\nnamespace cuda {\n\nnamespace"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/groupbn/interface.cpp",
"chars": 7076,
"preview": "#include <pybind11/pybind11.h>\n#include <pybind11/numpy.h>\n#include <pybind11/stl.h>\n\n#include <torch/extension.h>\n#incl"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/groupbn/ipc.cu",
"chars": 3682,
"preview": "#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <THC/THCNumerics.cuh>\n\n#include \"THC/THC.h\"\n\n#include"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h",
"chars": 111190,
"preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements. See the NOT"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/layer_norm/ln_api.cpp",
"chars": 3358,
"preview": "#include <torch/extension.h>\n#include \"ATen/cuda/CUDAContext.h\"\n\nvoid ln_fwd_cuda(at::Tensor &y, at::Tensor &mu, at::Ten"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu",
"chars": 16079,
"preview": "#include \"utils.cuh\"\n#include \"ln_kernel_traits.h\"\n#include \"ATen/cuda/CUDAContext.h\"\n\ntemplate<typename Ktraits>\n__glob"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu",
"chars": 5543,
"preview": "#include \"utils.cuh\"\n#include \"ln_kernel_traits.h\"\n#include \"ATen/cuda/CUDAContext.h\"\n\ntemplate <typename Ktraits>\n__glo"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/layer_norm/ln_kernel_traits.h",
"chars": 910,
"preview": "#pragma once\n\nconstexpr uint32_t THREADS_PER_WARP = 32;\n\ntemplate <typename dtype, int COLS_, int WARPS_M_, int WARPS_N_"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/layer_norm/utils.cuh",
"chars": 3190,
"preview": "#pragma once\n\n#include \"torch/extension.h\"\n#include <ATen/cuda/Exceptions.h> // for CUDNN_CHECK\n\n#define DIVUP(x, y) ((("
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout.cpp",
"chars": 3967,
"preview": "#include <torch/extension.h>\n#include <cuda_fp16.h>\n#include <vector>\n\nnamespace multihead_attn {\nnamespace fused_softma"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu",
"chars": 5089,
"preview": "#include <vector>\n#include <iostream>\n\n#include <ATen/ATen.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <cuda"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/dropout.h",
"chars": 11937,
"preview": "#include <ATen/ATen.h>\n\n#ifdef OLD_GENERATOR\n#include <ATen/CUDAGenerator.h>\n#else\n#include <ATen/CUDAGeneratorImpl.h>\n#"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp",
"chars": 8817,
"preview": "#include <torch/extension.h>\n#include <vector>\n\nnamespace multihead_attn {\nnamespace encdec {\nnamespace cublas_gemmex {\n"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu",
"chars": 25632,
"preview": "#include <vector>\n#include <iostream>\n\n#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <cuda.h>\n#incl"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp",
"chars": 11861,
"preview": "#include <torch/extension.h>\n#include <vector>\n\nnamespace multihead_attn {\nnamespace encdec_norm_add {\nnamespace cublas_"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu",
"chars": 30391,
"preview": "#include <vector>\n#include <iostream>\n\n#include <ATen/ATen.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <cuda"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/layer_norm.h",
"chars": 23374,
"preview": "#include \"ATen/ATen.h\"\n#include <THC/THCDeviceUtils.cuh>\n\n#include <cuda.h>\n#include <cuda_runtime.h>\n\ntemplate<typename"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/masked_softmax_dropout.cpp",
"chars": 4158,
"preview": "#include <torch/extension.h>\n#include <vector>\n\nnamespace multihead_attn {\nnamespace fused_softmax {\nnamespace mask_soft"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu",
"chars": 5819,
"preview": "#include <vector>\n#include <iostream>\n\n#include <ATen/ATen.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <cuda"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/philox.h",
"chars": 2610,
"preview": "#pragma once\n//Philox CUDA. \n\nclass Philox {\npublic:\n __device__ inline Philox(unsigned long long seed,\n "
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp",
"chars": 6852,
"preview": "#include <torch/extension.h>\n#include <vector>\n\nnamespace multihead_attn {\nnamespace self {\nnamespace cublas_gemmex {\n\ns"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias.cpp",
"chars": 7356,
"preview": "#include <torch/extension.h>\n#include <vector>\n\nnamespace multihead_attn {\nnamespace self_bias {\nnamespace cublas_gemmex"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask.cpp",
"chars": 7521,
"preview": "#include <torch/extension.h>\n#include <vector>\n#include <cuda_fp16.h>\n\nnamespace multihead_attn {\nnamespace self_bias_ad"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu",
"chars": 20980,
"preview": "#include <vector>\n#include <iostream>\n\n#include <ATen/ATen.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <cuda"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu",
"chars": 21146,
"preview": "#include <vector>\n#include <iostream>\n\n#include <ATen/ATen.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <cuda"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu",
"chars": 20983,
"preview": "#include <vector>\n#include <iostream>\n\n#include <ATen/ATen.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <cuda"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp",
"chars": 9946,
"preview": "#include <torch/extension.h>\n#include <vector>\n\nnamespace multihead_attn {\nnamespace self_norm_add {\nnamespace cublas_ge"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu",
"chars": 25927,
"preview": "#include <vector>\n#include <iostream>\n\n#include <ATen/ATen.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <cuda"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/softmax.h",
"chars": 118979,
"preview": "#pragma once\n#include <ATen/CUDAGeneratorImpl.h>\n#include <ATen/cuda/CUDAGraphsUtils.cuh>\n#include <curand_kernel.h>\n#in"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/multihead_attn/strided_batched_gemm.h",
"chars": 34453,
"preview": "#include <vector>\n#include <iostream>\n\n//#include <ATen/ATen.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <cu"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/optimizers/fused_adam_cuda.cpp",
"chars": 5601,
"preview": "#include <torch/extension.h>\n\n// CUDA forward declaration\nvoid fused_strided_check_finite(at::Tensor & overflow_flag, at"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu",
"chars": 35130,
"preview": "#include \"ATen/ATen.h\"\n#include \"ATen/cuda/CUDAContext.h\"\n#include \"ATen/cuda/detail/IndexUtils.cuh\"\n#include <cuda.h>\n#"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp",
"chars": 562,
"preview": "#include <torch/extension.h>\n\nvoid multi_tensor_lamb_cuda(\n int chunk_size,\n at::Tensor noop_flag,\n std::vector<std::"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu",
"chars": 8664,
"preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp",
"chars": 560,
"preview": "#include <torch/extension.h>\n\nvoid multi_tensor_fused_adam_cuda(\n int chunk_size,\n at::Tensor noop_flag,\n std::vector"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu",
"chars": 7451,
"preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp",
"chars": 1180,
"preview": "#include <torch/extension.h>\n\nvoid multi_tensor_lamb_compute_update_term_cuda(\n int chunk_size,\n at::Tensor noop_flag,"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu",
"chars": 14973,
"preview": "#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/Exception"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/transducer/transducer_joint.cpp",
"chars": 2319,
"preview": "#include <torch/extension.h>\n#include <ATen/Functions.h>\n\n#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x \" must be a "
},
{
"path": "KoSentenceT5/apex/contrib/csrc/transducer/transducer_joint_kernel.cu",
"chars": 36331,
"preview": "#include <torch/extension.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <c10/macros/Macros.h>\n#include <THC/TH"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/transducer/transducer_loss.cpp",
"chars": 2512,
"preview": "#include <torch/extension.h>\n#include <vector>\n\n#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x \" must be a CUDA tenso"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/transducer/transducer_loss_kernel.cu",
"chars": 31796,
"preview": "#include <torch/extension.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <vector>\n#include <ATen/ATen.h>\n#inclu"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/xentropy/interface.cpp",
"chars": 1657,
"preview": "#include <torch/extension.h>\n\n// CUDA forward declarations\n\nstd::vector<at::Tensor> softmax_xentropy_cuda(\n const at:"
},
{
"path": "KoSentenceT5/apex/contrib/csrc/xentropy/xentropy_kernel.cu",
"chars": 24642,
"preview": "/**\n * From PyTorch:\n *\n * Copyright (c) 2016- Facebook, Inc (Adam Paszke)\n * Copyright (c) 2014- Fac"
},
{
"path": "KoSentenceT5/apex/contrib/examples/multihead_attn/func_test_multihead_attn.py",
"chars": 5740,
"preview": "import torch\nimport torch.nn.functional as F\nimport argparse\n\nfrom apex.contrib.multihead_attn import SelfMultiheadAttn\n"
},
{
"path": "KoSentenceT5/apex/contrib/examples/multihead_attn/perf_test_multihead_attn.py",
"chars": 6163,
"preview": "import torch\nimport torch.nn.functional as F\nimport argparse\n\nfrom apex.contrib.multihead_attn import SelfMultiheadAttn\n"
},
{
"path": "KoSentenceT5/apex/contrib/fmha/__init__.py",
"chars": 26,
"preview": "from .fmha import FMHAFun\n"
},
{
"path": "KoSentenceT5/apex/contrib/fmha/fmha.py",
"chars": 3394,
"preview": "###############################################################################\n# Copyright (c) 2011-2021, NVIDIA CORPOR"
},
{
"path": "KoSentenceT5/apex/contrib/groupbn/__init__.py",
"chars": 239,
"preview": "try:\n import torch\n import bnp\n from .batch_norm import BatchNorm2d_NHWC\n del torch\n del bnp\n del batc"
},
{
"path": "KoSentenceT5/apex/contrib/groupbn/batch_norm.py",
"chars": 11208,
"preview": "import torch\nimport numpy as np\nfrom torch.nn.modules.batchnorm import _BatchNorm\n\nimport bnp\n\nclass bn_NHWC_impl(torch."
},
{
"path": "KoSentenceT5/apex/contrib/layer_norm/__init__.py",
"chars": 38,
"preview": "from .layer_norm import FastLayerNorm\n"
},
{
"path": "KoSentenceT5/apex/contrib/layer_norm/layer_norm.py",
"chars": 1490,
"preview": "import torch\nfrom torch.nn import init\n\nimport fast_layer_norm\n\nclass FastLayerNormFN(torch.autograd.Function):\n @sta"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/README.md",
"chars": 2267,
"preview": "# Fast Multihead Attention \n\nThis implementation has two main features :\n* A C++ implementation to avoid the CPU overhea"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/__init__.py",
"chars": 176,
"preview": "from .self_multihead_attn import SelfMultiheadAttn\nfrom .encdec_multihead_attn import EncdecMultiheadAttn\nfrom .mask_sof"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/encdec_multihead_attn.py",
"chars": 7043,
"preview": "import math\n\nimport torch\nfrom torch import nn\nfrom torch.nn import Parameter\nimport torch.nn.functional as F\n\nfrom .enc"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/encdec_multihead_attn_func.py",
"chars": 17587,
"preview": "import torch\nimport torch.nn.functional as F\n\n\nclass EncdecAttnFunc(torch.autograd.Function):\n @staticmethod\n def "
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py",
"chars": 5447,
"preview": "import torch\nimport fast_encdec_multihead_attn\n\n\nclass FastEncdecAttnFunc(torch.autograd.Function):\n @staticmethod\n "
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py",
"chars": 8251,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the license f"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py",
"chars": 13483,
"preview": "import torch\nimport fast_self_multihead_attn\nimport fast_self_multihead_attn_bias\nimport fast_self_multihead_attn_bias_a"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py",
"chars": 6704,
"preview": "import torch\nimport fast_self_multihead_attn_norm_add\n\n\nclass FastSelfAttnNormAddFunc(torch.autograd.Function):\n @sta"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/mask_softmax_dropout_func.py",
"chars": 4603,
"preview": "import torch\nimport fast_mask_softmax_dropout\nimport fast_additive_mask_softmax_dropout\n\n\nclass MaskSoftmaxDropout(torch"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/self_multihead_attn.py",
"chars": 9054,
"preview": "import math\n\nimport torch\nfrom torch import nn\nfrom torch.nn import Parameter\nimport torch.nn.functional as F\n\nfrom .sel"
},
{
"path": "KoSentenceT5/apex/contrib/multihead_attn/self_multihead_attn_func.py",
"chars": 14741,
"preview": "import torch\nimport torch.nn.functional as F\n\nclass SelfAttnFunc(torch.autograd.Function):\n @staticmethod\n def for"
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/__init__.py",
"chars": 111,
"preview": "from .fp16_optimizer import FP16_Optimizer\nfrom .fused_adam import FusedAdam\nfrom .fused_lamb import FusedLAMB\n"
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/distributed_fused_adam.py",
"chars": 34787,
"preview": "import math\nimport torch\nimport importlib\nimport amp_C\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nimport "
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/distributed_fused_adam_v2.py",
"chars": 31780,
"preview": "import math\nimport torch\nimport importlib\nimport amp_C\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nclass D"
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/distributed_fused_adam_v3.py",
"chars": 15709,
"preview": "import math\nimport torch\nimport importlib\nimport amp_C\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nclass D"
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/distributed_fused_lamb.py",
"chars": 43023,
"preview": "import math\nimport torch\nimport importlib\nimport amp_C\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nimport "
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/fp16_optimizer.py",
"chars": 10448,
"preview": "import torch\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nclass FP16_Optimizer(object):\n \"\"\"\n :class:"
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/fused_adam.py",
"chars": 9284,
"preview": "import types\nimport torch\nimport importlib\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nclass FusedAdam(tor"
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/fused_lamb.py",
"chars": 9408,
"preview": "import torch\nimport importlib\nimport math\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nclass FusedLAMB(torc"
},
{
"path": "KoSentenceT5/apex/contrib/optimizers/fused_sgd.py",
"chars": 9468,
"preview": "import types\nimport torch\nfrom torch.optim.optimizer import Optimizer, required\n\nfrom apex.multi_tensor_apply import mul"
},
{
"path": "KoSentenceT5/apex/contrib/sparsity/README.md",
"chars": 3073,
"preview": "# Introduction to ASP\n\nThis serves as a quick-start for ASP (Automatic SParsity), a tool that enables sparse training an"
},
{
"path": "KoSentenceT5/apex/contrib/sparsity/__init__.py",
"chars": 61,
"preview": "from .sparse_masklib import create_mask\nfrom .asp import ASP\n"
},
{
"path": "KoSentenceT5/apex/contrib/sparsity/asp.py",
"chars": 11740,
"preview": "import types\nimport torch\nfrom .sparse_masklib import create_mask\n\ntorchvision_imported=True\ntry:\n import torchvision"
},
{
"path": "KoSentenceT5/apex/contrib/sparsity/sparse_masklib.py",
"chars": 7291,
"preview": "import sys\nimport torch\nimport numpy as np\nimport collections\nfrom itertools import permutations\n\n\n\"\"\" compute density ("
},
{
"path": "KoSentenceT5/apex/contrib/sparsity/test/checkpointing_test_part1.py",
"chars": 3353,
"preview": "from collections import OrderedDict\n\nimport torch\nfrom apex.optimizers import FusedAdam\nfrom apex.contrib.sparsity impor"
},
{
"path": "KoSentenceT5/apex/contrib/sparsity/test/checkpointing_test_part2.py",
"chars": 3131,
"preview": "from collections import OrderedDict\n\nimport torch\nfrom apex.optimizers import FusedAdam\nfrom apex.contrib.sparsity impor"
},
{
"path": "KoSentenceT5/apex/contrib/sparsity/test/checkpointing_test_reference.py",
"chars": 3177,
"preview": "from collections import OrderedDict\n\nimport torch\nfrom apex.optimizers import FusedAdam\nfrom apex.contrib.sparsity impor"
},
{
"path": "KoSentenceT5/apex/contrib/sparsity/test/toy_problem.py",
"chars": 3217,
"preview": "from collections import OrderedDict\n\nimport torch\nfrom apex.optimizers import FusedAdam\nfrom apex.contrib.sparsity impor"
},
{
"path": "KoSentenceT5/apex/contrib/test/fmha/test_fmha.py",
"chars": 4383,
"preview": "###############################################################################\n# Copyright (c) 2011-2021, NVIDIA CORPOR"
},
{
"path": "KoSentenceT5/apex/contrib/test/layer_norm/test_fast_layer_norm.py",
"chars": 4960,
"preview": "import torch\nimport unittest\nimport numpy as np\n\nimport torch.nn.functional as F\n\nfrom apex.contrib.layer_norm import Fa"
},
{
"path": "KoSentenceT5/apex/contrib/test/multihead_attn/test_encdec_multihead_attn.py",
"chars": 7429,
"preview": "import torch\n\nimport unittest\n\nfrom apex.contrib.multihead_attn import EncdecMultiheadAttn\n\nclass EncdecMultiheadAttnTes"
},
{
"path": "KoSentenceT5/apex/contrib/test/multihead_attn/test_encdec_multihead_attn_norm_add.py",
"chars": 3875,
"preview": "import torch\n\nimport unittest\n\nfrom apex.contrib.multihead_attn import EncdecMultiheadAttn\n\nclass EncdecMultiheadAttnNor"
},
{
"path": "KoSentenceT5/apex/contrib/test/multihead_attn/test_fast_self_multihead_attn_bias.py",
"chars": 3668,
"preview": "import torch\n\nimport unittest\n\nfrom apex.contrib.multihead_attn import SelfMultiheadAttn\n\nclass SelfMultiheadAttnTest(un"
},
{
"path": "KoSentenceT5/apex/contrib/test/multihead_attn/test_mha_fused_softmax.py",
"chars": 1800,
"preview": "import torch\nimport unittest\nimport torch.nn.functional as F\nfrom apex.contrib.multihead_attn import fast_mask_softmax_d"
},
{
"path": "KoSentenceT5/apex/contrib/test/multihead_attn/test_self_multihead_attn.py",
"chars": 6569,
"preview": "import torch\n\nimport unittest\n\nfrom apex.contrib.multihead_attn import SelfMultiheadAttn\n\nclass SelfMultiheadAttnTest(un"
},
{
"path": "KoSentenceT5/apex/contrib/test/multihead_attn/test_self_multihead_attn_norm_add.py",
"chars": 3305,
"preview": "import torch\n\nimport unittest\n\nfrom apex.contrib.multihead_attn import SelfMultiheadAttn\n\nclass SelfMultiheadAttnNormAdd"
},
{
"path": "KoSentenceT5/apex/contrib/test/test_label_smoothing.py",
"chars": 4800,
"preview": "import torch\nfrom apex.contrib import xentropy as label_smoothing\nimport unittest\n\nimport warnings\nimport random\nimport "
},
{
"path": "KoSentenceT5/apex/contrib/test/transducer/test_transducer_joint.py",
"chars": 7042,
"preview": "import torch\nimport unittest\nfrom apex.contrib.transducer import TransducerJoint\nimport transducer_ref\n\nclass Transducer"
},
{
"path": "KoSentenceT5/apex/contrib/test/transducer/test_transducer_loss.py",
"chars": 6871,
"preview": "import torch\nimport unittest\nfrom apex.contrib.transducer import TransducerLoss\nimport transducer_ref\n\nclass TransducerL"
},
{
"path": "KoSentenceT5/apex/contrib/test/transducer/transducer_ref.py",
"chars": 4668,
"preview": "import torch\nimport numpy as np\nimport pdb\n\ndef transducer_loss_reference(x, label, f_len, y_len, blank_idx, loss_grad):"
},
{
"path": "KoSentenceT5/apex/contrib/transducer/__init__.py",
"chars": 78,
"preview": "from .transducer import TransducerJoint\nfrom .transducer import TransducerLoss"
},
{
"path": "KoSentenceT5/apex/contrib/transducer/transducer.py",
"chars": 9934,
"preview": "import torch\nimport transducer_loss_cuda\nimport transducer_joint_cuda\n\nclass TransducerJoint(torch.nn.Module):\n \"\"\"Tr"
},
{
"path": "KoSentenceT5/apex/contrib/xentropy/__init__.py",
"chars": 284,
"preview": "try:\n import torch\n import xentropy_cuda\n from .softmax_xentropy import SoftmaxCrossEntropyLoss\n del torch\n "
},
{
"path": "KoSentenceT5/apex/contrib/xentropy/softmax_xentropy.py",
"chars": 1023,
"preview": "import torch\nimport xentropy_cuda\n\nclass SoftmaxCrossEntropyLoss(torch.autograd.Function):\n @staticmethod\n def for"
},
{
"path": "KoSentenceT5/apex/fp16_utils/README.md",
"chars": 1443,
"preview": "fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatic"
},
{
"path": "KoSentenceT5/apex/fp16_utils/__init__.py",
"chars": 367,
"preview": "from .fp16util import (\n BN_convert_float,\n network_to_half,\n prep_param_lists,\n model_grads_to_master_grads"
},
{
"path": "KoSentenceT5/apex/fp16_utils/fp16_optimizer.py",
"chars": 27769,
"preview": "import torch\nfrom torch import nn\nfrom torch.autograd import Variable\nfrom torch.nn.parameter import Parameter\nfrom torc"
},
{
"path": "KoSentenceT5/apex/fp16_utils/fp16util.py",
"chars": 7141,
"preview": "import torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\nfrom torch._utils import _flatten_dense_tensors, "
},
{
"path": "KoSentenceT5/apex/fp16_utils/loss_scaler.py",
"chars": 7568,
"preview": "import torch\n\n# item() is a recent addition, so this helps with backward compatibility.\ndef to_python_float(t):\n if h"
},
{
"path": "KoSentenceT5/apex/mlp/__init__.py",
"chars": 19,
"preview": "from .mlp import *\n"
},
{
"path": "KoSentenceT5/apex/mlp/mlp.py",
"chars": 2614,
"preview": "from copy import copy\nimport math\nimport torch\nfrom torch import nn\nimport mlp_cuda\nfrom .. import amp\n\nclass MlpFunctio"
},
{
"path": "KoSentenceT5/apex/multi_tensor_apply/__init__.py",
"chars": 100,
"preview": "from .multi_tensor_apply import MultiTensorApply\n\nmulti_tensor_applier = MultiTensorApply(2048*32)\n\n"
},
{
"path": "KoSentenceT5/apex/multi_tensor_apply/multi_tensor_apply.py",
"chars": 991,
"preview": "import torch\n\nclass MultiTensorApply(object):\n available = False\n warned = False\n\n def __init__(self, chunk_siz"
},
{
"path": "KoSentenceT5/apex/normalization/__init__.py",
"chars": 45,
"preview": "from .fused_layer_norm import FusedLayerNorm\n"
},
{
"path": "KoSentenceT5/apex/normalization/fused_layer_norm.py",
"chars": 6612,
"preview": "import math\nimport torch\nimport numbers\nfrom torch.nn.parameter import Parameter\nfrom torch.nn import init\nfrom torch.nn"
},
{
"path": "KoSentenceT5/apex/optimizers/__init__.py",
"chars": 181,
"preview": "from .fused_sgd import FusedSGD\nfrom .fused_adam import FusedAdam\nfrom .fused_novograd import FusedNovoGrad\nfrom .fused_"
},
{
"path": "KoSentenceT5/apex/optimizers/fused_adagrad.py",
"chars": 5231,
"preview": "import torch\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\n\nclass FusedAdagrad(torch.optim.Optimizer):\n \""
},
{
"path": "KoSentenceT5/apex/optimizers/fused_adam.py",
"chars": 7718,
"preview": "import torch\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nclass FusedAdam(torch.optim.Optimizer):\n\n \"\"\"I"
},
{
"path": "KoSentenceT5/apex/optimizers/fused_lamb.py",
"chars": 9910,
"preview": "import torch\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nclass FusedLAMB(torch.optim.Optimizer):\n\n \"\"\"I"
},
{
"path": "KoSentenceT5/apex/optimizers/fused_novograd.py",
"chars": 10652,
"preview": "import torch\nfrom apex.multi_tensor_apply import multi_tensor_applier\n\nclass FusedNovoGrad(torch.optim.Optimizer):\n\n "
},
{
"path": "KoSentenceT5/apex/optimizers/fused_sgd.py",
"chars": 10041,
"preview": "import torch\nfrom torch.optim.optimizer import Optimizer, required\n\nfrom apex.multi_tensor_apply import multi_tensor_app"
},
{
"path": "KoSentenceT5/apex/parallel/LARC.py",
"chars": 4018,
"preview": "import torch\nfrom torch import nn\nfrom torch.nn.parameter import Parameter\n\nclass LARC(object):\n \"\"\"\n :class:`LARC"
},
{
"path": "KoSentenceT5/apex/parallel/README.md",
"chars": 2699,
"preview": "## Distributed Data Parallel\n\ndistributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a mod"
},
{
"path": "KoSentenceT5/apex/parallel/__init__.py",
"chars": 3667,
"preview": "import torch\n\nif hasattr(torch.distributed, 'ReduceOp'):\n ReduceOp = torch.distributed.ReduceOp\nelif hasattr(torch.di"
},
{
"path": "KoSentenceT5/apex/parallel/distributed.py",
"chars": 30651,
"preview": "import torch\nimport torch.distributed as dist\nfrom torch.nn.modules import Module\nfrom torch.autograd import Variable\nfr"
},
{
"path": "KoSentenceT5/apex/parallel/multiproc.py",
"chars": 884,
"preview": "import torch\nimport sys\nimport subprocess\n\ndef docstring_hack():\n \"\"\"\n Multiproc file which will launch a set of p"
},
{
"path": "KoSentenceT5/apex/parallel/optimized_sync_batchnorm.py",
"chars": 4364,
"preview": "import torch\nfrom torch.nn.modules.batchnorm import _BatchNorm\nfrom torch.nn import functional as F\n\nimport syncbn\nfrom "
},
{
"path": "KoSentenceT5/apex/parallel/optimized_sync_batchnorm_kernel.py",
"chars": 5467,
"preview": "import torch\nfrom torch.autograd.function import Function\n\nimport syncbn\nfrom apex.parallel import ReduceOp\n\nclass SyncB"
},
{
"path": "KoSentenceT5/apex/parallel/sync_batchnorm.py",
"chars": 6532,
"preview": "import torch\nfrom torch.nn.modules.batchnorm import _BatchNorm\nfrom torch.nn import functional as F\n\nfrom .sync_batchnor"
},
{
"path": "KoSentenceT5/apex/parallel/sync_batchnorm_kernel.py",
"chars": 3761,
"preview": "import torch\nfrom torch.autograd.function import Function\n\nfrom apex.parallel import ReduceOp\n\n\nclass SyncBatchnormFunct"
},
{
"path": "KoSentenceT5/apex/pyprof/FAQs.md",
"chars": 529,
"preview": "1. How do I intercept the Adam optimizer in APEX ?\n\n\t```python\n\tfrom apex import pyprof\n\timport fused_adam_cuda\n\tpyprof."
},
{
"path": "KoSentenceT5/apex/pyprof/README.md",
"chars": 17162,
"preview": "## PyProf - PyTorch Profiling tool\n\n### What does this tool do? "
},
{
"path": "KoSentenceT5/apex/pyprof/__init__.py",
"chars": 42,
"preview": "import warnings\n\nfrom . import nvtx, prof\n"
},
{
"path": "KoSentenceT5/apex/pyprof/examples/.gitignore",
"chars": 31,
"preview": "__pycache__\n*.sql\n*.dict\n*.csv\n"
},
{
"path": "KoSentenceT5/apex/pyprof/examples/apex/README.md",
"chars": 124,
"preview": "This directory has examples of how to use `pyprof` with APEX extensions e.g. `fused_adam_cuda` and `fused_layer_norm_cud"
},
{
"path": "KoSentenceT5/apex/pyprof/examples/apex/fused_adam.py",
"chars": 546,
"preview": "import torch\nimport fused_adam_cuda\nfrom apex.optimizers import FusedAdam, FP16_Optimizer\nfrom apex import pyprof\n\npypro"
},
{
"path": "KoSentenceT5/apex/pyprof/examples/apex/fused_layer_norm.py",
"chars": 790,
"preview": "import torch\nimport fused_layer_norm_cuda\nfrom apex.normalization import FusedLayerNorm\nfrom apex import pyprof\n\npyprof."
},
{
"path": "KoSentenceT5/apex/pyprof/examples/apex/test.sh",
"chars": 432,
"preview": "#!/bin/bash\n\nset -e\n\nSCRIPT=`realpath $0`\nSCRIPTPATH=`dirname $SCRIPT`\nPYPROF=\"$SCRIPTPATH/../..\"\n\nparse=\"python $PYPROF"
},
{
"path": "KoSentenceT5/apex/pyprof/examples/custom_func_module/README.md",
"chars": 315,
"preview": "This directory has examples which show how to intercept (monkey patch) custom functions and modules with `pyprof`. No ch"
},
{
"path": "KoSentenceT5/apex/pyprof/examples/custom_func_module/custom_function.py",
"chars": 762,
"preview": "#!/usr/bin/env python3\n\nimport torch\nimport torch.cuda.profiler as profiler\nfrom apex import pyprof\n#Initialize pyprof\np"
},
{
"path": "KoSentenceT5/apex/pyprof/examples/custom_func_module/custom_module.py",
"chars": 601,
"preview": "#!/usr/bin/env python3\n\nimport torch\nimport torch.cuda.profiler as profiler\nfrom apex import pyprof\npyprof.nvtx.init()\n\n"
}
]
// ... and 322 more files (download for full content)
About this extraction
This page contains the full source code of the BM-K/Sentence-Embedding-Is-All-You-Need GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 522 files (4.0 MB), approximately 1.1M tokens, and a symbol index with 2719 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.