gitextract_gcbwaf39/

├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── assets/
│   └── images/
│       ├── dev-discuss-asynctp/
│       │   └── readme.md
│       └── readme.md
├── dev/
│   ├── sr/
│   │   ├── .gitignore
│   │   ├── readme.md
│   │   ├── setup.py
│   │   ├── src/
│   │   │   ├── stochastic_rounding.cu
│   │   │   ├── stochastic_rounding.hpp
│   │   │   └── stochastic_rounding_cuda.cu
│   │   ├── test.md
│   │   ├── tests/
│   │   │   ├── benchmark.py
│   │   │   └── core_unit_tests.py
│   │   ├── usage.py
│   │   └── usage2.py
│   └── triton_groupGEMM/
│       ├── groupgemm.py
│       ├── testing/
│       │   ├── base_testing.py
│       │   └── unit_tests.py
│       ├── tma_utils.py
│       └── triton_tutorial_groupgemm.py
├── kernels/
│   ├── MoE/
│   │   └── group_GEMM/
│   │       └── triton/
│   │           ├── readme.md
│   │           ├── testing/
│   │           │   ├── fast_verification.py
│   │           │   └── pytorch_reference_backwards.py
│   │           ├── tgroup_gemm_backwards.py
│   │           ├── tgroup_gemm_forward.py
│   │           └── utils/
│   │               └── tma_utils.py
│   ├── blackwell/
│   │   ├── cute_gemm_01/
│   │   │   ├── Makefile
│   │   │   ├── build/
│   │   │   │   └── temp.linux-x86_64-cpython-312/
│   │   │   │       ├── .ninja_deps
│   │   │   │       ├── .ninja_log
│   │   │   │       ├── build.ninja
│   │   │   │       ├── sm100_gemm.o
│   │   │   │       └── sm100_gemm_pytorch.o
│   │   │   ├── dist/
│   │   │   │   └── sm100_gemm-0.0.0-py3.12-linux-x86_64.egg
│   │   │   ├── driver.py
│   │   │   ├── setup.py
│   │   │   ├── sm100_gemm.cu
│   │   │   ├── sm100_gemm.egg-info/
│   │   │   │   ├── PKG-INFO
│   │   │   │   ├── SOURCES.txt
│   │   │   │   ├── dependency_links.txt
│   │   │   │   ├── not-zip-safe
│   │   │   │   ├── requires.txt
│   │   │   │   └── top_level.txt
│   │   │   ├── sm100_gemm.h
│   │   │   └── sm100_gemm_pytorch.cpp
│   │   └── cute_gemm_02_tma/
│   │       ├── build/
│   │       │   └── temp.linux-x86_64-cpython-312/
│   │       │       ├── .ninja_deps
│   │       │       ├── .ninja_log
│   │       │       ├── build.ninja
│   │       │       ├── sm100_gemm.o
│   │       │       └── sm100_gemm_pytorch.o
│   │       ├── dist/
│   │       │   └── sm100_gemm-0.0.0-py3.12-linux-x86_64.egg
│   │       ├── driver.py
│   │       ├── setup.py
│   │       ├── sm100_gemm.cu
│   │       ├── sm100_gemm.egg-info/
│   │       │   ├── PKG-INFO
│   │       │   ├── SOURCES.txt
│   │       │   ├── dependency_links.txt
│   │       │   ├── not-zip-safe
│   │       │   ├── requires.txt
│   │       │   └── top_level.txt
│   │       ├── sm100_gemm.h
│   │       └── sm100_gemm_pytorch.cpp
│   ├── cuda/
│   │   ├── cutlass_gemm/
│   │   │   ├── broadcast_load_epilogue_c3x.hpp
│   │   │   ├── common.hpp
│   │   │   ├── cutlass.cpp
│   │   │   ├── cutlass_kernel.cu
│   │   │   ├── readme.md
│   │   │   ├── setup.py
│   │   │   └── test_cutlass_gemm.py
│   │   ├── inference/
│   │   │   ├── README.md
│   │   │   └── hadamard_transform/
│   │   │       ├── hadamard_transform.cpp
│   │   │       ├── hadamard_transform_cuda.cu
│   │   │       ├── setup.py
│   │   │       └── test.py
│   │   ├── training/
│   │   │   └── README.md
│   │   └── tutorials/
│   │       ├── README.md
│   │       └── flash2.cu
│   ├── needs_perf_help/
│   │   ├── fp8_gemm_bench.py
│   │   └── fp8_rowwise_tma_persistent.py
│   └── triton/
│       ├── inference/
│       │   ├── README.md
│       │   ├── col_major_moe_gemm/
│       │   │   ├── README.md
│       │   │   ├── perf_test_moe.py
│       │   │   ├── profile_moe.py
│       │   │   ├── results.html
│       │   │   ├── test.csv
│       │   │   ├── test_moe_gemm.py
│       │   │   ├── v0_moe_fused.py
│       │   │   ├── v1_moe_fused.py
│       │   │   └── v2_moe_fused.py
│       │   ├── flash_attention/
│       │   │   └── stay_attention.py
│       │   ├── fp8/
│       │   │   ├── float8_groupwise_quant.py
│       │   │   ├── scaled_fp8_gemm.py
│       │   │   ├── splitk_gemm_fp8.py
│       │   │   └── tma_gemm.py
│       │   ├── gptq/
│       │   │   ├── a100_qlinear.py
│       │   │   ├── benchmark.py
│       │   │   ├── h100_qlinear.py
│       │   │   ├── mixtral/
│       │   │   │   ├── test_dequant_moe_gemm.py
│       │   │   │   └── w4a16_fused_dequant_gemm.py
│       │   │   ├── small_benchmark_cuda_graphs.py
│       │   │   └── splitk_dequant_gemm.py
│       │   ├── mamba/
│       │   │   └── causal_1d_conv/
│       │   │       ├── causal_1d_conv/
│       │   │       │   └── causal_1d_conv.py
│       │   │       └── tests/
│       │   │           └── test_causal_1d_conv.py
│       │   ├── paged_attention/
│       │   │   └── attention_triton.py
│       │   └── torch_compile/
│       │       └── flash_backward.py
│       ├── training/
│       │   ├── README.md
│       │   ├── fused_softmax/
│       │   │   ├── README.md
│       │   │   └── softmax.py
│       │   └── rms_norm/
│       │       └── fused_rms_norm.py
│       └── tutorials/
│           └── README.md
├── readme.md
└── tutorials/
    └── triton/
        ├── kernels/
        │   ├── __init__.py
        │   ├── flash_attention_fwd.py
        │   ├── fused_softmax.py
        │   ├── readme.md
        │   └── vector_add.py
        └── tests/
            ├── test_softmax.py
            └── test_utils.py