SYMBOL INDEX (484 symbols across 72 files)

FILE: benchmark/bench_flash_mla.py
  function scaled_dot_product_attention (line 15) | def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal...
  function run_torch_mla (line 36) | def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size,...
  function run_flash_mla (line 63) | def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size,...
  function run_flash_infer (line 82) | def run_flash_infer(q, block_table, blocked_k, max_seqlen_pad, block_siz...
  function _mla_attn_kernel (line 136) | def _mla_attn_kernel(
  function _mla_attn (line 222) | def _mla_attn(
  function _mla_softmax_reducev_kernel (line 274) | def _mla_softmax_reducev_kernel(
  function _mla_softmax_reducev (line 323) | def _mla_softmax_reducev(
  function mla_decode_triton (line 346) | def mla_decode_triton(
  function run_flash_mla_triton (line 381) | def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, bloc...
  function compare_ab (line 410) | def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv...
  function compare_a (line 450) | def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, d...
  function get_args (line 493) | def get_args():

FILE: benchmark/visualize.py
  function parse_args (line 7) | def parse_args():

FILE: csrc/api/api.cpp
  function PYBIND11_MODULE (line 8) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: csrc/api/common.h
  function is_sm90a (line 21) | struct Arch {
  function int64_stride_to_int (line 44) | inline int int64_stride_to_int(int64_t orig_stride) {
  function get_enum_max (line 124) | constexpr std::size_t get_enum_max(){
  function std (line 133) | constexpr std::string get_dynamic_enum_name(T value){
  function virtual (line 171) | constexpr virtual inline std::span<const FeatureT> get_supported_feature...

FILE: csrc/api/sparse_decode.h
  type class (line 14) | enum class
  type DecodeImplMeta (line 30) | struct DecodeImplMeta {

FILE: csrc/api/sparse_fwd.h
  type class (line 12) | enum class
  function class (line 29) | class Fwd_Sm90_Impl : public FwdImplBase {
  function class (line 50) | class Fwd_Sm100_Head64_Impl : public FwdImplBase {
  function class (line 68) | class Fwd_Sm100_Head128_Impl : public FwdImplBase {
  function class (line 86) | class Fwd_Sm100_Head128_Small_TopK_Impl : public FwdImplBase {

FILE: csrc/defines.h
  type int32x8_t (line 13) | struct int32x8_t {
  type float8 (line 17) | struct float8 {
  type bf16x8 (line 21) | struct bf16x8 {

FILE: csrc/kerutils/include/kerutils/common/common.h
  function namespace (line 3) | namespace kerutils {}

FILE: csrc/kerutils/include/kerutils/device/common.h
  type class (line 16) | enum class
  function PrefetchSize (line 25) | enum class PrefetchSize {

FILE: csrc/kerutils/include/kerutils/host/host.h
  function namespace (line 15) | namespace kerutils {

FILE: csrc/kerutils/include/kerutils/supplemental/torch_tensors.h
  function namespace (line 9) | namespace kerutils {

FILE: csrc/params.h
  function ModelType (line 5) | enum class ModelType {
  type DenseAttnDecodeParams (line 19) | struct DenseAttnDecodeParams { // TODO Change name to DenseAttnDecodeParams
  type SparseAttnDecodeParams (line 63) | struct SparseAttnDecodeParams {
  type CombineParams (line 105) | struct CombineParams {
  type GetDecodeSchedMetaParams (line 127) | struct GetDecodeSchedMetaParams {
  type SparseAttnFwdParams (line 145) | struct SparseAttnFwdParams {
  function SparseAttnFwdMode (line 171) | enum class SparseAttnFwdMode {

FILE: csrc/sm100/decode/head64/config.h
  function namespace (line 14) | namespace sm100::decode::head64 {

FILE: csrc/sm100/decode/head64/kernel.h
  function namespace (line 5) | namespace sm100::decode::head64 {

FILE: csrc/sm100/helpers.h
  function namespace (line 9) | namespace sm100 {

FILE: csrc/sm100/prefill/dense/collective/fmha_common.hpp
  type cutlass::fmha::collective (line 37) | namespace cutlass::fmha::collective {
    function CUTE_DEVICE (line 42) | CUTE_DEVICE void gemm_reset_zero_acc(Atom& atom, TA const& tA, TB cons...
    function CUTE_DEVICE (line 56) | CUTE_DEVICE void gemm_zero_acc(Atom& atom, TA const& tA, TB const& tB,...
    function CUTE_DEVICE (line 62) | CUTE_DEVICE constexpr auto unstageSmemLayout(Layout const& layout, Sta...
    function CUTE_DEVICE (line 67) | CUTE_DEVICE T warp_uniform(T a) {

FILE: csrc/sm100/prefill/dense/collective/fmha_fusion.hpp
  type cutlass::fmha::collective (line 37) | namespace cutlass::fmha::collective {
    type NoMask (line 41) | struct NoMask {
      method CUTLASS_DEVICE (line 43) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 53) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 63) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 73) | CUTLASS_DEVICE
    type ResidualMask (line 83) | struct ResidualMask : NoMask {
      method CUTLASS_DEVICE (line 88) | CUTLASS_DEVICE int get_masked_trip_count(
      method CUTLASS_DEVICE (line 100) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 114) | CUTLASS_DEVICE
    type ResidualMaskForBackward (line 135) | struct ResidualMaskForBackward : NoMask {
      method CUTLASS_DEVICE (line 140) | CUTLASS_DEVICE int get_masked_trip_count(
      method CUTLASS_DEVICE (line 152) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 166) | CUTLASS_DEVICE
    type CausalMask (line 191) | struct CausalMask : NoMask {
      method CUTLASS_DEVICE (line 198) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 218) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 234) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 244) | CUTLASS_DEVICE
    type CausalForBackwardMask (line 280) | struct CausalForBackwardMask : CausalMask<kIsQBegin>, ResidualMaskForB...
      method CUTLASS_DEVICE (line 285) | CUTLASS_DEVICE
    type VariableLength (line 316) | struct VariableLength {
      method CUTE_HOST_DEVICE (line 321) | CUTE_HOST_DEVICE operator int() const {
    type is_variable_length_impl (line 326) | struct is_variable_length_impl : std::false_type {}
    type is_variable_length_impl<VariableLength> (line 327) | struct is_variable_length_impl<VariableLength> : std::true_type {}
    function CUTE_HOST_DEVICE (line 331) | CUTE_HOST_DEVICE
    function CUTE_HOST_DEVICE (line 345) | CUTE_HOST_DEVICE
    function CUTE_HOST_DEVICE (line 361) | CUTE_HOST_DEVICE
  type cute (line 386) | namespace cute {
    type is_integral<cutlass::fmha::collective::VariableLength> (line 389) | struct is_integral<cutlass::fmha::collective::VariableLength> : true_t...
    function CUTE_HOST_DEVICE (line 391) | CUTE_HOST_DEVICE

FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp
  type cutlass::fmha::collective (line 38) | namespace cutlass::fmha::collective {
    type Sm100FmhaFwdEpilogueTmaWarpspecialized (line 48) | struct Sm100FmhaFwdEpilogueTmaWarpspecialized {
      type TensorStorage (line 64) | struct TensorStorage {
      type Arguments (line 71) | struct Arguments {
      type Params (line 86) | struct Params {
      method CUTLASS_DEVICE (line 96) | CUTLASS_DEVICE static constexpr
      method Params (line 107) | static Params to_underlying_arguments(
      method CUTLASS_DEVICE (line 145) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 152) | CUTLASS_DEVICE Sm100FmhaFwdEpilogueTmaWarpspecialized(const Params& ...
      method store (line 155) | CUTLASS_DEVICE auto

FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp
  type cutlass::fmha::collective (line 44) | namespace cutlass::fmha::collective {
    type Sm100FmhaFwdMainloopTmaWarpspecialized (line 65) | struct Sm100FmhaFwdMainloopTmaWarpspecialized {
      type TensorStorage (line 113) | struct TensorStorage {
      type TmemAllocation (line 121) | enum class TmemAllocation : uint32_t {
      type Arguments (line 187) | struct Arguments {
      type Params (line 202) | struct Params {
      method can_implement (line 212) | static bool can_implement(ProblemShape const& problem_shape, Argumen...
      method Params (line 217) | static Params to_underlying_arguments(
      method CUTLASS_DEVICE (line 236) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 242) | CUTLASS_DEVICE void
      method mma (line 258) | CUTLASS_DEVICE auto
      method softmax_step (line 514) | CUTLASS_DEVICE auto
      method softmax (line 714) | CUTLASS_DEVICE auto
      method correction_epilogue (line 778) | CUTLASS_DEVICE auto
      method correction_rescale (line 868) | CUTLASS_DEVICE auto
      method correction (line 954) | CUTLASS_DEVICE auto
      method correction_empty (line 1142) | CUTLASS_DEVICE auto

FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_load_tma_warpspecialized.hpp
  type cutlass::fmha::collective (line 42) | namespace cutlass::fmha::collective {
    type Sm100FmhaLoadTmaWarpspecialized (line 62) | struct Sm100FmhaLoadTmaWarpspecialized {
      type Arguments (line 67) | struct Arguments {
      type Params (line 80) | struct Params {
      method Params (line 87) | static Params to_underlying_arguments(
      method CUTLASS_DEVICE (line 141) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 149) | CUTLASS_DEVICE void

FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_mla_fwd_mainloop_tma_warpspecialized.hpp
  type cutlass::fmha::collective (line 45) | namespace cutlass::fmha::collective {
    type Sm100MlaFwdMainloopTmaWarpspecialized (line 65) | struct Sm100MlaFwdMainloopTmaWarpspecialized {
      type TensorStorageQKVO (line 127) | struct TensorStorageQKVO {
      type TensorStorageQKV (line 134) | struct TensorStorageQKV {
      type TmemAllocation (line 142) | enum class TmemAllocation : uint32_t {
      type Arguments (line 205) | struct Arguments {
      type Params (line 220) | struct Params {
      method can_implement (line 230) | static bool can_implement(ProblemShape const& problem_shape, Argumen...
      method Params (line 235) | static Params to_underlying_arguments(
      method CUTLASS_DEVICE (line 254) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 260) | CUTLASS_DEVICE void
      method mma (line 276) | CUTLASS_DEVICE auto
      method softmax_step (line 532) | CUTLASS_DEVICE auto
      method softmax (line 735) | CUTLASS_DEVICE auto
      method correction_epilogue (line 786) | CUTLASS_DEVICE auto
      method correction_rescale (line 876) | CUTLASS_DEVICE auto
      method correction (line 962) | CUTLASS_DEVICE auto
      method correction_empty (line 1149) | CUTLASS_DEVICE auto

FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_mla_load_tma_warpspecialized.hpp
  type cutlass::fmha::collective (line 42) | namespace cutlass::fmha::collective {
    type Sm100MlaFwdLoadTmaWarpspecialized (line 63) | struct Sm100MlaFwdLoadTmaWarpspecialized {
      type Arguments (line 74) | struct Arguments {
      type Params (line 87) | struct Params {
      method Params (line 94) | static Params to_underlying_arguments(
      method CUTLASS_DEVICE (line 149) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 157) | CUTLASS_DEVICE void

FILE: csrc/sm100/prefill/dense/common/gather_tensor.hpp
  type example (line 37) | namespace example {
    type NoGather (line 42) | struct NoGather
      method NoGather (line 45) | NoGather(Ts...) {}
    type IndexedGather (line 50) | struct IndexedGather
      method CUTE_HOST_DEVICE (line 52) | CUTE_HOST_DEVICE constexpr
      method CUTE_HOST_DEVICE (line 56) | CUTE_HOST_DEVICE constexpr
      method print (line 61) | void
    type StridedGather (line 72) | struct StridedGather
      method CUTE_HOST_DEVICE (line 74) | CUTE_HOST_DEVICE constexpr
      method CUTE_HOST_DEVICE (line 78) | CUTE_HOST_DEVICE constexpr
      method print (line 83) | void
    type CustomStride (line 95) | struct CustomStride
      method CUTE_HOST_DEVICE (line 101) | CUTE_HOST_DEVICE constexpr friend
      method CUTE_HOST_DEVICE (line 106) | CUTE_HOST_DEVICE constexpr friend
      method print (line 111) | void
      method CUTE_HOST_DEVICE (line 121) | CUTE_HOST_DEVICE constexpr friend
      method CUTE_HOST_DEVICE (line 130) | CUTE_HOST_DEVICE constexpr friend
    function make_custom_stride_layout (line 142) | CUTLASS_HOST_DEVICE
    function make_gather_tensor (line 155) | CUTLASS_HOST_DEVICE
  type cute (line 171) | namespace cute
    function CUTE_HOST_DEVICE (line 175) | CUTE_HOST_DEVICE constexpr
    function CUTE_HOST_DEVICE (line 195) | CUTE_HOST_DEVICE constexpr

FILE: csrc/sm100/prefill/dense/common/pipeline_mla.hpp
  type cutlass (line 40) | namespace cutlass {
    class PipelineTmaAsyncMla (line 49) | class PipelineTmaAsyncMla {
      method CUTLASS_DEVICE (line 72) | static
      method CUTLASS_DEVICE (line 90) | static
      method CUTLASS_DEVICE (line 110) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 119) | CUTLASS_DEVICE
      method if (line 142) | if constexpr (cute::is_same_v<InitBarriers, cute::true_type>) {
      method if (line 147) | if constexpr (cute::is_same_v<InitMasks, cute::true_type>) {
    function CUTLASS_DEVICE (line 171) | CUTLASS_DEVICE
    function CUTLASS_DEVICE (line 176) | CUTLASS_DEVICE
    function CUTLASS_DEVICE (line 198) | CUTLASS_DEVICE
    function CUTLASS_DEVICE (line 203) | CUTLASS_DEVICE
    function CUTLASS_DEVICE (line 208) | CUTLASS_DEVICE
    function CUTLASS_DEVICE (line 213) | CUTLASS_DEVICE
    function CUTLASS_DEVICE (line 228) | CUTLASS_DEVICE

FILE: csrc/sm100/prefill/dense/common/pow_2.hpp
  type cutlass::fmha (line 39) | namespace cutlass::fmha {
    type Pow2 (line 41) | struct Pow2 {
      method CUTE_HOST_DEVICE (line 52) | CUTE_HOST_DEVICE T operator *(T const& b) const {
    function CUTE_HOST_DEVICE (line 77) | CUTE_HOST_DEVICE bool operator<(T const& a, Pow2 const& b) {
    function CUTE_HOST_DEVICE (line 81) | CUTE_HOST_DEVICE void print(Pow2 const& a) {
  type cute (line 87) | namespace cute {
    type is_integral<cutlass::fmha::Pow2> (line 90) | struct is_integral<cutlass::fmha::Pow2> : true_type {}

FILE: csrc/sm100/prefill/dense/common/utils.hpp
  type cutlass_dtype (line 8) | struct cutlass_dtype {
  type cutlass_dtype<half> (line 13) | struct cutlass_dtype<half> {
  type cutlass_dtype<nv_bfloat16> (line 18) | struct cutlass_dtype<nv_bfloat16> {
  type cutlass_dtype<__nv_fp8_e4m3> (line 23) | struct cutlass_dtype<__nv_fp8_e4m3> {
  type cutlass_dtype<__nv_fp8_e5m2> (line 28) | struct cutlass_dtype<__nv_fp8_e5m2> {

FILE: csrc/sm100/prefill/dense/device/fmha.hpp
  type cutlass::fmha::device (line 49) | namespace cutlass::fmha::device {
    class FMHA (line 56) | class FMHA {
      method is_initialized (line 72) | bool is_initialized(bool set = false) {
      method Params (line 81) | Params const& params() const {
      method Status (line 86) | static Status
      method get_workspace_size (line 97) | static size_t
      method dim3 (line 105) | static dim3
      method maximum_active_blocks (line 111) | static int maximum_active_blocks(int /* smem_capacity */ = -1) {
      method Status (line 153) | Status
      method Status (line 190) | Status
      method Status (line 205) | static Status
      method Status (line 249) | Status
      method Status (line 259) | Status
      method Status (line 265) | Status
      method Status (line 271) | Status

FILE: csrc/sm100/prefill/dense/device/fmha_device_bwd.hpp
  type cutlass::fmha::device (line 48) | namespace cutlass::fmha::device {
    class Sm100FmhaBwd (line 62) | class Sm100FmhaBwd {
      type Arguments (line 65) | struct Arguments {
      type Params (line 119) | struct Params {
      method to_sum_OdO_arguments (line 130) | static typename OperationSumOdO::Arguments to_sum_OdO_arguments(
      method to_convert_arguments (line 153) | static typename OperationConvert::Arguments to_convert_arguments(Arg...
      method to_bwd_arguments (line 172) | static typename Operation::Arguments to_bwd_arguments(
      method Status (line 197) | static Status
      method get_workspace_size (line 220) | static size_t
      method Status (line 237) | Status
      method Status (line 266) | Status
      method Status (line 286) | static Status
      method Status (line 319) | Status
      method Status (line 329) | Status

FILE: csrc/sm100/prefill/dense/kernel/fmha_causal_tile_scheduler.hpp
  type cutlass::fmha::kernel (line 38) | namespace cutlass::fmha::kernel {
    type CausalIndividualTileScheduler (line 45) | struct CausalIndividualTileScheduler {
      type Params (line 51) | struct Params {
      method CUTLASS_DEVICE (line 62) | CUTLASS_DEVICE
      method Params (line 66) | static Params to_underlying_arguments(
      method dim3 (line 78) | static dim3 get_grid_shape(Params const& params) {
      method CUTLASS_DEVICE (line 82) | CUTLASS_DEVICE
      method get_block_coord (line 87) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 112) | CUTLASS_DEVICE
    type CausalPersistentTileScheduler (line 125) | struct CausalPersistentTileScheduler {
      type Params (line 127) | struct Params {
      method Params (line 143) | static Params to_underlying_arguments(
      method dim3 (line 168) | static dim3 get_grid_shape(Params const& params) {
      method CUTLASS_DEVICE (line 173) | CUTLASS_DEVICE
      method get_block_coord (line 178) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 189) | CUTLASS_DEVICE

FILE: csrc/sm100/prefill/dense/kernel/fmha_kernel_bwd_convert.hpp
  type cutlass::fmha::kernel (line 39) | namespace cutlass::fmha::kernel {
    type FmhaKernelBwdConvert (line 44) | struct FmhaKernelBwdConvert {
      type Arguments (line 46) | struct Arguments {
      method get_workspace_size (line 77) | static size_t get_workspace_size(Arguments const& args) { return 0; }
      method initialize_workspace (line 78) | static cutlass::Status initialize_workspace(Arguments const&, void*,...
      method can_implement (line 88) | static bool can_implement(Arguments const& args) {
      method dim3 (line 92) | static dim3 get_grid_shape(Params const& params) {
      method dim3 (line 97) | static dim3 get_block_shape() {
      method Params (line 102) | static Params to_underlying_arguments(Arguments const& args, void* w...
      method CUTLASS_DEVICE (line 107) | CUTLASS_DEVICE void copy(Params const& params, const ElementAcc* ptr...
      method CUTLASS_DEVICE (line 141) | CUTLASS_DEVICE void operator()(const Params &params, char* smem) {

FILE: csrc/sm100/prefill/dense/kernel/fmha_kernel_bwd_sum_OdO.hpp
  type cutlass::fmha::kernel (line 39) | namespace cutlass::fmha::kernel {
    type FmhaKernelBwdSumOdO (line 44) | struct FmhaKernelBwdSumOdO {
      type Arguments (line 46) | struct Arguments {
      method get_workspace_size (line 76) | static size_t get_workspace_size(Arguments const& args) { return 0; }
      method initialize_workspace (line 77) | static cutlass::Status initialize_workspace(Arguments const&, void*,...
      method can_implement (line 89) | static bool can_implement(Arguments const& args) {
      method dim3 (line 93) | static dim3 get_grid_shape(Params const& params) {
      method dim3 (line 98) | static dim3 get_block_shape() {
      method Params (line 103) | static Params to_underlying_arguments(Arguments const& args, void* w...
      method CUTLASS_DEVICE (line 107) | CUTLASS_DEVICE void operator()(const Params &params, char* smem) {

FILE: csrc/sm100/prefill/dense/kernel/fmha_options.hpp
  type cutlass::fmha::kernel (line 38) | namespace cutlass::fmha::kernel {
    type find_option (line 41) | struct find_option
    type find_option<kTag, Default> (line 44) | struct find_option<kTag, Default> {
    type Tag (line 60) | enum class Tag {
    type Option (line 80) | struct Option {
  type find_option<kTag, Default, Option, Options...> (line 49) | struct find_option<kTag, Default, Option, Options...> :

FILE: csrc/sm100/prefill/dense/kernel/fmha_tile_scheduler.hpp
  type cutlass::fmha::kernel (line 40) | namespace cutlass::fmha::kernel {
    type IndividualTileScheduler (line 44) | struct IndividualTileScheduler {
      type Params (line 46) | struct Params {
      method CUTLASS_DEVICE (line 52) | CUTLASS_DEVICE
      method Params (line 56) | static Params to_underlying_arguments(
      method dim3 (line 64) | static dim3 get_grid_shape(Params const& params) {
      method CUTLASS_DEVICE (line 68) | CUTLASS_DEVICE
      method get_block_coord (line 73) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 79) | CUTLASS_DEVICE
    type PersistentTileScheduler (line 88) | struct PersistentTileScheduler {
      type Params (line 90) | struct Params {
      method Params (line 106) | static Params to_underlying_arguments(
      method dim3 (line 131) | static dim3 get_grid_shape(Params const& params) {
      method CUTLASS_DEVICE (line 136) | CUTLASS_DEVICE
      method get_block_coord (line 141) | CUTLASS_DEVICE
      method CUTLASS_DEVICE (line 152) | CUTLASS_DEVICE

FILE: csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp
  type cutlass::fmha::kernel (line 49) | namespace cutlass::fmha::kernel {
    type Sm100FmhaBwdKernelTmaWarpSpecialized (line 62) | struct Sm100FmhaBwdKernelTmaWarpSpecialized {
      type TmemAllocation (line 72) | struct TmemAllocation {
      type WarpRole (line 87) | enum class WarpRole {
      method CUTLASS_DEVICE (line 94) | CUTLASS_DEVICE WarpRole warp_idx_to_role(int warp_idx) {
      type RegisterAllocation (line 98) | struct RegisterAllocation {
      type PipelineStorage (line 204) | struct PipelineStorage {
      method CUTE_DEVICE (line 218) | static CUTE_DEVICE constexpr auto restage(Layout const& layout, Stag...
      type TensorStorage (line 242) | struct TensorStorage {
      type SharedStorage (line 271) | struct SharedStorage {
      type MainloopArguments (line 284) | struct MainloopArguments {
      type MainloopParams (line 316) | struct MainloopParams {
      type EpilogueArguments (line 324) | struct EpilogueArguments {
      type Arguments (line 331) | struct Arguments {
      type Params (line 338) | struct Params {
      method can_implement (line 347) | static bool can_implement(Arguments const& args) {
      method Status (line 360) | static Status initialize_workspace(Arguments const&, void*, cudaStre...
      method Params (line 365) | static Params to_underlying_arguments(Arguments const& args, void*) {
      method quantize (line 414) | static CUTLASS_DEVICE auto quantize(T const& input) {
      method CUTLASS_DEVICE (line 432) | CUTLASS_DEVICE void load(
      method CUTLASS_DEVICE (line 661) | CUTLASS_DEVICE void mma(
      method CUTLASS_DEVICE (line 946) | CUTLASS_DEVICE void store(
      method CUTLASS_DEVICE (line 971) | CUTLASS_DEVICE void epilogue_clear(
      method CUTLASS_DEVICE (line 1015) | CUTLASS_DEVICE void epilogue(
      method CUTLASS_DEVICE (line 1119) | CUTLASS_DEVICE void compute(
      method CUTLASS_DEVICE (line 1392) | CUTLASS_DEVICE void reduce(
      method CUTLASS_DEVICE (line 1489) | CUTLASS_DEVICE void operator()(Params const& params, char* smem) {
      method dim3 (line 1822) | static dim3 get_block_shape() {
      method dim3 (line 1827) | static dim3 get_grid_shape(Params const& params) {

FILE: csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_mla_kernel_tma_warpspecialized.hpp
  type cutlass::fmha::kernel (line 49) | namespace cutlass::fmha::kernel {
    type Sm100FmhaBwdMlaKernelTmaWarpSpecialized (line 62) | struct Sm100FmhaBwdMlaKernelTmaWarpSpecialized {
      type TmemAllocation (line 70) | struct TmemAllocation {
      type WarpRole (line 85) | enum class WarpRole {
      method CUTLASS_DEVICE (line 95) | CUTLASS_DEVICE WarpRole warp_idx_to_role(int warp_idx) {
      type RegisterAllocation (line 99) | struct RegisterAllocation {
      type PipelineStorage (line 205) | struct PipelineStorage {
      method CUTE_DEVICE (line 219) | static CUTE_DEVICE constexpr auto restage(Layout const& layout, Stag...
      type TensorStorage (line 245) | struct TensorStorage {
      type SharedStorage (line 278) | struct SharedStorage {
      type MainloopArguments (line 291) | struct MainloopArguments {
      type MainloopParams (line 323) | struct MainloopParams {
      type EpilogueArguments (line 331) | struct EpilogueArguments {
      type Arguments (line 338) | struct Arguments {
      type Params (line 345) | struct Params {
      method can_implement (line 354) | static bool can_implement(Arguments const& args) {
      method Status (line 367) | static Status initialize_workspace(Arguments const&, void*, cudaStre...
      method Params (line 372) | static Params to_underlying_arguments(Arguments const& args, void*) {
      method quantize (line 421) | static CUTLASS_DEVICE auto quantize(T const& input) {
      method CUTLASS_DEVICE (line 439) | CUTLASS_DEVICE void load(
      method CUTLASS_DEVICE (line 667) | CUTLASS_DEVICE void mma(
      method CUTLASS_DEVICE (line 951) | CUTLASS_DEVICE void store(
      method CUTLASS_DEVICE (line 976) | CUTLASS_DEVICE void epilogue_clear(
      method CUTLASS_DEVICE (line 1021) | CUTLASS_DEVICE void epilogue(
      method CUTLASS_DEVICE (line 1125) | CUTLASS_DEVICE void compute(
      method CUTLASS_DEVICE (line 1386) | CUTLASS_DEVICE void reduce(
      method CUTLASS_DEVICE (line 1483) | CUTLASS_DEVICE void operator()(Params const& params, char* smem) {
      method dim3 (line 1816) | static dim3 get_block_shape() {
      method dim3 (line 1821) | static dim3 get_grid_shape(Params const& params) {

FILE: csrc/sm100/prefill/dense/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp
  type cutlass::fmha::kernel (line 47) | namespace cutlass::fmha::kernel {
    type Sm100FmhaCtxKernelWarpspecializedSchedule (line 52) | struct Sm100FmhaCtxKernelWarpspecializedSchedule {
      type WarpRole (line 54) | enum class WarpRole {
      method WarpRole (line 64) | static constexpr WarpRole warp_idx_to_WarpRole(int warp_idx) {
    type Sm100MlaFwdCtxKernelWarpspecializedSchedule (line 91) | struct Sm100MlaFwdCtxKernelWarpspecializedSchedule {
      type WarpRole (line 93) | enum class WarpRole {
      method WarpRole (line 103) | static constexpr WarpRole warp_idx_to_WarpRole(int warp_idx) {
    type Sm100FmhaFwdKernelTmaWarpspecialized (line 136) | struct Sm100FmhaFwdKernelTmaWarpspecialized {
      method WarpRole (line 143) | constexpr WarpRole warp_idx_to_WarpRole(int warp_idx) {
      type SharedStorage (line 168) | struct SharedStorage {
        type PipelineStorage (line 188) | struct PipelineStorage {
      type Arguments (line 205) | struct Arguments {
      type Params (line 212) | struct Params {
      method get_workspace_size (line 223) | static size_t get_workspace_size(Arguments const& args) { return 0; }
      method initialize_workspace (line 224) | static cutlass::Status initialize_workspace(Arguments const&, void*,...
      method can_implement (line 228) | static bool can_implement(Arguments const& args) {
      method dim3 (line 232) | static dim3 get_grid_shape(Params const& params) {
      method dim3 (line 236) | static dim3 get_block_shape() {
      method Params (line 241) | static Params to_underlying_arguments(Arguments const& args, void* w...
      method apply_batch (line 250) | CUTLASS_DEVICE auto apply_batch(const Params &params, ProblemShape c...
      method CUTLASS_DEVICE (line 254) | CUTLASS_DEVICE void operator()(const Params &params, char* smem) {

FILE: csrc/sm100/prefill/sparse/common_subroutine.h
  function namespace (line 6) | namespace sm100 {

FILE: csrc/sm100/prefill/sparse/fwd/head128/config.h
  function namespace (line 10) | namespace sm100::fwd::head128 {

FILE: csrc/sm100/prefill/sparse/fwd/head128/phase1.h
  function namespace (line 5) | namespace sm100::fwd::head128 {

FILE: csrc/sm100/prefill/sparse/fwd/head64/config.h
  function namespace (line 8) | namespace sm100::fwd::head64 {

FILE: csrc/sm100/prefill/sparse/fwd/head64/phase1.h
  function namespace (line 5) | namespace sm100::fwd::head64 {

FILE: csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/config.h
  function namespace (line 12) | namespace sm100::fwd_for_small_topk::head128 {

FILE: csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/phase1.h
  function namespace (line 5) | namespace sm100::fwd_for_small_topk::head128 {

FILE: csrc/sm90/decode/dense/config.h
  function namespace (line 3) | namespace Config {

FILE: csrc/sm90/decode/dense/splitkv_mla.h
  function namespace (line 5) | namespace sm90 {

FILE: csrc/sm90/decode/dense/traits.h
  type SharedMemoryPlan (line 71) | struct SharedMemoryPlan {
  type NamedBarriers (line 101) | enum NamedBarriers : int {

FILE: csrc/sm90/decode/sparse_fp8/components/config.h
  function namespace (line 10) | namespace sm90::decode::sparse_fp8 {

FILE: csrc/sm90/decode/sparse_fp8/components/dequant.h
  type fp8x8 (line 10) | struct fp8x8 {
  type fp8x16 (line 15) | struct fp8x16 {
  function bf16x8 (line 21) | bf16x8 cvt_fp8x8_bf16x8(const fp8x8 &inputs, const __nv_bfloat162 &scale...
  type class (line 36) | enum class
  function L2PrefetchHint (line 43) | enum class L2PrefetchHint {

FILE: csrc/sm90/decode/sparse_fp8/components/helpers.h
  function namespace (line 10) | namespace sm90::decode::sparse_fp8 {
  function st_async_128b (line 79) | void st_async_128b(void* dst_ptr, const T& data, const transac_bar_t* mb...
  function CUTE_DEVICE (line 90) | CUTE_DEVICE

FILE: csrc/sm90/decode/sparse_fp8/config.h
  function namespace (line 13) | namespace sm90::decode::sparse_fp8 {

FILE: csrc/sm90/decode/sparse_fp8/splitkv_mla.h
  function namespace (line 5) | namespace sm90::decode::sparse_fp8 {

FILE: csrc/sm90/helpers.h
  function namespace (line 6) | namespace sm90 {

FILE: csrc/sm90/prefill/sparse/config.h
  function namespace (line 14) | namespace sm90::fwd {

FILE: csrc/sm90/prefill/sparse/fwd.h
  function namespace (line 5) | namespace sm90 {

FILE: csrc/sm90/prefill/sparse/phase1.h
  function namespace (line 5) | namespace sm90::fwd {

FILE: csrc/smxx/decode/combine/combine.h
  function namespace (line 5) | namespace smxx::decode {

FILE: csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.h
  function namespace (line 5) | namespace smxx::decode {

FILE: csrc/utils.h
  type RingBufferState (line 58) | struct RingBufferState {
  function RingBufferState (line 75) | RingBufferState offset_by(const int offset) const {

FILE: flash_mla/flash_mla_interface.py
  class FlashMLASchedMeta (line 9) | class FlashMLASchedMeta:
    class Config (line 15) | class Config:
  function get_mla_metadata (line 37) | def get_mla_metadata(
  function flash_mla_with_kvcache (line 53) | def flash_mla_with_kvcache(
  function flash_mla_sparse_fwd (line 176) | def flash_mla_sparse_fwd(
  function _flash_attn_varlen_forward (line 214) | def _flash_attn_varlen_forward(
  function _flash_attn_varlen_backward (line 261) | def _flash_attn_varlen_backward(
  class FlashAttnVarlenFunc (line 328) | class FlashAttnVarlenFunc(torch.autograd.Function):
    method forward (line 329) | def forward(
    method backward (line 356) | def backward(
  function flash_attn_varlen_func (line 372) | def flash_attn_varlen_func(
  function flash_attn_varlen_qkvpacked_func (line 395) | def flash_attn_varlen_qkvpacked_func(
  function flash_attn_varlen_kvpacked_func (line 415) | def flash_attn_varlen_kvpacked_func(

FILE: setup.py
  function is_flag_set (line 16) | def is_flag_set(flag: str) -> bool:
  function get_features_args (line 19) | def get_features_args():
  function get_arch_flags (line 25) | def get_arch_flags():
  function get_nvcc_thread_args (line 48) | def get_nvcc_thread_args():

FILE: tests/kernelkit/bench.py
  class empty_suppress (line 9) | class empty_suppress:
    method __enter__ (line 10) | def __enter__(self):
    method __exit__ (line 13) | def __exit__(self, *_):
  function profiler_range_start_marker_kernel (line 17) | def profiler_range_start_marker_kernel():
  function _run_profiler_range_start_marker_kernel (line 20) | def _run_profiler_range_start_marker_kernel():
  class BenchKinetoRawResult (line 24) | class BenchKinetoRawResult:
    method _get_matched_kernel_name (line 33) | def _get_matched_kernel_name(self, name_substr: str, allow_no_match: b...
    method get_kernel_names (line 42) | def get_kernel_names(self) -> List[str]:
    method get_kernel_times (line 45) | def get_kernel_times(self, kernel_names_substr: List[str], allow_indiv...
    method get_kernel_time (line 74) | def get_kernel_time(self, kernel_name_substr: str) -> float:
    method get_e2e_time (line 77) | def get_e2e_time(self, start_kernel_name_substr: str, end_kenrel_name_...
  function bench_kineto (line 103) | def bench_kineto(fn: Callable, num_tests: int = 30,
  function bench_by_cuda_events (line 161) | def bench_by_cuda_events(kernels: List[Callable], num_warmups_each: int,...
  function bench_by_cuda_events (line 164) | def bench_by_cuda_events(kernels: Callable, num_warmups_each: int, num_r...
  function bench_by_cuda_events (line 166) | def bench_by_cuda_events(kernels: Union[List[Callable], Callable], num_w...

FILE: tests/kernelkit/compare.py
  function check_is_bitwise_equal_comparator (line 5) | def check_is_bitwise_equal_comparator(ans: torch.Tensor, ref: torch.Tens...
  function check_is_bitwise_equal (line 13) | def check_is_bitwise_equal(name: str, ans: torch.Tensor, ref: torch.Tens...
  function get_cos_diff (line 19) | def get_cos_diff(ans: torch.Tensor, ref: torch.Tensor) -> float:
  function check_is_allclose (line 31) | def check_is_allclose(name: str, ans: torch.Tensor, ref: torch.Tensor, a...
  function check_is_allclose_comparator (line 94) | def check_is_allclose_comparator(name: str, ans: torch.Tensor, ref: torc...

FILE: tests/kernelkit/generate.py
  function _get_new_non_contiguous_tensor_shape (line 3) | def _get_new_non_contiguous_tensor_shape(shape):
  function gen_non_contiguous_randn_tensor (line 10) | def gen_non_contiguous_randn_tensor(shape, *args, **kwargs):
  function gen_non_contiguous_tensor (line 16) | def gen_non_contiguous_tensor(shape, *args, **kwargs):
  function non_contiguousify (line 22) | def non_contiguousify(tensor: torch.Tensor) -> torch.Tensor:

FILE: tests/kernelkit/precision.py
  class LowPrecisionMode (line 5) | class LowPrecisionMode:
    method __init__ (line 6) | def __init__(self, enabled: bool = True):
    method __enter__ (line 9) | def __enter__(self):
    method __exit__ (line 13) | def __exit__(self, exc_type, exc_value, traceback):
  function is_low_precision_mode (line 17) | def is_low_precision_mode() -> bool:
  function optional_cast_to_bf16_and_cast_back (line 23) | def optional_cast_to_bf16_and_cast_back(tensor: torch.Tensor) -> torch.T...

FILE: tests/kernelkit/utils.py
  function cdiv (line 18) | def cdiv(a: int, b: int) -> int:
  function is_using_profiling_tools (line 22) | def is_using_profiling_tools() -> bool:
  function set_random_seed (line 33) | def set_random_seed(seed: int):
  class Counter (line 44) | class Counter:
    method __init__ (line 45) | def __init__(self):
    method next (line 48) | def next(self) -> int:

FILE: tests/lib.py
  class TestTarget (line 13) | class TestTarget(enum.Enum):
  class ExtraTestParamForDecode (line 18) | class ExtraTestParamForDecode:
  class TestParam (line 29) | class TestParam:
  class RawTestParamForDecode (line 46) | class RawTestParamForDecode:
    method to_test_param (line 74) | def to_test_param(self) -> TestParam:
  class Testcase (line 90) | class Testcase:
  function _randperm_batch (line 100) | def _randperm_batch(batch_size: int, perm_range: torch.Tensor, perm_size...
  function generate_testcase (line 121) | def generate_testcase(t: TestParam) -> Testcase:
  class KVScope (line 168) | class KVScope:
    method quant_and_dequant_ (line 178) | def quant_and_dequant_(self):
    method get_kvcache_for_flash_mla (line 195) | def get_kvcache_for_flash_mla(self) -> torch.Tensor:
    method apply_perm (line 202) | def apply_perm(self, perm: torch.Tensor) -> "KVScope":
  class TestcaseForDecode (line 219) | class TestcaseForDecode:
  function generate_testcase_for_decode (line 227) | def generate_testcase_for_decode(t: TestParam) -> TestcaseForDecode:
  function run_flash_mla_sparse_fwd (line 310) | def run_flash_mla_sparse_fwd(p: TestParam, t: Testcase, return_p_sum: bo...
  function run_flash_mla_decode (line 319) | def run_flash_mla_decode(p: TestParam, t: TestcaseForDecode, tile_schedu...
  class FlopsAndMemVolStatistics (line 338) | class FlopsAndMemVolStatistics:
  function count_flop_and_mem_vol (line 345) | def count_flop_and_mem_vol(p: TestParam, t: Testcase) -> FlopsAndMemVolS...
  class FlopsAndMemVolStatisticsForDecode (line 360) | class FlopsAndMemVolStatisticsForDecode:
  function count_flop_and_mem_vol_for_decode (line 367) | def count_flop_and_mem_vol_for_decode(p: TestParam, t: TestcaseForDecode...
  function is_no_cooldown (line 404) | def is_no_cooldown() -> bool:

FILE: tests/quant.py
  class FP8KVCacheLayout (line 6) | class FP8KVCacheLayout(enum.Enum):
    method get_meta (line 10) | def get_meta(self) -> Tuple[int, int, int, int, int]:
  function _cast_scale_inv_to_ue8m0 (line 17) | def _cast_scale_inv_to_ue8m0(scales_inv: torch.Tensor, out_dtype = torch...
  function quantize_k_cache (line 20) | def quantize_k_cache(
  function dequantize_k_cache (line 81) | def dequantize_k_cache(
  function abs_indices2indices_in_kvcache (line 126) | def abs_indices2indices_in_kvcache(

FILE: tests/ref.py
  function _merge_two_lse (line 7) | def _merge_two_lse(lse0: torch.Tensor, lse1: Optional[torch.Tensor], s_q...
  function ref_sparse_attn_fwd (line 19) | def ref_sparse_attn_fwd(p: TestParam, t: Testcase) -> Tuple[torch.Tensor...
  function ref_sparse_attn_decode (line 55) | def ref_sparse_attn_decode(

FILE: tests/test_flash_mla_dense_decoding.py
  class TestParam (line 13) | class TestParam:
  function generate_test_data (line 29) | def generate_test_data(t: TestParam) -> Tuple[torch.Tensor, torch.Tensor...
  function reference_torch (line 73) | def reference_torch(
  function test_flash_mla (line 145) | def test_flash_mla(t: TestParam):
  function main (line 195) | def main(torch_dtype):

FILE: tests/test_flash_mla_sparse_decoding.py
  function gen_testcase (line 23) | def gen_testcase() -> List[RawTestParam]:
  class Result (line 130) | class Result:
  function test_flash_mla (line 142) | def test_flash_mla(p: TestParam) -> Result:
  function main (line 236) | def main():

FILE: tests/test_flash_mla_sparse_prefill.py
  function run_test (line 14) | def run_test(p: TestParam) -> bool:

FILE: tests/test_fmha_sm100.py
  function get_window_size (line 10) | def get_window_size(causal, window):
  function get_attn_bias (line 18) | def get_attn_bias(s_q, s_k, causal, window):
  function sdpa (line 31) | def sdpa(query, key, value, attn_bias, softmax_scale=None):
  function sdpa_checkpoint (line 46) | def sdpa_checkpoint(*args, **kwargs):
  function test_flash_attention (line 50) | def test_flash_attention(b, mean_sq, mean_sk, varlen, h, h_k, d, dv, cau...