SYMBOL INDEX (484 symbols across 72 files) FILE: benchmark/bench_flash_mla.py function scaled_dot_product_attention (line 15) | def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal... function run_torch_mla (line 36) | def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size,... function run_flash_mla (line 63) | def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size,... function run_flash_infer (line 82) | def run_flash_infer(q, block_table, blocked_k, max_seqlen_pad, block_siz... function _mla_attn_kernel (line 136) | def _mla_attn_kernel( function _mla_attn (line 222) | def _mla_attn( function _mla_softmax_reducev_kernel (line 274) | def _mla_softmax_reducev_kernel( function _mla_softmax_reducev (line 323) | def _mla_softmax_reducev( function mla_decode_triton (line 346) | def mla_decode_triton( function run_flash_mla_triton (line 381) | def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, bloc... function compare_ab (line 410) | def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv... function compare_a (line 450) | def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, d... function get_args (line 493) | def get_args(): FILE: benchmark/visualize.py function parse_args (line 7) | def parse_args(): FILE: csrc/api/api.cpp function PYBIND11_MODULE (line 8) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { FILE: csrc/api/common.h function is_sm90a (line 21) | struct Arch { function int64_stride_to_int (line 44) | inline int int64_stride_to_int(int64_t orig_stride) { function get_enum_max (line 124) | constexpr std::size_t get_enum_max(){ function std (line 133) | constexpr std::string get_dynamic_enum_name(T value){ function virtual (line 171) | constexpr virtual inline std::span get_supported_feature... FILE: csrc/api/sparse_decode.h type class (line 14) | enum class type DecodeImplMeta (line 30) | struct DecodeImplMeta { FILE: csrc/api/sparse_fwd.h type class (line 12) | enum class function class (line 29) | class Fwd_Sm90_Impl : public FwdImplBase { function class (line 50) | class Fwd_Sm100_Head64_Impl : public FwdImplBase { function class (line 68) | class Fwd_Sm100_Head128_Impl : public FwdImplBase { function class (line 86) | class Fwd_Sm100_Head128_Small_TopK_Impl : public FwdImplBase { FILE: csrc/defines.h type int32x8_t (line 13) | struct int32x8_t { type float8 (line 17) | struct float8 { type bf16x8 (line 21) | struct bf16x8 { FILE: csrc/kerutils/include/kerutils/common/common.h function namespace (line 3) | namespace kerutils {} FILE: csrc/kerutils/include/kerutils/device/common.h type class (line 16) | enum class function PrefetchSize (line 25) | enum class PrefetchSize { FILE: csrc/kerutils/include/kerutils/host/host.h function namespace (line 15) | namespace kerutils { FILE: csrc/kerutils/include/kerutils/supplemental/torch_tensors.h function namespace (line 9) | namespace kerutils { FILE: csrc/params.h function ModelType (line 5) | enum class ModelType { type DenseAttnDecodeParams (line 19) | struct DenseAttnDecodeParams { // TODO Change name to DenseAttnDecodeParams type SparseAttnDecodeParams (line 63) | struct SparseAttnDecodeParams { type CombineParams (line 105) | struct CombineParams { type GetDecodeSchedMetaParams (line 127) | struct GetDecodeSchedMetaParams { type SparseAttnFwdParams (line 145) | struct SparseAttnFwdParams { function SparseAttnFwdMode (line 171) | enum class SparseAttnFwdMode { FILE: csrc/sm100/decode/head64/config.h function namespace (line 14) | namespace sm100::decode::head64 { FILE: csrc/sm100/decode/head64/kernel.h function namespace (line 5) | namespace sm100::decode::head64 { FILE: csrc/sm100/helpers.h function namespace (line 9) | namespace sm100 { FILE: csrc/sm100/prefill/dense/collective/fmha_common.hpp type cutlass::fmha::collective (line 37) | namespace cutlass::fmha::collective { function CUTE_DEVICE (line 42) | CUTE_DEVICE void gemm_reset_zero_acc(Atom& atom, TA const& tA, TB cons... function CUTE_DEVICE (line 56) | CUTE_DEVICE void gemm_zero_acc(Atom& atom, TA const& tA, TB const& tB,... function CUTE_DEVICE (line 62) | CUTE_DEVICE constexpr auto unstageSmemLayout(Layout const& layout, Sta... function CUTE_DEVICE (line 67) | CUTE_DEVICE T warp_uniform(T a) { FILE: csrc/sm100/prefill/dense/collective/fmha_fusion.hpp type cutlass::fmha::collective (line 37) | namespace cutlass::fmha::collective { type NoMask (line 41) | struct NoMask { method CUTLASS_DEVICE (line 43) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 53) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 63) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 73) | CUTLASS_DEVICE type ResidualMask (line 83) | struct ResidualMask : NoMask { method CUTLASS_DEVICE (line 88) | CUTLASS_DEVICE int get_masked_trip_count( method CUTLASS_DEVICE (line 100) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 114) | CUTLASS_DEVICE type ResidualMaskForBackward (line 135) | struct ResidualMaskForBackward : NoMask { method CUTLASS_DEVICE (line 140) | CUTLASS_DEVICE int get_masked_trip_count( method CUTLASS_DEVICE (line 152) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 166) | CUTLASS_DEVICE type CausalMask (line 191) | struct CausalMask : NoMask { method CUTLASS_DEVICE (line 198) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 218) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 234) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 244) | CUTLASS_DEVICE type CausalForBackwardMask (line 280) | struct CausalForBackwardMask : CausalMask, ResidualMaskForB... method CUTLASS_DEVICE (line 285) | CUTLASS_DEVICE type VariableLength (line 316) | struct VariableLength { method CUTE_HOST_DEVICE (line 321) | CUTE_HOST_DEVICE operator int() const { type is_variable_length_impl (line 326) | struct is_variable_length_impl : std::false_type {} type is_variable_length_impl (line 327) | struct is_variable_length_impl : std::true_type {} function CUTE_HOST_DEVICE (line 331) | CUTE_HOST_DEVICE function CUTE_HOST_DEVICE (line 345) | CUTE_HOST_DEVICE function CUTE_HOST_DEVICE (line 361) | CUTE_HOST_DEVICE type cute (line 386) | namespace cute { type is_integral (line 389) | struct is_integral : true_t... function CUTE_HOST_DEVICE (line 391) | CUTE_HOST_DEVICE FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp type cutlass::fmha::collective (line 38) | namespace cutlass::fmha::collective { type Sm100FmhaFwdEpilogueTmaWarpspecialized (line 48) | struct Sm100FmhaFwdEpilogueTmaWarpspecialized { type TensorStorage (line 64) | struct TensorStorage { type Arguments (line 71) | struct Arguments { type Params (line 86) | struct Params { method CUTLASS_DEVICE (line 96) | CUTLASS_DEVICE static constexpr method Params (line 107) | static Params to_underlying_arguments( method CUTLASS_DEVICE (line 145) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 152) | CUTLASS_DEVICE Sm100FmhaFwdEpilogueTmaWarpspecialized(const Params& ... method store (line 155) | CUTLASS_DEVICE auto FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp type cutlass::fmha::collective (line 44) | namespace cutlass::fmha::collective { type Sm100FmhaFwdMainloopTmaWarpspecialized (line 65) | struct Sm100FmhaFwdMainloopTmaWarpspecialized { type TensorStorage (line 113) | struct TensorStorage { type TmemAllocation (line 121) | enum class TmemAllocation : uint32_t { type Arguments (line 187) | struct Arguments { type Params (line 202) | struct Params { method can_implement (line 212) | static bool can_implement(ProblemShape const& problem_shape, Argumen... method Params (line 217) | static Params to_underlying_arguments( method CUTLASS_DEVICE (line 236) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 242) | CUTLASS_DEVICE void method mma (line 258) | CUTLASS_DEVICE auto method softmax_step (line 514) | CUTLASS_DEVICE auto method softmax (line 714) | CUTLASS_DEVICE auto method correction_epilogue (line 778) | CUTLASS_DEVICE auto method correction_rescale (line 868) | CUTLASS_DEVICE auto method correction (line 954) | CUTLASS_DEVICE auto method correction_empty (line 1142) | CUTLASS_DEVICE auto FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_load_tma_warpspecialized.hpp type cutlass::fmha::collective (line 42) | namespace cutlass::fmha::collective { type Sm100FmhaLoadTmaWarpspecialized (line 62) | struct Sm100FmhaLoadTmaWarpspecialized { type Arguments (line 67) | struct Arguments { type Params (line 80) | struct Params { method Params (line 87) | static Params to_underlying_arguments( method CUTLASS_DEVICE (line 141) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 149) | CUTLASS_DEVICE void FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_mla_fwd_mainloop_tma_warpspecialized.hpp type cutlass::fmha::collective (line 45) | namespace cutlass::fmha::collective { type Sm100MlaFwdMainloopTmaWarpspecialized (line 65) | struct Sm100MlaFwdMainloopTmaWarpspecialized { type TensorStorageQKVO (line 127) | struct TensorStorageQKVO { type TensorStorageQKV (line 134) | struct TensorStorageQKV { type TmemAllocation (line 142) | enum class TmemAllocation : uint32_t { type Arguments (line 205) | struct Arguments { type Params (line 220) | struct Params { method can_implement (line 230) | static bool can_implement(ProblemShape const& problem_shape, Argumen... method Params (line 235) | static Params to_underlying_arguments( method CUTLASS_DEVICE (line 254) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 260) | CUTLASS_DEVICE void method mma (line 276) | CUTLASS_DEVICE auto method softmax_step (line 532) | CUTLASS_DEVICE auto method softmax (line 735) | CUTLASS_DEVICE auto method correction_epilogue (line 786) | CUTLASS_DEVICE auto method correction_rescale (line 876) | CUTLASS_DEVICE auto method correction (line 962) | CUTLASS_DEVICE auto method correction_empty (line 1149) | CUTLASS_DEVICE auto FILE: csrc/sm100/prefill/dense/collective/sm100_fmha_mla_load_tma_warpspecialized.hpp type cutlass::fmha::collective (line 42) | namespace cutlass::fmha::collective { type Sm100MlaFwdLoadTmaWarpspecialized (line 63) | struct Sm100MlaFwdLoadTmaWarpspecialized { type Arguments (line 74) | struct Arguments { type Params (line 87) | struct Params { method Params (line 94) | static Params to_underlying_arguments( method CUTLASS_DEVICE (line 149) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 157) | CUTLASS_DEVICE void FILE: csrc/sm100/prefill/dense/common/gather_tensor.hpp type example (line 37) | namespace example { type NoGather (line 42) | struct NoGather method NoGather (line 45) | NoGather(Ts...) {} type IndexedGather (line 50) | struct IndexedGather method CUTE_HOST_DEVICE (line 52) | CUTE_HOST_DEVICE constexpr method CUTE_HOST_DEVICE (line 56) | CUTE_HOST_DEVICE constexpr method print (line 61) | void type StridedGather (line 72) | struct StridedGather method CUTE_HOST_DEVICE (line 74) | CUTE_HOST_DEVICE constexpr method CUTE_HOST_DEVICE (line 78) | CUTE_HOST_DEVICE constexpr method print (line 83) | void type CustomStride (line 95) | struct CustomStride method CUTE_HOST_DEVICE (line 101) | CUTE_HOST_DEVICE constexpr friend method CUTE_HOST_DEVICE (line 106) | CUTE_HOST_DEVICE constexpr friend method print (line 111) | void method CUTE_HOST_DEVICE (line 121) | CUTE_HOST_DEVICE constexpr friend method CUTE_HOST_DEVICE (line 130) | CUTE_HOST_DEVICE constexpr friend function make_custom_stride_layout (line 142) | CUTLASS_HOST_DEVICE function make_gather_tensor (line 155) | CUTLASS_HOST_DEVICE type cute (line 171) | namespace cute function CUTE_HOST_DEVICE (line 175) | CUTE_HOST_DEVICE constexpr function CUTE_HOST_DEVICE (line 195) | CUTE_HOST_DEVICE constexpr FILE: csrc/sm100/prefill/dense/common/pipeline_mla.hpp type cutlass (line 40) | namespace cutlass { class PipelineTmaAsyncMla (line 49) | class PipelineTmaAsyncMla { method CUTLASS_DEVICE (line 72) | static method CUTLASS_DEVICE (line 90) | static method CUTLASS_DEVICE (line 110) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 119) | CUTLASS_DEVICE method if (line 142) | if constexpr (cute::is_same_v) { method if (line 147) | if constexpr (cute::is_same_v) { function CUTLASS_DEVICE (line 171) | CUTLASS_DEVICE function CUTLASS_DEVICE (line 176) | CUTLASS_DEVICE function CUTLASS_DEVICE (line 198) | CUTLASS_DEVICE function CUTLASS_DEVICE (line 203) | CUTLASS_DEVICE function CUTLASS_DEVICE (line 208) | CUTLASS_DEVICE function CUTLASS_DEVICE (line 213) | CUTLASS_DEVICE function CUTLASS_DEVICE (line 228) | CUTLASS_DEVICE FILE: csrc/sm100/prefill/dense/common/pow_2.hpp type cutlass::fmha (line 39) | namespace cutlass::fmha { type Pow2 (line 41) | struct Pow2 { method CUTE_HOST_DEVICE (line 52) | CUTE_HOST_DEVICE T operator *(T const& b) const { function CUTE_HOST_DEVICE (line 77) | CUTE_HOST_DEVICE bool operator<(T const& a, Pow2 const& b) { function CUTE_HOST_DEVICE (line 81) | CUTE_HOST_DEVICE void print(Pow2 const& a) { type cute (line 87) | namespace cute { type is_integral (line 90) | struct is_integral : true_type {} FILE: csrc/sm100/prefill/dense/common/utils.hpp type cutlass_dtype (line 8) | struct cutlass_dtype { type cutlass_dtype (line 13) | struct cutlass_dtype { type cutlass_dtype (line 18) | struct cutlass_dtype { type cutlass_dtype<__nv_fp8_e4m3> (line 23) | struct cutlass_dtype<__nv_fp8_e4m3> { type cutlass_dtype<__nv_fp8_e5m2> (line 28) | struct cutlass_dtype<__nv_fp8_e5m2> { FILE: csrc/sm100/prefill/dense/device/fmha.hpp type cutlass::fmha::device (line 49) | namespace cutlass::fmha::device { class FMHA (line 56) | class FMHA { method is_initialized (line 72) | bool is_initialized(bool set = false) { method Params (line 81) | Params const& params() const { method Status (line 86) | static Status method get_workspace_size (line 97) | static size_t method dim3 (line 105) | static dim3 method maximum_active_blocks (line 111) | static int maximum_active_blocks(int /* smem_capacity */ = -1) { method Status (line 153) | Status method Status (line 190) | Status method Status (line 205) | static Status method Status (line 249) | Status method Status (line 259) | Status method Status (line 265) | Status method Status (line 271) | Status FILE: csrc/sm100/prefill/dense/device/fmha_device_bwd.hpp type cutlass::fmha::device (line 48) | namespace cutlass::fmha::device { class Sm100FmhaBwd (line 62) | class Sm100FmhaBwd { type Arguments (line 65) | struct Arguments { type Params (line 119) | struct Params { method to_sum_OdO_arguments (line 130) | static typename OperationSumOdO::Arguments to_sum_OdO_arguments( method to_convert_arguments (line 153) | static typename OperationConvert::Arguments to_convert_arguments(Arg... method to_bwd_arguments (line 172) | static typename Operation::Arguments to_bwd_arguments( method Status (line 197) | static Status method get_workspace_size (line 220) | static size_t method Status (line 237) | Status method Status (line 266) | Status method Status (line 286) | static Status method Status (line 319) | Status method Status (line 329) | Status FILE: csrc/sm100/prefill/dense/kernel/fmha_causal_tile_scheduler.hpp type cutlass::fmha::kernel (line 38) | namespace cutlass::fmha::kernel { type CausalIndividualTileScheduler (line 45) | struct CausalIndividualTileScheduler { type Params (line 51) | struct Params { method CUTLASS_DEVICE (line 62) | CUTLASS_DEVICE method Params (line 66) | static Params to_underlying_arguments( method dim3 (line 78) | static dim3 get_grid_shape(Params const& params) { method CUTLASS_DEVICE (line 82) | CUTLASS_DEVICE method get_block_coord (line 87) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 112) | CUTLASS_DEVICE type CausalPersistentTileScheduler (line 125) | struct CausalPersistentTileScheduler { type Params (line 127) | struct Params { method Params (line 143) | static Params to_underlying_arguments( method dim3 (line 168) | static dim3 get_grid_shape(Params const& params) { method CUTLASS_DEVICE (line 173) | CUTLASS_DEVICE method get_block_coord (line 178) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 189) | CUTLASS_DEVICE FILE: csrc/sm100/prefill/dense/kernel/fmha_kernel_bwd_convert.hpp type cutlass::fmha::kernel (line 39) | namespace cutlass::fmha::kernel { type FmhaKernelBwdConvert (line 44) | struct FmhaKernelBwdConvert { type Arguments (line 46) | struct Arguments { method get_workspace_size (line 77) | static size_t get_workspace_size(Arguments const& args) { return 0; } method initialize_workspace (line 78) | static cutlass::Status initialize_workspace(Arguments const&, void*,... method can_implement (line 88) | static bool can_implement(Arguments const& args) { method dim3 (line 92) | static dim3 get_grid_shape(Params const& params) { method dim3 (line 97) | static dim3 get_block_shape() { method Params (line 102) | static Params to_underlying_arguments(Arguments const& args, void* w... method CUTLASS_DEVICE (line 107) | CUTLASS_DEVICE void copy(Params const& params, const ElementAcc* ptr... method CUTLASS_DEVICE (line 141) | CUTLASS_DEVICE void operator()(const Params ¶ms, char* smem) { FILE: csrc/sm100/prefill/dense/kernel/fmha_kernel_bwd_sum_OdO.hpp type cutlass::fmha::kernel (line 39) | namespace cutlass::fmha::kernel { type FmhaKernelBwdSumOdO (line 44) | struct FmhaKernelBwdSumOdO { type Arguments (line 46) | struct Arguments { method get_workspace_size (line 76) | static size_t get_workspace_size(Arguments const& args) { return 0; } method initialize_workspace (line 77) | static cutlass::Status initialize_workspace(Arguments const&, void*,... method can_implement (line 89) | static bool can_implement(Arguments const& args) { method dim3 (line 93) | static dim3 get_grid_shape(Params const& params) { method dim3 (line 98) | static dim3 get_block_shape() { method Params (line 103) | static Params to_underlying_arguments(Arguments const& args, void* w... method CUTLASS_DEVICE (line 107) | CUTLASS_DEVICE void operator()(const Params ¶ms, char* smem) { FILE: csrc/sm100/prefill/dense/kernel/fmha_options.hpp type cutlass::fmha::kernel (line 38) | namespace cutlass::fmha::kernel { type find_option (line 41) | struct find_option type find_option (line 44) | struct find_option { type Tag (line 60) | enum class Tag { type Option (line 80) | struct Option { type find_option (line 49) | struct find_option : FILE: csrc/sm100/prefill/dense/kernel/fmha_tile_scheduler.hpp type cutlass::fmha::kernel (line 40) | namespace cutlass::fmha::kernel { type IndividualTileScheduler (line 44) | struct IndividualTileScheduler { type Params (line 46) | struct Params { method CUTLASS_DEVICE (line 52) | CUTLASS_DEVICE method Params (line 56) | static Params to_underlying_arguments( method dim3 (line 64) | static dim3 get_grid_shape(Params const& params) { method CUTLASS_DEVICE (line 68) | CUTLASS_DEVICE method get_block_coord (line 73) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 79) | CUTLASS_DEVICE type PersistentTileScheduler (line 88) | struct PersistentTileScheduler { type Params (line 90) | struct Params { method Params (line 106) | static Params to_underlying_arguments( method dim3 (line 131) | static dim3 get_grid_shape(Params const& params) { method CUTLASS_DEVICE (line 136) | CUTLASS_DEVICE method get_block_coord (line 141) | CUTLASS_DEVICE method CUTLASS_DEVICE (line 152) | CUTLASS_DEVICE FILE: csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp type cutlass::fmha::kernel (line 49) | namespace cutlass::fmha::kernel { type Sm100FmhaBwdKernelTmaWarpSpecialized (line 62) | struct Sm100FmhaBwdKernelTmaWarpSpecialized { type TmemAllocation (line 72) | struct TmemAllocation { type WarpRole (line 87) | enum class WarpRole { method CUTLASS_DEVICE (line 94) | CUTLASS_DEVICE WarpRole warp_idx_to_role(int warp_idx) { type RegisterAllocation (line 98) | struct RegisterAllocation { type PipelineStorage (line 204) | struct PipelineStorage { method CUTE_DEVICE (line 218) | static CUTE_DEVICE constexpr auto restage(Layout const& layout, Stag... type TensorStorage (line 242) | struct TensorStorage { type SharedStorage (line 271) | struct SharedStorage { type MainloopArguments (line 284) | struct MainloopArguments { type MainloopParams (line 316) | struct MainloopParams { type EpilogueArguments (line 324) | struct EpilogueArguments { type Arguments (line 331) | struct Arguments { type Params (line 338) | struct Params { method can_implement (line 347) | static bool can_implement(Arguments const& args) { method Status (line 360) | static Status initialize_workspace(Arguments const&, void*, cudaStre... method Params (line 365) | static Params to_underlying_arguments(Arguments const& args, void*) { method quantize (line 414) | static CUTLASS_DEVICE auto quantize(T const& input) { method CUTLASS_DEVICE (line 432) | CUTLASS_DEVICE void load( method CUTLASS_DEVICE (line 661) | CUTLASS_DEVICE void mma( method CUTLASS_DEVICE (line 946) | CUTLASS_DEVICE void store( method CUTLASS_DEVICE (line 971) | CUTLASS_DEVICE void epilogue_clear( method CUTLASS_DEVICE (line 1015) | CUTLASS_DEVICE void epilogue( method CUTLASS_DEVICE (line 1119) | CUTLASS_DEVICE void compute( method CUTLASS_DEVICE (line 1392) | CUTLASS_DEVICE void reduce( method CUTLASS_DEVICE (line 1489) | CUTLASS_DEVICE void operator()(Params const& params, char* smem) { method dim3 (line 1822) | static dim3 get_block_shape() { method dim3 (line 1827) | static dim3 get_grid_shape(Params const& params) { FILE: csrc/sm100/prefill/dense/kernel/sm100_fmha_bwd_mla_kernel_tma_warpspecialized.hpp type cutlass::fmha::kernel (line 49) | namespace cutlass::fmha::kernel { type Sm100FmhaBwdMlaKernelTmaWarpSpecialized (line 62) | struct Sm100FmhaBwdMlaKernelTmaWarpSpecialized { type TmemAllocation (line 70) | struct TmemAllocation { type WarpRole (line 85) | enum class WarpRole { method CUTLASS_DEVICE (line 95) | CUTLASS_DEVICE WarpRole warp_idx_to_role(int warp_idx) { type RegisterAllocation (line 99) | struct RegisterAllocation { type PipelineStorage (line 205) | struct PipelineStorage { method CUTE_DEVICE (line 219) | static CUTE_DEVICE constexpr auto restage(Layout const& layout, Stag... type TensorStorage (line 245) | struct TensorStorage { type SharedStorage (line 278) | struct SharedStorage { type MainloopArguments (line 291) | struct MainloopArguments { type MainloopParams (line 323) | struct MainloopParams { type EpilogueArguments (line 331) | struct EpilogueArguments { type Arguments (line 338) | struct Arguments { type Params (line 345) | struct Params { method can_implement (line 354) | static bool can_implement(Arguments const& args) { method Status (line 367) | static Status initialize_workspace(Arguments const&, void*, cudaStre... method Params (line 372) | static Params to_underlying_arguments(Arguments const& args, void*) { method quantize (line 421) | static CUTLASS_DEVICE auto quantize(T const& input) { method CUTLASS_DEVICE (line 439) | CUTLASS_DEVICE void load( method CUTLASS_DEVICE (line 667) | CUTLASS_DEVICE void mma( method CUTLASS_DEVICE (line 951) | CUTLASS_DEVICE void store( method CUTLASS_DEVICE (line 976) | CUTLASS_DEVICE void epilogue_clear( method CUTLASS_DEVICE (line 1021) | CUTLASS_DEVICE void epilogue( method CUTLASS_DEVICE (line 1125) | CUTLASS_DEVICE void compute( method CUTLASS_DEVICE (line 1386) | CUTLASS_DEVICE void reduce( method CUTLASS_DEVICE (line 1483) | CUTLASS_DEVICE void operator()(Params const& params, char* smem) { method dim3 (line 1816) | static dim3 get_block_shape() { method dim3 (line 1821) | static dim3 get_grid_shape(Params const& params) { FILE: csrc/sm100/prefill/dense/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp type cutlass::fmha::kernel (line 47) | namespace cutlass::fmha::kernel { type Sm100FmhaCtxKernelWarpspecializedSchedule (line 52) | struct Sm100FmhaCtxKernelWarpspecializedSchedule { type WarpRole (line 54) | enum class WarpRole { method WarpRole (line 64) | static constexpr WarpRole warp_idx_to_WarpRole(int warp_idx) { type Sm100MlaFwdCtxKernelWarpspecializedSchedule (line 91) | struct Sm100MlaFwdCtxKernelWarpspecializedSchedule { type WarpRole (line 93) | enum class WarpRole { method WarpRole (line 103) | static constexpr WarpRole warp_idx_to_WarpRole(int warp_idx) { type Sm100FmhaFwdKernelTmaWarpspecialized (line 136) | struct Sm100FmhaFwdKernelTmaWarpspecialized { method WarpRole (line 143) | constexpr WarpRole warp_idx_to_WarpRole(int warp_idx) { type SharedStorage (line 168) | struct SharedStorage { type PipelineStorage (line 188) | struct PipelineStorage { type Arguments (line 205) | struct Arguments { type Params (line 212) | struct Params { method get_workspace_size (line 223) | static size_t get_workspace_size(Arguments const& args) { return 0; } method initialize_workspace (line 224) | static cutlass::Status initialize_workspace(Arguments const&, void*,... method can_implement (line 228) | static bool can_implement(Arguments const& args) { method dim3 (line 232) | static dim3 get_grid_shape(Params const& params) { method dim3 (line 236) | static dim3 get_block_shape() { method Params (line 241) | static Params to_underlying_arguments(Arguments const& args, void* w... method apply_batch (line 250) | CUTLASS_DEVICE auto apply_batch(const Params ¶ms, ProblemShape c... method CUTLASS_DEVICE (line 254) | CUTLASS_DEVICE void operator()(const Params ¶ms, char* smem) { FILE: csrc/sm100/prefill/sparse/common_subroutine.h function namespace (line 6) | namespace sm100 { FILE: csrc/sm100/prefill/sparse/fwd/head128/config.h function namespace (line 10) | namespace sm100::fwd::head128 { FILE: csrc/sm100/prefill/sparse/fwd/head128/phase1.h function namespace (line 5) | namespace sm100::fwd::head128 { FILE: csrc/sm100/prefill/sparse/fwd/head64/config.h function namespace (line 8) | namespace sm100::fwd::head64 { FILE: csrc/sm100/prefill/sparse/fwd/head64/phase1.h function namespace (line 5) | namespace sm100::fwd::head64 { FILE: csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/config.h function namespace (line 12) | namespace sm100::fwd_for_small_topk::head128 { FILE: csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/phase1.h function namespace (line 5) | namespace sm100::fwd_for_small_topk::head128 { FILE: csrc/sm90/decode/dense/config.h function namespace (line 3) | namespace Config { FILE: csrc/sm90/decode/dense/splitkv_mla.h function namespace (line 5) | namespace sm90 { FILE: csrc/sm90/decode/dense/traits.h type SharedMemoryPlan (line 71) | struct SharedMemoryPlan { type NamedBarriers (line 101) | enum NamedBarriers : int { FILE: csrc/sm90/decode/sparse_fp8/components/config.h function namespace (line 10) | namespace sm90::decode::sparse_fp8 { FILE: csrc/sm90/decode/sparse_fp8/components/dequant.h type fp8x8 (line 10) | struct fp8x8 { type fp8x16 (line 15) | struct fp8x16 { function bf16x8 (line 21) | bf16x8 cvt_fp8x8_bf16x8(const fp8x8 &inputs, const __nv_bfloat162 &scale... type class (line 36) | enum class function L2PrefetchHint (line 43) | enum class L2PrefetchHint { FILE: csrc/sm90/decode/sparse_fp8/components/helpers.h function namespace (line 10) | namespace sm90::decode::sparse_fp8 { function st_async_128b (line 79) | void st_async_128b(void* dst_ptr, const T& data, const transac_bar_t* mb... function CUTE_DEVICE (line 90) | CUTE_DEVICE FILE: csrc/sm90/decode/sparse_fp8/config.h function namespace (line 13) | namespace sm90::decode::sparse_fp8 { FILE: csrc/sm90/decode/sparse_fp8/splitkv_mla.h function namespace (line 5) | namespace sm90::decode::sparse_fp8 { FILE: csrc/sm90/helpers.h function namespace (line 6) | namespace sm90 { FILE: csrc/sm90/prefill/sparse/config.h function namespace (line 14) | namespace sm90::fwd { FILE: csrc/sm90/prefill/sparse/fwd.h function namespace (line 5) | namespace sm90 { FILE: csrc/sm90/prefill/sparse/phase1.h function namespace (line 5) | namespace sm90::fwd { FILE: csrc/smxx/decode/combine/combine.h function namespace (line 5) | namespace smxx::decode { FILE: csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.h function namespace (line 5) | namespace smxx::decode { FILE: csrc/utils.h type RingBufferState (line 58) | struct RingBufferState { function RingBufferState (line 75) | RingBufferState offset_by(const int offset) const { FILE: flash_mla/flash_mla_interface.py class FlashMLASchedMeta (line 9) | class FlashMLASchedMeta: class Config (line 15) | class Config: function get_mla_metadata (line 37) | def get_mla_metadata( function flash_mla_with_kvcache (line 53) | def flash_mla_with_kvcache( function flash_mla_sparse_fwd (line 176) | def flash_mla_sparse_fwd( function _flash_attn_varlen_forward (line 214) | def _flash_attn_varlen_forward( function _flash_attn_varlen_backward (line 261) | def _flash_attn_varlen_backward( class FlashAttnVarlenFunc (line 328) | class FlashAttnVarlenFunc(torch.autograd.Function): method forward (line 329) | def forward( method backward (line 356) | def backward( function flash_attn_varlen_func (line 372) | def flash_attn_varlen_func( function flash_attn_varlen_qkvpacked_func (line 395) | def flash_attn_varlen_qkvpacked_func( function flash_attn_varlen_kvpacked_func (line 415) | def flash_attn_varlen_kvpacked_func( FILE: setup.py function is_flag_set (line 16) | def is_flag_set(flag: str) -> bool: function get_features_args (line 19) | def get_features_args(): function get_arch_flags (line 25) | def get_arch_flags(): function get_nvcc_thread_args (line 48) | def get_nvcc_thread_args(): FILE: tests/kernelkit/bench.py class empty_suppress (line 9) | class empty_suppress: method __enter__ (line 10) | def __enter__(self): method __exit__ (line 13) | def __exit__(self, *_): function profiler_range_start_marker_kernel (line 17) | def profiler_range_start_marker_kernel(): function _run_profiler_range_start_marker_kernel (line 20) | def _run_profiler_range_start_marker_kernel(): class BenchKinetoRawResult (line 24) | class BenchKinetoRawResult: method _get_matched_kernel_name (line 33) | def _get_matched_kernel_name(self, name_substr: str, allow_no_match: b... method get_kernel_names (line 42) | def get_kernel_names(self) -> List[str]: method get_kernel_times (line 45) | def get_kernel_times(self, kernel_names_substr: List[str], allow_indiv... method get_kernel_time (line 74) | def get_kernel_time(self, kernel_name_substr: str) -> float: method get_e2e_time (line 77) | def get_e2e_time(self, start_kernel_name_substr: str, end_kenrel_name_... function bench_kineto (line 103) | def bench_kineto(fn: Callable, num_tests: int = 30, function bench_by_cuda_events (line 161) | def bench_by_cuda_events(kernels: List[Callable], num_warmups_each: int,... function bench_by_cuda_events (line 164) | def bench_by_cuda_events(kernels: Callable, num_warmups_each: int, num_r... function bench_by_cuda_events (line 166) | def bench_by_cuda_events(kernels: Union[List[Callable], Callable], num_w... FILE: tests/kernelkit/compare.py function check_is_bitwise_equal_comparator (line 5) | def check_is_bitwise_equal_comparator(ans: torch.Tensor, ref: torch.Tens... function check_is_bitwise_equal (line 13) | def check_is_bitwise_equal(name: str, ans: torch.Tensor, ref: torch.Tens... function get_cos_diff (line 19) | def get_cos_diff(ans: torch.Tensor, ref: torch.Tensor) -> float: function check_is_allclose (line 31) | def check_is_allclose(name: str, ans: torch.Tensor, ref: torch.Tensor, a... function check_is_allclose_comparator (line 94) | def check_is_allclose_comparator(name: str, ans: torch.Tensor, ref: torc... FILE: tests/kernelkit/generate.py function _get_new_non_contiguous_tensor_shape (line 3) | def _get_new_non_contiguous_tensor_shape(shape): function gen_non_contiguous_randn_tensor (line 10) | def gen_non_contiguous_randn_tensor(shape, *args, **kwargs): function gen_non_contiguous_tensor (line 16) | def gen_non_contiguous_tensor(shape, *args, **kwargs): function non_contiguousify (line 22) | def non_contiguousify(tensor: torch.Tensor) -> torch.Tensor: FILE: tests/kernelkit/precision.py class LowPrecisionMode (line 5) | class LowPrecisionMode: method __init__ (line 6) | def __init__(self, enabled: bool = True): method __enter__ (line 9) | def __enter__(self): method __exit__ (line 13) | def __exit__(self, exc_type, exc_value, traceback): function is_low_precision_mode (line 17) | def is_low_precision_mode() -> bool: function optional_cast_to_bf16_and_cast_back (line 23) | def optional_cast_to_bf16_and_cast_back(tensor: torch.Tensor) -> torch.T... FILE: tests/kernelkit/utils.py function cdiv (line 18) | def cdiv(a: int, b: int) -> int: function is_using_profiling_tools (line 22) | def is_using_profiling_tools() -> bool: function set_random_seed (line 33) | def set_random_seed(seed: int): class Counter (line 44) | class Counter: method __init__ (line 45) | def __init__(self): method next (line 48) | def next(self) -> int: FILE: tests/lib.py class TestTarget (line 13) | class TestTarget(enum.Enum): class ExtraTestParamForDecode (line 18) | class ExtraTestParamForDecode: class TestParam (line 29) | class TestParam: class RawTestParamForDecode (line 46) | class RawTestParamForDecode: method to_test_param (line 74) | def to_test_param(self) -> TestParam: class Testcase (line 90) | class Testcase: function _randperm_batch (line 100) | def _randperm_batch(batch_size: int, perm_range: torch.Tensor, perm_size... function generate_testcase (line 121) | def generate_testcase(t: TestParam) -> Testcase: class KVScope (line 168) | class KVScope: method quant_and_dequant_ (line 178) | def quant_and_dequant_(self): method get_kvcache_for_flash_mla (line 195) | def get_kvcache_for_flash_mla(self) -> torch.Tensor: method apply_perm (line 202) | def apply_perm(self, perm: torch.Tensor) -> "KVScope": class TestcaseForDecode (line 219) | class TestcaseForDecode: function generate_testcase_for_decode (line 227) | def generate_testcase_for_decode(t: TestParam) -> TestcaseForDecode: function run_flash_mla_sparse_fwd (line 310) | def run_flash_mla_sparse_fwd(p: TestParam, t: Testcase, return_p_sum: bo... function run_flash_mla_decode (line 319) | def run_flash_mla_decode(p: TestParam, t: TestcaseForDecode, tile_schedu... class FlopsAndMemVolStatistics (line 338) | class FlopsAndMemVolStatistics: function count_flop_and_mem_vol (line 345) | def count_flop_and_mem_vol(p: TestParam, t: Testcase) -> FlopsAndMemVolS... class FlopsAndMemVolStatisticsForDecode (line 360) | class FlopsAndMemVolStatisticsForDecode: function count_flop_and_mem_vol_for_decode (line 367) | def count_flop_and_mem_vol_for_decode(p: TestParam, t: TestcaseForDecode... function is_no_cooldown (line 404) | def is_no_cooldown() -> bool: FILE: tests/quant.py class FP8KVCacheLayout (line 6) | class FP8KVCacheLayout(enum.Enum): method get_meta (line 10) | def get_meta(self) -> Tuple[int, int, int, int, int]: function _cast_scale_inv_to_ue8m0 (line 17) | def _cast_scale_inv_to_ue8m0(scales_inv: torch.Tensor, out_dtype = torch... function quantize_k_cache (line 20) | def quantize_k_cache( function dequantize_k_cache (line 81) | def dequantize_k_cache( function abs_indices2indices_in_kvcache (line 126) | def abs_indices2indices_in_kvcache( FILE: tests/ref.py function _merge_two_lse (line 7) | def _merge_two_lse(lse0: torch.Tensor, lse1: Optional[torch.Tensor], s_q... function ref_sparse_attn_fwd (line 19) | def ref_sparse_attn_fwd(p: TestParam, t: Testcase) -> Tuple[torch.Tensor... function ref_sparse_attn_decode (line 55) | def ref_sparse_attn_decode( FILE: tests/test_flash_mla_dense_decoding.py class TestParam (line 13) | class TestParam: function generate_test_data (line 29) | def generate_test_data(t: TestParam) -> Tuple[torch.Tensor, torch.Tensor... function reference_torch (line 73) | def reference_torch( function test_flash_mla (line 145) | def test_flash_mla(t: TestParam): function main (line 195) | def main(torch_dtype): FILE: tests/test_flash_mla_sparse_decoding.py function gen_testcase (line 23) | def gen_testcase() -> List[RawTestParam]: class Result (line 130) | class Result: function test_flash_mla (line 142) | def test_flash_mla(p: TestParam) -> Result: function main (line 236) | def main(): FILE: tests/test_flash_mla_sparse_prefill.py function run_test (line 14) | def run_test(p: TestParam) -> bool: FILE: tests/test_fmha_sm100.py function get_window_size (line 10) | def get_window_size(causal, window): function get_attn_bias (line 18) | def get_attn_bias(s_q, s_k, causal, window): function sdpa (line 31) | def sdpa(query, key, value, attn_bias, softmax_scale=None): function sdpa_checkpoint (line 46) | def sdpa_checkpoint(*args, **kwargs): function test_flash_attention (line 50) | def test_flash_attention(b, mean_sq, mean_sk, varlen, h, h_k, d, dv, cau...