SYMBOL INDEX (317 symbols across 52 files) FILE: dev/sr/src/stochastic_rounding.hpp type philox (line 9) | namespace philox { class PhiloxGenerator (line 18) | class PhiloxGenerator { FILE: dev/sr/tests/benchmark.py function measure_performance (line 8) | def measure_performance(func, input_tensor, warmup=0, repeats=1): function benchmark_sizes (line 27) | def benchmark_sizes(sizes= [1000, 10000, 100000, 1000000, 10000000, (100... function benchmark_shapes (line 59) | def benchmark_shapes(total_size=1000000): function stress_test (line 84) | def stress_test(duration=10): function memory_test (line 100) | def memory_test(max_size=1e9): function main (line 128) | def main(): FILE: dev/sr/tests/core_unit_tests.py class TestStochasticRounding (line 8) | class TestStochasticRounding(unittest.TestCase): method setup (line 9) | def setup(self): method _test_rounding_statistics_helper (line 14) | def _test_rounding_statistics_helper(self, value, lower_value, upper_v... method test_special_values (line 40) | def test_special_values(self): method test_small_values (line 59) | def test_small_values(self): method test_vectorized_loading (line 67) | def test_vectorized_loading(self): method test_large_values (line 81) | def test_large_values(self): method test_rounding_statistics (line 89) | def test_rounding_statistics(self): method test_rounding_statistics_2 (line 93) | def test_rounding_statistics_2(self): method test_rounding_statistics_small (line 97) | def test_rounding_statistics_small(self): method test_rounding_statistics_large (line 101) | def test_rounding_statistics_large(self): FILE: dev/triton_groupGEMM/groupgemm.py function early_config_prune (line 61) | def early_config_prune(configs, named_args, dtsize=None, dtype=None, **k... function _kernel_grouped_gemm (line 131) | def _kernel_grouped_gemm( function _kernel_grouped_gemm_fp8_rowwise (line 270) | def _kernel_grouped_gemm_fp8_rowwise( function _grouped_gemm (line 407) | def _grouped_gemm( function grouped_gemm (line 518) | def grouped_gemm( function grouped_gemm_fp8_rowwise (line 524) | def grouped_gemm_fp8_rowwise( FILE: dev/triton_groupGEMM/testing/base_testing.py class TestGroupedGEMM (line 39) | class TestGroupedGEMM(unittest.TestCase): method setUp (line 40) | def setUp(self) -> None: method test_grouped_gemm_bf16 (line 98) | def test_grouped_gemm_bf16(self) -> None: FILE: dev/triton_groupGEMM/testing/unit_tests.py class TestGroupedGEMM (line 23) | class TestGroupedGEMM(unittest.TestCase): method test_grouped_gemm_bf16 (line 24) | def test_grouped_gemm_bf16(self) -> None: method test_grouped_gemm_bf16_various_dimensions (line 63) | def test_grouped_gemm_bf16_various_dimensions(self) -> None: method test_grouped_gemm_bf16_edge_cases (line 105) | def test_grouped_gemm_bf16_edge_cases(self) -> None: method test_grouped_gemm_bf16_invalid_inputs (line 168) | def test_grouped_gemm_bf16_invalid_inputs(self) -> None: method test_grouped_gemm_bf16_deterministic (line 214) | def test_grouped_gemm_bf16_deterministic(self) -> None: method test_grouped_gemm_bf16_large_matrices (line 235) | def test_grouped_gemm_bf16_large_matrices(self) -> None: FILE: dev/triton_groupGEMM/tma_utils.py function map_dtype_to_triton (line 18) | def map_dtype_to_triton(dtype: torch.dtype) -> tl.dtype: class TmaAutoTuneHelper (line 58) | class TmaAutoTuneHelper: class KernelParamWrapper (line 61) | class KernelParamWrapper: method __init__ (line 62) | def __init__(self, desc): method tma_desc_cpu_ptr (line 65) | def tma_desc_cpu_ptr(self): method __init__ (line 70) | def __init__(self): method init_tma_descriptor (line 83) | def init_tma_descriptor(self, name): method fill_1d_tma_descriptor (line 94) | def fill_1d_tma_descriptor(self, name, ptr, dim, block_dim, element_si... method fill_2d_tma_descriptor (line 110) | def fill_2d_tma_descriptor( method get_tma_descriptor_kernel_param (line 127) | def get_tma_descriptor_kernel_param(self, name): FILE: dev/triton_groupGEMM/triton_tutorial_groupgemm.py function is_cuda (line 41) | def is_cuda(): function supports_tma (line 45) | def supports_tma(): function num_sms (line 49) | def num_sms(): function grouped_matmul_kernel (line 109) | def grouped_matmul_kernel( function group_gemm_fn (line 187) | def group_gemm_fn(group_A, group_B): function grouped_matmul_tma_kernel (line 250) | def grouped_matmul_tma_kernel( function group_gemm_tma_fn (line 346) | def group_gemm_tma_fn(group_A, group_B): function triton_perf_fn (line 433) | def triton_perf_fn(a_ptrs, b_ptrs, c_ptrs, sizes, lds, group_size): function triton_tma_perf_fn (line 445) | def triton_tma_perf_fn(a_ptrs, b_ptrs, c_ptrs, sizes, lds, group_size, d... function torch_perf_fn (line 459) | def torch_perf_fn(group_A, group_B): function benchmark_square_matrices (line 484) | def benchmark_square_matrices(N, provider): function benchmark_batches (line 567) | def benchmark_batches(M, provider): FILE: kernels/MoE/group_GEMM/triton/testing/fast_verification.py function test_backward_pass (line 23) | def test_backward_pass(): FILE: kernels/MoE/group_GEMM/triton/testing/pytorch_reference_backwards.py function _compute_grad_x_pytorch (line 15) | def _compute_grad_x_pytorch(grad_output, w, m_sizes, grad_x): function _compute_grad_w_pytorch (line 68) | def _compute_grad_w_pytorch(grad_output, x, m_sizes, grad_w): function _pytorch_fallback_backward (line 139) | def _pytorch_fallback_backward(grad_output, x, w, m_sizes): function _pytorch_reference_backward (line 165) | def _pytorch_reference_backward(grad_output, x, w, m_sizes): FILE: kernels/MoE/group_GEMM/triton/tgroup_gemm_backwards.py function _kernel_grouped_gemm_backward_x_scheduled (line 28) | def _kernel_grouped_gemm_backward_x_scheduled( function _kernel_grouped_gemm_backward_w_scheduled (line 202) | def _kernel_grouped_gemm_backward_w_scheduled( function grouped_gemm_backward (line 382) | def grouped_gemm_backward( FILE: kernels/MoE/group_GEMM/triton/tgroup_gemm_forward.py function _kernel_grouped_gemm (line 137) | def _kernel_grouped_gemm( function _kernel_grouped_gemm_fp8_rowwise (line 312) | def _kernel_grouped_gemm_fp8_rowwise( function _grouped_gemm (line 485) | def _grouped_gemm( function grouped_gemm_forward (line 626) | def grouped_gemm_forward( function grouped_gemm_fp8_rowwise (line 632) | def grouped_gemm_fp8_rowwise( FILE: kernels/MoE/group_GEMM/triton/utils/tma_utils.py function map_dtype_to_triton (line 18) | def map_dtype_to_triton(dtype: torch.dtype) -> tl.dtype: class TmaAutoTuneHelper (line 58) | class TmaAutoTuneHelper: class KernelParamWrapper (line 61) | class KernelParamWrapper: method __init__ (line 62) | def __init__(self, desc): method tma_desc_cpu_ptr (line 65) | def tma_desc_cpu_ptr(self): method __init__ (line 70) | def __init__(self): method init_tma_descriptor (line 83) | def init_tma_descriptor(self, name): method fill_1d_tma_descriptor (line 94) | def fill_1d_tma_descriptor(self, name, ptr, dim, block_dim, element_si... method fill_2d_tma_descriptor (line 110) | def fill_2d_tma_descriptor( method get_tma_descriptor_kernel_param (line 127) | def get_tma_descriptor_kernel_param(self, name): FILE: kernels/blackwell/cute_gemm_01/driver.py function sm100_gemm_f16 (line 17) | def sm100_gemm_f16(A, B, C=None, alpha=1.0, beta=0.0): function benchmark_sm100_vs_torch (line 70) | def benchmark_sm100_vs_torch( FILE: kernels/blackwell/cute_gemm_01/sm100_gemm_pytorch.cpp function is_sm100_supported (line 11) | bool is_sm100_supported() { function check_sm100_device (line 20) | bool check_sm100_device() { function sm100_gemm_f16 (line 34) | torch::Tensor sm100_gemm_f16(const torch::Tensor &A, const torch::Tensor... function get_device_info (line 109) | torch::Tensor get_device_info() { function get_aligned_shape (line 128) | std::vector get_aligned_shape(int64_t M, int64_t N, int64_t K) { function create_aligned_tensor (line 137) | torch::Tensor create_aligned_tensor(const std::vector &shape, function PYBIND11_MODULE (line 156) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { FILE: kernels/blackwell/cute_gemm_02_tma/driver.py function check_sm100_compatibility (line 14) | def check_sm100_compatibility(): function sm100_gemm_f16_tma (line 38) | def sm100_gemm_f16_tma(A, B, C=None, alpha=1.0, beta=0.0, check_alignmen... function create_aligned_tensors (line 114) | def create_aligned_tensors( function pad_to_aligned (line 132) | def pad_to_aligned(tensor, target_shape=None, dim_requirements=None): function unpad_result (line 175) | def unpad_result(tensor, padding_info): function benchmark_sm100_vs_torch (line 181) | def benchmark_sm100_vs_torch( class SM100LinearTMA (line 284) | class SM100LinearTMA(torch.nn.Module): method __init__ (line 289) | def __init__(self, in_features, out_features, bias=True, device="cuda"): method forward (line 324) | def forward(self, x): function benchmark_tma_vs_cooperative_copy (line 363) | def benchmark_tma_vs_cooperative_copy(M=512, N=1024, K=256, num_trials=50): function stress_test_large_matrices (line 377) | def stress_test_large_matrices(): function check_sm100_compatibility (line 528) | def check_sm100_compatibility(): function sm100_gemm_f16 (line 552) | def sm100_gemm_f16(A, B, C=None, alpha=1.0, beta=0.0, check_alignment=Tr... function create_aligned_tensors (line 623) | def create_aligned_tensors( function pad_to_aligned (line 641) | def pad_to_aligned(tensor, target_shape=None, dim_requirements=None): function unpad_result (line 684) | def unpad_result(tensor, padding_info): function benchmark_sm100_vs_torch (line 690) | def benchmark_sm100_vs_torch( class SM100Linear (line 781) | class SM100Linear(torch.nn.Module): method __init__ (line 786) | def __init__(self, in_features, out_features, bias=True, device="cuda"): method forward (line 820) | def forward(self, x): FILE: kernels/blackwell/cute_gemm_02_tma/sm100_gemm_pytorch.cpp function is_sm100_supported (line 11) | bool is_sm100_supported() { function check_sm100_device (line 20) | bool check_sm100_device() { function sm100_gemm_f16 (line 34) | torch::Tensor sm100_gemm_f16(const torch::Tensor &A, const torch::Tensor... function get_device_info (line 109) | torch::Tensor get_device_info() { function get_aligned_shape (line 128) | std::vector get_aligned_shape(int64_t M, int64_t N, int64_t K) { function create_aligned_tensor (line 137) | torch::Tensor create_aligned_tensor(const std::vector &shape, function PYBIND11_MODULE (line 156) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { FILE: kernels/cuda/cutlass_gemm/broadcast_load_epilogue_c3x.hpp type cutlass::epilogue::fusion (line 60) | namespace cutlass::epilogue::fusion { type Sm90RowOrScalarBroadcast (line 75) | struct Sm90RowOrScalarBroadcast { type SharedStorage (line 82) | struct SharedStorage { type Arguments (line 89) | struct Arguments { method Params (line 98) | static constexpr Params method get_workspace_size (line 104) | static size_t method initialize_workspace (line 110) | static cutlass::Status method CUTLASS_HOST_DEVICE (line 116) | CUTLASS_HOST_DEVICE method CUTLASS_DEVICE (line 127) | CUTLASS_DEVICE bool method CUTLASS_DEVICE (line 132) | CUTLASS_DEVICE bool method CUTLASS_DEVICE (line 137) | CUTLASS_DEVICE bool type ProducerLoadCallbacks (line 143) | struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks { method CUTLASS_DEVICE (line 154) | CUTLASS_DEVICE void method get_producer_load_callbacks (line 174) | CUTLASS_DEVICE auto type ConsumerStoreCallbacks (line 191) | struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { method CUTLASS_DEVICE (line 202) | CUTLASS_DEVICE void method CUTLASS_DEVICE (line 218) | CUTLASS_DEVICE Array method get_consumer_store_callbacks (line 235) | CUTLASS_DEVICE auto type Sm90ColOrScalarBroadcast (line 261) | struct Sm90ColOrScalarBroadcast { type SharedStorage (line 269) | struct SharedStorage { } type Arguments (line 274) | struct Arguments { method Params (line 283) | static constexpr Params method get_workspace_size (line 289) | static size_t method initialize_workspace (line 295) | static cutlass::Status method CUTLASS_DEVICE (line 301) | CUTLASS_DEVICE bool method CUTLASS_DEVICE (line 306) | CUTLASS_DEVICE bool method CUTLASS_DEVICE (line 311) | CUTLASS_DEVICE bool method CUTLASS_HOST_DEVICE (line 316) | CUTLASS_HOST_DEVICE method CUTLASS_HOST_DEVICE (line 319) | CUTLASS_HOST_DEVICE method get_producer_load_callbacks (line 326) | CUTLASS_DEVICE auto type ConsumerStoreCallbacks (line 332) | struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { method CUTLASS_DEVICE (line 343) | CUTLASS_DEVICE void method CUTLASS_DEVICE (line 356) | CUTLASS_DEVICE Array method get_consumer_store_callbacks (line 375) | CUTLASS_DEVICE auto FILE: kernels/cuda/cutlass_gemm/common.hpp function next_pow_2 (line 15) | inline uint32_t next_pow_2(uint32_t const num) { function get_cuda_max_shared_memory_per_block_opt_in (line 20) | inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { FILE: kernels/cuda/cutlass_gemm/cutlass.cpp function cutlass_scaled_mm (line 7) | torch::Tensor cutlass_scaled_mm(torch::Tensor a, torch::Tensor b, torch:... function PYBIND11_MODULE (line 17) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { FILE: kernels/cuda/inference/hadamard_transform/hadamard_transform.cpp function is_power_of_two (line 11) | constexpr bool is_power_of_two(uint32_t x) { function hadamard_transform (line 15) | torch::Tensor hadamard_transform(at::Tensor& in, bool inplace) { function PYBIND11_MODULE (line 58) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { FILE: kernels/cuda/inference/hadamard_transform/test.py function get_scale (line 24) | def get_scale(size): function truth_hadamard_transform_inplace (line 34) | def truth_hadamard_transform_inplace(a: torch.Tensor, truth_hadamards): function test_hadamard_transform_inplace_rowmajor (line 42) | def test_hadamard_transform_inplace_rowmajor(a: torch.Tensor): function check_correctness (line 48) | def check_correctness(m, elem_c, a, result, truth, atol=1e-2, rtol=0): FILE: kernels/needs_perf_help/fp8_gemm_bench.py function bench (line 28) | def bench(cuda_graph: bool, rowwise_tma: bool=True) -> None: function bf16_bench (line 107) | def bf16_bench(x: Tensor, w: Tensor) -> Callable[[], Tensor]: function scale_row_bench (line 114) | def scale_row_bench(x: Tensor, w: Tensor) -> Callable[[], Tensor]: function row_gemm_bench (line 136) | def row_gemm_bench(x: Tensor, w: Tensor) -> Callable[[], Tensor]: function row_gemm_bench_tma (line 159) | def row_gemm_bench_tma(x: Tensor, w: Tensor) -> Callable[[], Tensor]: FILE: kernels/needs_perf_help/fp8_rowwise_tma_persistent.py function get_fp8_constants (line 27) | def get_fp8_constants() -> Tuple[torch.dtype, tl.dtype, float, float]: function convert_fp8_type (line 46) | def convert_fp8_type(tensor, dtype) -> triton.TensorWrapper: function init_to_zero (line 60) | def init_to_zero(name): function get_configs_io_bound (line 64) | def get_configs_io_bound() -> List[Config]: function _kernel_matmul_fp8_row_tma_persistent (line 108) | def _kernel_matmul_fp8_row_tma_persistent( function matmul_fp8_row (line 235) | def matmul_fp8_row( FILE: kernels/triton/inference/col_major_moe_gemm/perf_test_moe.py function torch_moe (line 16) | def torch_moe(a, w1, w2, topk_weight, topk_ids): function test_fused_moe (line 33) | def test_fused_moe( function benchmark (line 99) | def benchmark(m, provider): FILE: kernels/triton/inference/col_major_moe_gemm/profile_moe.py function torch_moe (line 15) | def torch_moe(a, w1, w2, topk_weight, topk_ids): function test_fused_moe (line 32) | def test_fused_moe( FILE: kernels/triton/inference/col_major_moe_gemm/test_moe_gemm.py function torch_moe (line 16) | def torch_moe(a, w1, w2, topk_weight, topk_ids): function test_fused_moe (line 39) | def test_fused_moe( FILE: kernels/triton/inference/col_major_moe_gemm/v0_moe_fused.py function fused_moe_kernel (line 18) | def fused_moe_kernel( function moe_align_block_size (line 138) | def moe_align_block_size( function invoke_fused_moe_kernel (line 183) | def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.T... function fused_moe (line 222) | def fused_moe(hidden_states: torch.Tensor, FILE: kernels/triton/inference/col_major_moe_gemm/v1_moe_fused.py function grouped_launch (line 21) | def grouped_launch(pid, function fused_moe_kernel_splitk (line 39) | def fused_moe_kernel_splitk( function moe_align_block_size (line 150) | def moe_align_block_size( function invoke_fused_moe_kernel (line 195) | def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.T... function fused_moe (line 249) | def fused_moe(hidden_states: torch.Tensor, FILE: kernels/triton/inference/col_major_moe_gemm/v2_moe_fused.py function col_major (line 18) | def col_major(pid, function fused_moe_kernel (line 31) | def fused_moe_kernel( function moe_align_block_size (line 136) | def moe_align_block_size( function invoke_fused_moe_kernel (line 181) | def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.T... function fused_moe (line 220) | def fused_moe(hidden_states: torch.Tensor, FILE: kernels/triton/inference/flash_attention/stay_attention.py function stay_attention (line 7) | def stay_attention( function flash_fn (line 107) | def flash_fn(q, k, v): FILE: kernels/triton/inference/fp8/float8_groupwise_quant.py function _float8_groupwise_quant_kernel (line 21) | def _float8_groupwise_quant_kernel( function float8_groupwise_quantize (line 53) | def float8_groupwise_quantize(x: torch.Tensor, block_size=128): FILE: kernels/triton/inference/fp8/scaled_fp8_gemm.py function grouped_launch (line 10) | def grouped_launch(pid, function column_major (line 27) | def column_major(pid, function scaled_gemm_splitk (line 39) | def scaled_gemm_splitk(a_ptr, b_ptr, c_ptr, function scaled_mm_splitk (line 94) | def scaled_mm_splitk(a, b, scale_a: float=1.0, scale_b: float=1.0): FILE: kernels/triton/inference/fp8/splitk_gemm_fp8.py function grouped_launch (line 9) | def grouped_launch(pid, function col_major (line 27) | def col_major(pid, function gemm_split_k_kernel (line 40) | def gemm_split_k_kernel(a_ptr, b_ptr, c_ptr, function gemm_split_k (line 90) | def gemm_split_k(a, b): FILE: kernels/triton/inference/fp8/tma_gemm.py function gemm_kernel_tma (line 7) | def gemm_kernel_tma(a_desc_ptr, b_desc_ptr, c_desc_ptr, # function matmul (line 32) | def matmul(a, b, config=None): FILE: kernels/triton/inference/gptq/a100_qlinear.py function _a100_quantized_matmul (line 6) | def _a100_quantized_matmul(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr, class a100_qlinear (line 77) | class a100_qlinear(torch.autograd.Function): method forward (line 78) | def forward(ctx, a, b, scales, zeros): FILE: kernels/triton/inference/gptq/benchmark.py function benchmark_generation_speed (line 15) | def benchmark_generation_speed(model, tokenizer, prompt, batch_size, dev... function main (line 59) | def main(): FILE: kernels/triton/inference/gptq/h100_qlinear.py function _h100_quantized_matmul (line 7) | def _h100_quantized_matmul(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr, class h100_qlinear (line 87) | class h100_qlinear(torch.autograd.Function): method forward (line 88) | def forward(ctx, a, b, scales, zeros): FILE: kernels/triton/inference/gptq/mixtral/test_dequant_moe_gemm.py function torch_moe (line 9) | def torch_moe(a, w1, w2, topk_weight, topk_ids): function test_dequant_moe (line 25) | def test_dequant_moe( FILE: kernels/triton/inference/gptq/mixtral/w4a16_fused_dequant_gemm.py function print_tensor_dim (line 9) | def print_tensor_dim(tensor, str_name): function print_value (line 13) | def print_value(value): function grouped_launch (line 18) | def grouped_launch(pid, function col_major (line 36) | def col_major(pid, function w4a16_fused_moe_kernel (line 50) | def w4a16_fused_moe_kernel( function invoke_dequant_gemm_moe (line 153) | def invoke_dequant_gemm_moe(activations: torch.Tensor, function moe_align_block_size (line 211) | def moe_align_block_size( function dequant_gemm_moe (line 255) | def dequant_gemm_moe(hidden_states: torch.Tensor, FILE: kernels/triton/inference/gptq/small_benchmark_cuda_graphs.py function swizzle_tile (line 11) | def swizzle_tile(pid, function matmul_data_parallel_kernel (line 29) | def matmul_data_parallel_kernel(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr, class small_qlinear (line 109) | class small_qlinear(torch.autograd.Function): method forward (line 110) | def forward(ctx, a, b, scales, zeros): function matmul_split_k_kernel (line 161) | def matmul_split_k_kernel(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr, function matmul_split_k (line 228) | def matmul_split_k(a, b, scales, zeros): function make_tensor (line 281) | def make_tensor(M, N, dtype): function gen_quant4 (line 292) | def gen_quant4(m, n, groupsize=-1): FILE: kernels/triton/inference/gptq/splitk_dequant_gemm.py function swizzle_tile (line 7) | def swizzle_tile(pid, function matmul_split_k_kernel (line 24) | def matmul_split_k_kernel(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr, function matmul_split_k (line 91) | def matmul_split_k(a, b, scales, zeros): function make_tensor (line 143) | def make_tensor(M, N, dtype): FILE: kernels/triton/inference/mamba/causal_1d_conv/causal_1d_conv/causal_1d_conv.py function _causal_conv1d_fwd_kernel (line 27) | def _causal_conv1d_fwd_kernel( function causal_conv1d_fwd (line 121) | def causal_conv1d_fwd( class CausalConv1dFn (line 203) | class CausalConv1dFn(torch.autograd.Function): method forward (line 205) | def forward( method backward (line 259) | def backward(ctx, dout, *args): function causal_conv1d_fn (line 295) | def causal_conv1d_fn( FILE: kernels/triton/inference/mamba/causal_1d_conv/tests/test_causal_1d_conv.py function _undecorated_test_causal_conv1d (line 24) | def _undecorated_test_causal_conv1d( function causal_conv1d_ref (line 117) | def causal_conv1d_ref( function test_causal_conv1d (line 183) | def test_causal_conv1d( FILE: kernels/triton/inference/paged_attention/attention_triton.py function print_tensor_dim (line 14) | def print_tensor_dim(tensor, str_name): function print_value (line 20) | def print_value(value): function print_line (line 27) | def print_line(str_line): function paged_attention_v1 (line 33) | def paged_attention_v1( function paged_attention_triton_v1 (line 158) | def paged_attention_triton_v1( function paged_attention_v2 (line 206) | def paged_attention_v2( function paged_attention_triton_v2 (line 358) | def paged_attention_triton_v2( FILE: kernels/triton/inference/torch_compile/flash_backward.py class MetaData (line 43) | class MetaData(): method __init__ (line 55) | def __init__(self, sm_scale=1.0): method set_varlen_params (line 58) | def set_varlen_params(self, cu_seqlens_q, cu_seqlens_k): method need_bias (line 70) | def need_bias(self, bias, batch, nheads, seqlen_q, seqlen_k): method need_alibi (line 77) | def need_alibi(self, alibi_slopes, batch, nheads): method need_causal (line 84) | def need_causal(self): method need_dropout (line 87) | def need_dropout(dropout_p, return_encoded_softmax): method check_args (line 91) | def check_args(self, q, k, v, o): function cdiv_fn (line 120) | def cdiv_fn(x,y): function max_fn (line 124) | def max_fn(x, y): function dropout_offsets (line 128) | def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): function dropout_rng (line 134) | def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): function dropout_mask (line 140) | def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): function load_fn (line 146) | def load_fn(block_ptr, first, second, pad): function _attn_fwd_inner (line 158) | def _attn_fwd_inner( function attn_fwd (line 270) | def attn_fwd( function attention (line 553) | def attention(q, k, v, sm_scale): function _attn_bwd_preprocess (line 637) | def _attn_bwd_preprocess( function _bwd_kernel_dk_dv (line 691) | def _bwd_kernel_dk_dv( function _bwd_kernel_dq (line 761) | def _bwd_kernel_dq(dq, q, K, V, function _attn_bwd (line 821) | def _attn_bwd(Q, K, V, sm_scale, alibi_slopes, function flash_bwd (line 1016) | def flash_bwd(q: torch.Tensor, k: torch.Tensor, v:torch.Tensor, o: torch... function _ (line 1074) | def _(q, k, v, o, M, do): function flash (line 1083) | def flash(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, o: torch.Te... function _ (line 1159) | def _(q, k, v, o, M): function setup_context (line 1163) | def setup_context(ctx, inputs, output) -> torch.Tensor: function backward (line 1167) | def backward(ctx, do): function f (line 1187) | def f(q, k, v): FILE: kernels/triton/training/fused_softmax/softmax.py function _get_num_warps (line 18) | def _get_num_warps(block_size: int)-> int: function _softmax_kernel_fwd (line 27) | def _softmax_kernel_fwd( function _softmax_kernel_bwd (line 56) | def _softmax_kernel_bwd( class triton_softmax (line 93) | class triton_softmax(autograd.Function): method forward (line 95) | def forward(ctx, x): method backward (line 122) | def backward(ctx, grad_probs): FILE: kernels/triton/training/rms_norm/fused_rms_norm.py function _rms_norm_fwd_kernel (line 33) | def _rms_norm_fwd_kernel( function _rms_norm_bwd_kernel_sm (line 83) | def _rms_norm_bwd_kernel_sm( class ttt_RMSNorm (line 181) | class ttt_RMSNorm(torch.autograd.Function): method forward (line 183) | def forward(ctx, x, weight, eps): method backward (line 225) | def backward(ctx, dy): function fused_rms_norm_fn (line 313) | def fused_rms_norm_fn( class FusedRMSNorm (line 325) | class FusedRMSNorm(torch.nn.Module): method __init__ (line 326) | def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, ... method reset_parameters (line 333) | def reset_parameters(self): method forward (line 336) | def forward( FILE: tutorials/triton/kernels/fused_softmax.py function _get_num_warps (line 15) | def _get_num_warps(block_size: int)-> int: function _softmax_kernel_fwd (line 24) | def _softmax_kernel_fwd( function _softmax_kernel_bwd (line 53) | def _softmax_kernel_bwd( class triton_softmax (line 90) | class triton_softmax(autograd.Function): method forward (line 92) | def forward(ctx, x): method backward (line 119) | def backward(ctx, grad_probs): FILE: tutorials/triton/kernels/vector_add.py function kernel_vector_addition (line 9) | def kernel_vector_addition(a_ptr, b_ptr, out_ptr, function ceil_div (line 24) | def ceil_div(x: int, y: int)-> int: function vector_addition (line 27) | def vector_addition(a: torch.tensor, b: torch.tensor)-> torch.tensor: FILE: tutorials/triton/tests/test_softmax.py function set_seed (line 13) | def set_seed(): class TestForwardSoftMax (line 18) | class TestForwardSoftMax: method test_forward_2D_float32 (line 20) | def test_forward_2D_float32(self,): method test_forward_2D_bfloat16 (line 36) | def test_forward_2D_bfloat16(self,): method test_forward_3D_bfloat16 (line 51) | def test_forward_3D_bfloat16(self,): class TestBackwardSoftMax (line 70) | class TestBackwardSoftMax: method test_backward_2D (line 72) | def test_backward_2D(self,): method test_bwd_3D (line 96) | def test_bwd_3D(self,): FILE: tutorials/triton/tests/test_utils.py function assert_expected (line 10) | def assert_expected( function set_rng_seed (line 26) | def set_rng_seed(seed): function gpu_test (line 31) | def gpu_test(gpu_count: int = 1):