SYMBOL INDEX (317 symbols across 52 files)

FILE: dev/sr/src/stochastic_rounding.hpp
  type philox (line 9) | namespace philox {
  class PhiloxGenerator (line 18) | class PhiloxGenerator {

FILE: dev/sr/tests/benchmark.py
  function measure_performance (line 8) | def measure_performance(func, input_tensor, warmup=0, repeats=1):
  function benchmark_sizes (line 27) | def benchmark_sizes(sizes= [1000, 10000, 100000, 1000000, 10000000, (100...
  function benchmark_shapes (line 59) | def benchmark_shapes(total_size=1000000):
  function stress_test (line 84) | def stress_test(duration=10):
  function memory_test (line 100) | def memory_test(max_size=1e9):
  function main (line 128) | def main():

FILE: dev/sr/tests/core_unit_tests.py
  class TestStochasticRounding (line 8) | class TestStochasticRounding(unittest.TestCase):
    method setup (line 9) | def setup(self):
    method _test_rounding_statistics_helper (line 14) | def _test_rounding_statistics_helper(self, value, lower_value, upper_v...
    method test_special_values (line 40) | def test_special_values(self):
    method test_small_values (line 59) | def test_small_values(self):
    method test_vectorized_loading (line 67) | def test_vectorized_loading(self):
    method test_large_values (line 81) | def test_large_values(self):
    method test_rounding_statistics (line 89) | def test_rounding_statistics(self):
    method test_rounding_statistics_2 (line 93) | def test_rounding_statistics_2(self):
    method test_rounding_statistics_small (line 97) | def test_rounding_statistics_small(self):
    method test_rounding_statistics_large (line 101) | def test_rounding_statistics_large(self):

FILE: dev/triton_groupGEMM/groupgemm.py
  function early_config_prune (line 61) | def early_config_prune(configs, named_args, dtsize=None, dtype=None, **k...
  function _kernel_grouped_gemm (line 131) | def _kernel_grouped_gemm(
  function _kernel_grouped_gemm_fp8_rowwise (line 270) | def _kernel_grouped_gemm_fp8_rowwise(
  function _grouped_gemm (line 407) | def _grouped_gemm(
  function grouped_gemm (line 518) | def grouped_gemm(
  function grouped_gemm_fp8_rowwise (line 524) | def grouped_gemm_fp8_rowwise(

FILE: dev/triton_groupGEMM/testing/base_testing.py
  class TestGroupedGEMM (line 39) | class TestGroupedGEMM(unittest.TestCase):
    method setUp (line 40) | def setUp(self) -> None:
    method test_grouped_gemm_bf16 (line 98) | def test_grouped_gemm_bf16(self) -> None:

FILE: dev/triton_groupGEMM/testing/unit_tests.py
  class TestGroupedGEMM (line 23) | class TestGroupedGEMM(unittest.TestCase):
    method test_grouped_gemm_bf16 (line 24) | def test_grouped_gemm_bf16(self) -> None:
    method test_grouped_gemm_bf16_various_dimensions (line 63) | def test_grouped_gemm_bf16_various_dimensions(self) -> None:
    method test_grouped_gemm_bf16_edge_cases (line 105) | def test_grouped_gemm_bf16_edge_cases(self) -> None:
    method test_grouped_gemm_bf16_invalid_inputs (line 168) | def test_grouped_gemm_bf16_invalid_inputs(self) -> None:
    method test_grouped_gemm_bf16_deterministic (line 214) | def test_grouped_gemm_bf16_deterministic(self) -> None:
    method test_grouped_gemm_bf16_large_matrices (line 235) | def test_grouped_gemm_bf16_large_matrices(self) -> None:

FILE: dev/triton_groupGEMM/tma_utils.py
  function map_dtype_to_triton (line 18) | def map_dtype_to_triton(dtype: torch.dtype) -> tl.dtype:
  class TmaAutoTuneHelper (line 58) | class TmaAutoTuneHelper:
    class KernelParamWrapper (line 61) | class KernelParamWrapper:
      method __init__ (line 62) | def __init__(self, desc):
      method tma_desc_cpu_ptr (line 65) | def tma_desc_cpu_ptr(self):
    method __init__ (line 70) | def __init__(self):
    method init_tma_descriptor (line 83) | def init_tma_descriptor(self, name):
    method fill_1d_tma_descriptor (line 94) | def fill_1d_tma_descriptor(self, name, ptr, dim, block_dim, element_si...
    method fill_2d_tma_descriptor (line 110) | def fill_2d_tma_descriptor(
    method get_tma_descriptor_kernel_param (line 127) | def get_tma_descriptor_kernel_param(self, name):

FILE: dev/triton_groupGEMM/triton_tutorial_groupgemm.py
  function is_cuda (line 41) | def is_cuda():
  function supports_tma (line 45) | def supports_tma():
  function num_sms (line 49) | def num_sms():
  function grouped_matmul_kernel (line 109) | def grouped_matmul_kernel(
  function group_gemm_fn (line 187) | def group_gemm_fn(group_A, group_B):
  function grouped_matmul_tma_kernel (line 250) | def grouped_matmul_tma_kernel(
  function group_gemm_tma_fn (line 346) | def group_gemm_tma_fn(group_A, group_B):
  function triton_perf_fn (line 433) | def triton_perf_fn(a_ptrs, b_ptrs, c_ptrs, sizes, lds, group_size):
  function triton_tma_perf_fn (line 445) | def triton_tma_perf_fn(a_ptrs, b_ptrs, c_ptrs, sizes, lds, group_size, d...
  function torch_perf_fn (line 459) | def torch_perf_fn(group_A, group_B):
  function benchmark_square_matrices (line 484) | def benchmark_square_matrices(N, provider):
  function benchmark_batches (line 567) | def benchmark_batches(M, provider):

FILE: kernels/MoE/group_GEMM/triton/testing/fast_verification.py
  function test_backward_pass (line 23) | def test_backward_pass():

FILE: kernels/MoE/group_GEMM/triton/testing/pytorch_reference_backwards.py
  function _compute_grad_x_pytorch (line 15) | def _compute_grad_x_pytorch(grad_output, w, m_sizes, grad_x):
  function _compute_grad_w_pytorch (line 68) | def _compute_grad_w_pytorch(grad_output, x, m_sizes, grad_w):
  function _pytorch_fallback_backward (line 139) | def _pytorch_fallback_backward(grad_output, x, w, m_sizes):
  function _pytorch_reference_backward (line 165) | def _pytorch_reference_backward(grad_output, x, w, m_sizes):

FILE: kernels/MoE/group_GEMM/triton/tgroup_gemm_backwards.py
  function _kernel_grouped_gemm_backward_x_scheduled (line 28) | def _kernel_grouped_gemm_backward_x_scheduled(
  function _kernel_grouped_gemm_backward_w_scheduled (line 202) | def _kernel_grouped_gemm_backward_w_scheduled(
  function grouped_gemm_backward (line 382) | def grouped_gemm_backward(

FILE: kernels/MoE/group_GEMM/triton/tgroup_gemm_forward.py
  function _kernel_grouped_gemm (line 137) | def _kernel_grouped_gemm(
  function _kernel_grouped_gemm_fp8_rowwise (line 312) | def _kernel_grouped_gemm_fp8_rowwise(
  function _grouped_gemm (line 485) | def _grouped_gemm(
  function grouped_gemm_forward (line 626) | def grouped_gemm_forward(
  function grouped_gemm_fp8_rowwise (line 632) | def grouped_gemm_fp8_rowwise(

FILE: kernels/MoE/group_GEMM/triton/utils/tma_utils.py
  function map_dtype_to_triton (line 18) | def map_dtype_to_triton(dtype: torch.dtype) -> tl.dtype:
  class TmaAutoTuneHelper (line 58) | class TmaAutoTuneHelper:
    class KernelParamWrapper (line 61) | class KernelParamWrapper:
      method __init__ (line 62) | def __init__(self, desc):
      method tma_desc_cpu_ptr (line 65) | def tma_desc_cpu_ptr(self):
    method __init__ (line 70) | def __init__(self):
    method init_tma_descriptor (line 83) | def init_tma_descriptor(self, name):
    method fill_1d_tma_descriptor (line 94) | def fill_1d_tma_descriptor(self, name, ptr, dim, block_dim, element_si...
    method fill_2d_tma_descriptor (line 110) | def fill_2d_tma_descriptor(
    method get_tma_descriptor_kernel_param (line 127) | def get_tma_descriptor_kernel_param(self, name):

FILE: kernels/blackwell/cute_gemm_01/driver.py
  function sm100_gemm_f16 (line 17) | def sm100_gemm_f16(A, B, C=None, alpha=1.0, beta=0.0):
  function benchmark_sm100_vs_torch (line 70) | def benchmark_sm100_vs_torch(

FILE: kernels/blackwell/cute_gemm_01/sm100_gemm_pytorch.cpp
  function is_sm100_supported (line 11) | bool is_sm100_supported() {
  function check_sm100_device (line 20) | bool check_sm100_device() {
  function sm100_gemm_f16 (line 34) | torch::Tensor sm100_gemm_f16(const torch::Tensor &A, const torch::Tensor...
  function get_device_info (line 109) | torch::Tensor get_device_info() {
  function get_aligned_shape (line 128) | std::vector<int64_t> get_aligned_shape(int64_t M, int64_t N, int64_t K) {
  function create_aligned_tensor (line 137) | torch::Tensor create_aligned_tensor(const std::vector<int64_t> &shape,
  function PYBIND11_MODULE (line 156) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: kernels/blackwell/cute_gemm_02_tma/driver.py
  function check_sm100_compatibility (line 14) | def check_sm100_compatibility():
  function sm100_gemm_f16_tma (line 38) | def sm100_gemm_f16_tma(A, B, C=None, alpha=1.0, beta=0.0, check_alignmen...
  function create_aligned_tensors (line 114) | def create_aligned_tensors(
  function pad_to_aligned (line 132) | def pad_to_aligned(tensor, target_shape=None, dim_requirements=None):
  function unpad_result (line 175) | def unpad_result(tensor, padding_info):
  function benchmark_sm100_vs_torch (line 181) | def benchmark_sm100_vs_torch(
  class SM100LinearTMA (line 284) | class SM100LinearTMA(torch.nn.Module):
    method __init__ (line 289) | def __init__(self, in_features, out_features, bias=True, device="cuda"):
    method forward (line 324) | def forward(self, x):
  function benchmark_tma_vs_cooperative_copy (line 363) | def benchmark_tma_vs_cooperative_copy(M=512, N=1024, K=256, num_trials=50):
  function stress_test_large_matrices (line 377) | def stress_test_large_matrices():
  function check_sm100_compatibility (line 528) | def check_sm100_compatibility():
  function sm100_gemm_f16 (line 552) | def sm100_gemm_f16(A, B, C=None, alpha=1.0, beta=0.0, check_alignment=Tr...
  function create_aligned_tensors (line 623) | def create_aligned_tensors(
  function pad_to_aligned (line 641) | def pad_to_aligned(tensor, target_shape=None, dim_requirements=None):
  function unpad_result (line 684) | def unpad_result(tensor, padding_info):
  function benchmark_sm100_vs_torch (line 690) | def benchmark_sm100_vs_torch(
  class SM100Linear (line 781) | class SM100Linear(torch.nn.Module):
    method __init__ (line 786) | def __init__(self, in_features, out_features, bias=True, device="cuda"):
    method forward (line 820) | def forward(self, x):

FILE: kernels/blackwell/cute_gemm_02_tma/sm100_gemm_pytorch.cpp
  function is_sm100_supported (line 11) | bool is_sm100_supported() {
  function check_sm100_device (line 20) | bool check_sm100_device() {
  function sm100_gemm_f16 (line 34) | torch::Tensor sm100_gemm_f16(const torch::Tensor &A, const torch::Tensor...
  function get_device_info (line 109) | torch::Tensor get_device_info() {
  function get_aligned_shape (line 128) | std::vector<int64_t> get_aligned_shape(int64_t M, int64_t N, int64_t K) {
  function create_aligned_tensor (line 137) | torch::Tensor create_aligned_tensor(const std::vector<int64_t> &shape,
  function PYBIND11_MODULE (line 156) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: kernels/cuda/cutlass_gemm/broadcast_load_epilogue_c3x.hpp
  type cutlass::epilogue::fusion (line 60) | namespace cutlass::epilogue::fusion {
    type Sm90RowOrScalarBroadcast (line 75) | struct Sm90RowOrScalarBroadcast {
      type SharedStorage (line 82) | struct SharedStorage {
      type Arguments (line 89) | struct Arguments {
      method Params (line 98) | static constexpr Params
      method get_workspace_size (line 104) | static size_t
      method initialize_workspace (line 110) | static cutlass::Status
      method CUTLASS_HOST_DEVICE (line 116) | CUTLASS_HOST_DEVICE
      method CUTLASS_DEVICE (line 127) | CUTLASS_DEVICE bool
      method CUTLASS_DEVICE (line 132) | CUTLASS_DEVICE bool
      method CUTLASS_DEVICE (line 137) | CUTLASS_DEVICE bool
      type ProducerLoadCallbacks (line 143) | struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks {
        method CUTLASS_DEVICE (line 154) | CUTLASS_DEVICE void
      method get_producer_load_callbacks (line 174) | CUTLASS_DEVICE auto
      type ConsumerStoreCallbacks (line 191) | struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
        method CUTLASS_DEVICE (line 202) | CUTLASS_DEVICE void
        method CUTLASS_DEVICE (line 218) | CUTLASS_DEVICE Array<Element, FragmentSize>
      method get_consumer_store_callbacks (line 235) | CUTLASS_DEVICE auto
    type Sm90ColOrScalarBroadcast (line 261) | struct Sm90ColOrScalarBroadcast {
      type SharedStorage (line 269) | struct SharedStorage { }
      type Arguments (line 274) | struct Arguments {
      method Params (line 283) | static constexpr Params
      method get_workspace_size (line 289) | static size_t
      method initialize_workspace (line 295) | static cutlass::Status
      method CUTLASS_DEVICE (line 301) | CUTLASS_DEVICE bool
      method CUTLASS_DEVICE (line 306) | CUTLASS_DEVICE bool
      method CUTLASS_DEVICE (line 311) | CUTLASS_DEVICE bool
      method CUTLASS_HOST_DEVICE (line 316) | CUTLASS_HOST_DEVICE
      method CUTLASS_HOST_DEVICE (line 319) | CUTLASS_HOST_DEVICE
      method get_producer_load_callbacks (line 326) | CUTLASS_DEVICE auto
      type ConsumerStoreCallbacks (line 332) | struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
        method CUTLASS_DEVICE (line 343) | CUTLASS_DEVICE void
        method CUTLASS_DEVICE (line 356) | CUTLASS_DEVICE Array<Element, FragmentSize>
      method get_consumer_store_callbacks (line 375) | CUTLASS_DEVICE auto

FILE: kernels/cuda/cutlass_gemm/common.hpp
  function next_pow_2 (line 15) | inline uint32_t next_pow_2(uint32_t const num) {
  function get_cuda_max_shared_memory_per_block_opt_in (line 20) | inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {

FILE: kernels/cuda/cutlass_gemm/cutlass.cpp
  function cutlass_scaled_mm (line 7) | torch::Tensor cutlass_scaled_mm(torch::Tensor a, torch::Tensor b, torch:...
  function PYBIND11_MODULE (line 17) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: kernels/cuda/inference/hadamard_transform/hadamard_transform.cpp
  function is_power_of_two (line 11) | constexpr bool is_power_of_two(uint32_t x) {
  function hadamard_transform (line 15) | torch::Tensor hadamard_transform(at::Tensor& in, bool inplace) {
  function PYBIND11_MODULE (line 58) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: kernels/cuda/inference/hadamard_transform/test.py
  function get_scale (line 24) | def get_scale(size):
  function truth_hadamard_transform_inplace (line 34) | def truth_hadamard_transform_inplace(a: torch.Tensor, truth_hadamards):
  function test_hadamard_transform_inplace_rowmajor (line 42) | def test_hadamard_transform_inplace_rowmajor(a: torch.Tensor):
  function check_correctness (line 48) | def check_correctness(m, elem_c, a, result, truth, atol=1e-2, rtol=0):

FILE: kernels/needs_perf_help/fp8_gemm_bench.py
  function bench (line 28) | def bench(cuda_graph: bool, rowwise_tma: bool=True) -> None:
  function bf16_bench (line 107) | def bf16_bench(x: Tensor, w: Tensor) -> Callable[[], Tensor]:
  function scale_row_bench (line 114) | def scale_row_bench(x: Tensor, w: Tensor) -> Callable[[], Tensor]:
  function row_gemm_bench (line 136) | def row_gemm_bench(x: Tensor, w: Tensor) -> Callable[[], Tensor]:
  function row_gemm_bench_tma (line 159) | def row_gemm_bench_tma(x: Tensor, w: Tensor) -> Callable[[], Tensor]:

FILE: kernels/needs_perf_help/fp8_rowwise_tma_persistent.py
  function get_fp8_constants (line 27) | def get_fp8_constants() -> Tuple[torch.dtype, tl.dtype, float, float]:
  function convert_fp8_type (line 46) | def convert_fp8_type(tensor, dtype) -> triton.TensorWrapper:
  function init_to_zero (line 60) | def init_to_zero(name):
  function get_configs_io_bound (line 64) | def get_configs_io_bound() -> List[Config]:
  function _kernel_matmul_fp8_row_tma_persistent (line 108) | def _kernel_matmul_fp8_row_tma_persistent(
  function matmul_fp8_row (line 235) | def matmul_fp8_row(

FILE: kernels/triton/inference/col_major_moe_gemm/perf_test_moe.py
  function torch_moe (line 16) | def torch_moe(a, w1, w2, topk_weight, topk_ids):
  function test_fused_moe (line 33) | def test_fused_moe(
  function benchmark (line 99) | def benchmark(m, provider):

FILE: kernels/triton/inference/col_major_moe_gemm/profile_moe.py
  function torch_moe (line 15) | def torch_moe(a, w1, w2, topk_weight, topk_ids):
  function test_fused_moe (line 32) | def test_fused_moe(

FILE: kernels/triton/inference/col_major_moe_gemm/test_moe_gemm.py
  function torch_moe (line 16) | def torch_moe(a, w1, w2, topk_weight, topk_ids):
  function test_fused_moe (line 39) | def test_fused_moe(

FILE: kernels/triton/inference/col_major_moe_gemm/v0_moe_fused.py
  function fused_moe_kernel (line 18) | def fused_moe_kernel(
  function moe_align_block_size (line 138) | def moe_align_block_size(
  function invoke_fused_moe_kernel (line 183) | def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.T...
  function fused_moe (line 222) | def fused_moe(hidden_states: torch.Tensor,

FILE: kernels/triton/inference/col_major_moe_gemm/v1_moe_fused.py
  function grouped_launch (line 21) | def grouped_launch(pid,
  function fused_moe_kernel_splitk (line 39) | def fused_moe_kernel_splitk(
  function moe_align_block_size (line 150) | def moe_align_block_size(
  function invoke_fused_moe_kernel (line 195) | def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.T...
  function fused_moe (line 249) | def fused_moe(hidden_states: torch.Tensor,

FILE: kernels/triton/inference/col_major_moe_gemm/v2_moe_fused.py
  function col_major (line 18) | def col_major(pid,
  function fused_moe_kernel (line 31) | def fused_moe_kernel(
  function moe_align_block_size (line 136) | def moe_align_block_size(
  function invoke_fused_moe_kernel (line 181) | def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.T...
  function fused_moe (line 220) | def fused_moe(hidden_states: torch.Tensor,

FILE: kernels/triton/inference/flash_attention/stay_attention.py
  function stay_attention (line 7) | def stay_attention(
  function flash_fn (line 107) | def flash_fn(q, k, v):

FILE: kernels/triton/inference/fp8/float8_groupwise_quant.py
  function _float8_groupwise_quant_kernel (line 21) | def _float8_groupwise_quant_kernel(
  function float8_groupwise_quantize (line 53) | def float8_groupwise_quantize(x: torch.Tensor, block_size=128):

FILE: kernels/triton/inference/fp8/scaled_fp8_gemm.py
  function grouped_launch (line 10) | def grouped_launch(pid,
  function column_major (line 27) | def column_major(pid,
  function scaled_gemm_splitk (line 39) | def scaled_gemm_splitk(a_ptr, b_ptr, c_ptr,
  function scaled_mm_splitk (line 94) | def scaled_mm_splitk(a, b, scale_a: float=1.0, scale_b: float=1.0):

FILE: kernels/triton/inference/fp8/splitk_gemm_fp8.py
  function grouped_launch (line 9) | def grouped_launch(pid,
  function col_major (line 27) | def col_major(pid,
  function gemm_split_k_kernel (line 40) | def gemm_split_k_kernel(a_ptr, b_ptr, c_ptr,
  function gemm_split_k (line 90) | def gemm_split_k(a, b):

FILE: kernels/triton/inference/fp8/tma_gemm.py
  function gemm_kernel_tma (line 7) | def gemm_kernel_tma(a_desc_ptr, b_desc_ptr, c_desc_ptr,  #
  function matmul (line 32) | def matmul(a, b, config=None):

FILE: kernels/triton/inference/gptq/a100_qlinear.py
  function _a100_quantized_matmul (line 6) | def _a100_quantized_matmul(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr,
  class a100_qlinear (line 77) | class a100_qlinear(torch.autograd.Function):
    method forward (line 78) | def forward(ctx, a, b, scales, zeros):

FILE: kernels/triton/inference/gptq/benchmark.py
  function benchmark_generation_speed (line 15) | def benchmark_generation_speed(model, tokenizer, prompt, batch_size, dev...
  function main (line 59) | def main():

FILE: kernels/triton/inference/gptq/h100_qlinear.py
  function _h100_quantized_matmul (line 7) | def _h100_quantized_matmul(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr,
  class h100_qlinear (line 87) | class h100_qlinear(torch.autograd.Function):
    method forward (line 88) | def forward(ctx, a, b, scales, zeros):

FILE: kernels/triton/inference/gptq/mixtral/test_dequant_moe_gemm.py
  function torch_moe (line 9) | def torch_moe(a, w1, w2, topk_weight, topk_ids):
  function test_dequant_moe (line 25) | def test_dequant_moe(

FILE: kernels/triton/inference/gptq/mixtral/w4a16_fused_dequant_gemm.py
  function print_tensor_dim (line 9) | def print_tensor_dim(tensor, str_name):
  function print_value (line 13) | def print_value(value):
  function grouped_launch (line 18) | def grouped_launch(pid,
  function col_major (line 36) | def col_major(pid,
  function w4a16_fused_moe_kernel (line 50) | def w4a16_fused_moe_kernel(
  function invoke_dequant_gemm_moe (line 153) | def invoke_dequant_gemm_moe(activations: torch.Tensor,
  function moe_align_block_size (line 211) | def moe_align_block_size(
  function dequant_gemm_moe (line 255) | def dequant_gemm_moe(hidden_states: torch.Tensor,

FILE: kernels/triton/inference/gptq/small_benchmark_cuda_graphs.py
  function swizzle_tile (line 11) | def swizzle_tile(pid,
  function matmul_data_parallel_kernel (line 29) | def matmul_data_parallel_kernel(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr,
  class small_qlinear (line 109) | class small_qlinear(torch.autograd.Function):
    method forward (line 110) | def forward(ctx, a, b, scales, zeros):
  function matmul_split_k_kernel (line 161) | def matmul_split_k_kernel(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr,
  function matmul_split_k (line 228) | def matmul_split_k(a, b, scales, zeros):
  function make_tensor (line 281) | def make_tensor(M, N, dtype):
  function gen_quant4 (line 292) | def gen_quant4(m, n, groupsize=-1):

FILE: kernels/triton/inference/gptq/splitk_dequant_gemm.py
  function swizzle_tile (line 7) | def swizzle_tile(pid,
  function matmul_split_k_kernel (line 24) | def matmul_split_k_kernel(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr,
  function matmul_split_k (line 91) | def matmul_split_k(a, b, scales, zeros):
  function make_tensor (line 143) | def make_tensor(M, N, dtype):

FILE: kernels/triton/inference/mamba/causal_1d_conv/causal_1d_conv/causal_1d_conv.py
  function _causal_conv1d_fwd_kernel (line 27) | def _causal_conv1d_fwd_kernel(
  function causal_conv1d_fwd (line 121) | def causal_conv1d_fwd(
  class CausalConv1dFn (line 203) | class CausalConv1dFn(torch.autograd.Function):
    method forward (line 205) | def forward(
    method backward (line 259) | def backward(ctx, dout, *args):
  function causal_conv1d_fn (line 295) | def causal_conv1d_fn(

FILE: kernels/triton/inference/mamba/causal_1d_conv/tests/test_causal_1d_conv.py
  function _undecorated_test_causal_conv1d (line 24) | def _undecorated_test_causal_conv1d(
  function causal_conv1d_ref (line 117) | def causal_conv1d_ref(
  function test_causal_conv1d (line 183) | def test_causal_conv1d(

FILE: kernels/triton/inference/paged_attention/attention_triton.py
  function print_tensor_dim (line 14) | def print_tensor_dim(tensor, str_name):
  function print_value (line 20) | def print_value(value):
  function print_line (line 27) | def print_line(str_line):
  function paged_attention_v1 (line 33) | def paged_attention_v1(
  function paged_attention_triton_v1 (line 158) | def paged_attention_triton_v1(
  function paged_attention_v2 (line 206) | def paged_attention_v2(
  function paged_attention_triton_v2 (line 358) | def paged_attention_triton_v2(

FILE: kernels/triton/inference/torch_compile/flash_backward.py
  class MetaData (line 43) | class MetaData():
    method __init__ (line 55) | def __init__(self, sm_scale=1.0):
    method set_varlen_params (line 58) | def set_varlen_params(self, cu_seqlens_q, cu_seqlens_k):
    method need_bias (line 70) | def need_bias(self, bias, batch, nheads, seqlen_q, seqlen_k):
    method need_alibi (line 77) | def need_alibi(self, alibi_slopes, batch, nheads):
    method need_causal (line 84) | def need_causal(self):
    method need_dropout (line 87) | def need_dropout(dropout_p, return_encoded_softmax):
    method check_args (line 91) | def check_args(self, q, k, v, o):
  function cdiv_fn (line 120) | def cdiv_fn(x,y):
  function max_fn (line 124) | def max_fn(x, y):
  function dropout_offsets (line 128) | def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
  function dropout_rng (line 134) | def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
  function dropout_mask (line 140) | def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
  function load_fn (line 146) | def load_fn(block_ptr, first, second, pad):
  function _attn_fwd_inner (line 158) | def _attn_fwd_inner(
  function attn_fwd (line 270) | def attn_fwd(
  function attention (line 553) | def attention(q, k, v, sm_scale):
  function _attn_bwd_preprocess (line 637) | def _attn_bwd_preprocess(
  function _bwd_kernel_dk_dv (line 691) | def _bwd_kernel_dk_dv(
  function _bwd_kernel_dq (line 761) | def _bwd_kernel_dq(dq, q, K, V,
  function _attn_bwd (line 821) | def _attn_bwd(Q, K, V, sm_scale, alibi_slopes,
  function flash_bwd (line 1016) | def flash_bwd(q: torch.Tensor, k: torch.Tensor, v:torch.Tensor, o: torch...
  function _ (line 1074) | def _(q, k, v, o, M, do):
  function flash (line 1083) | def flash(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, o: torch.Te...
  function _ (line 1159) | def _(q, k, v, o, M):
  function setup_context (line 1163) | def setup_context(ctx, inputs, output) -> torch.Tensor:
  function backward (line 1167) | def backward(ctx, do):
  function f (line 1187) | def f(q, k, v):

FILE: kernels/triton/training/fused_softmax/softmax.py
  function _get_num_warps (line 18) | def _get_num_warps(block_size: int)-> int:
  function _softmax_kernel_fwd (line 27) | def _softmax_kernel_fwd(
  function _softmax_kernel_bwd (line 56) | def _softmax_kernel_bwd(
  class triton_softmax (line 93) | class triton_softmax(autograd.Function):
    method forward (line 95) | def forward(ctx, x):
    method backward (line 122) | def backward(ctx, grad_probs):

FILE: kernels/triton/training/rms_norm/fused_rms_norm.py
  function _rms_norm_fwd_kernel (line 33) | def _rms_norm_fwd_kernel(
  function _rms_norm_bwd_kernel_sm (line 83) | def _rms_norm_bwd_kernel_sm(
  class ttt_RMSNorm (line 181) | class ttt_RMSNorm(torch.autograd.Function):
    method forward (line 183) | def forward(ctx, x, weight, eps):
    method backward (line 225) | def backward(ctx, dy):
  function fused_rms_norm_fn (line 313) | def fused_rms_norm_fn(
  class FusedRMSNorm (line 325) | class FusedRMSNorm(torch.nn.Module):
    method __init__ (line 326) | def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, ...
    method reset_parameters (line 333) | def reset_parameters(self):
    method forward (line 336) | def forward(

FILE: tutorials/triton/kernels/fused_softmax.py
  function _get_num_warps (line 15) | def _get_num_warps(block_size: int)-> int:
  function _softmax_kernel_fwd (line 24) | def _softmax_kernel_fwd(
  function _softmax_kernel_bwd (line 53) | def _softmax_kernel_bwd(
  class triton_softmax (line 90) | class triton_softmax(autograd.Function):
    method forward (line 92) | def forward(ctx, x):
    method backward (line 119) | def backward(ctx, grad_probs):

FILE: tutorials/triton/kernels/vector_add.py
  function kernel_vector_addition (line 9) | def kernel_vector_addition(a_ptr, b_ptr, out_ptr,
  function ceil_div (line 24) | def ceil_div(x: int, y: int)-> int:
  function vector_addition (line 27) | def vector_addition(a: torch.tensor, b: torch.tensor)-> torch.tensor:

FILE: tutorials/triton/tests/test_softmax.py
  function set_seed (line 13) | def set_seed():
  class TestForwardSoftMax (line 18) | class TestForwardSoftMax:
    method test_forward_2D_float32 (line 20) | def test_forward_2D_float32(self,):
    method test_forward_2D_bfloat16 (line 36) | def test_forward_2D_bfloat16(self,):
    method test_forward_3D_bfloat16 (line 51) | def test_forward_3D_bfloat16(self,):
  class TestBackwardSoftMax (line 70) | class TestBackwardSoftMax:
    method test_backward_2D (line 72) | def test_backward_2D(self,):
    method test_bwd_3D (line 96) | def test_bwd_3D(self,):

FILE: tutorials/triton/tests/test_utils.py
  function assert_expected (line 10) | def assert_expected(
  function set_rng_seed (line 26) | def set_rng_seed(seed):
  function gpu_test (line 31) | def gpu_test(gpu_count: int = 1):