SYMBOL INDEX (89 symbols across 11 files)

FILE: csrc/config.hpp
  type deep_ep (line 6) | namespace deep_ep {
    function ceil_div (line 9) | dtype_t ceil_div(dtype_t a, dtype_t b) {
    function align_up (line 14) | dtype_t align_up(dtype_t a, dtype_t b) {
    function align_down (line 19) | dtype_t align_down(dtype_t a, dtype_t b) {
    type Config (line 23) | struct Config {
      method Config (line 30) | Config(int num_sms,
      method get_nvl_buffer_size_hint (line 52) | size_t get_nvl_buffer_size_hint(size_t hidden_bytes, int num_ranks) ...
      method get_rdma_buffer_size_hint (line 76) | size_t get_rdma_buffer_size_hint(int64_t hidden_bytes, int num_ranks...
    type LowLatencyBuffer (line 107) | struct LowLatencyBuffer {
      method clean_meta (line 121) | std::pair<int*, int> clean_meta() {
    type LowLatencyLayout (line 127) | struct LowLatencyLayout {
      method advance (line 132) | out_ptr_t advance(const in_ptr_t& ptr, size_t count) {
      method LowLatencyLayout (line 136) | LowLatencyLayout(void* rdma_buffer, int num_max_dispatch_tokens_per_...
    function get_low_latency_rdma_size_hint (line 190) | size_t get_low_latency_rdma_size_hint(int num_max_dispatch_tokens_per_...

FILE: csrc/deep_ep.cpp
  type shared_memory (line 15) | namespace shared_memory {
    function cu_mem_set_access_all (line 16) | void cu_mem_set_access_all(void* ptr, size_t size) {
    function cu_mem_free (line 30) | void cu_mem_free(void* ptr) {
    function get_size_align_to_granularity (line 42) | size_t get_size_align_to_granularity(size_t size_raw, size_t granulari...
  type deep_ep (line 126) | namespace deep_ep {
    function is_sm90_compiled (line 1817) | bool is_sm90_compiled() {
  function PYBIND11_MODULE (line 1846) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: csrc/deep_ep.hpp
  type shared_memory (line 24) | namespace shared_memory {
    type MemHandle (line 31) | struct MemHandle {
    class SharedMemoryAllocator (line 38) | class SharedMemoryAllocator {
  type deep_ep (line 52) | namespace deep_ep {

FILE: csrc/event.hpp
  type deep_ep (line 7) | namespace deep_ep {
    type EventHandle (line 9) | struct EventHandle {
      method EventHandle (line 12) | EventHandle() {
      method EventHandle (line 17) | explicit EventHandle(const at::cuda::CUDAStream& stream) {
      method EventHandle (line 22) | EventHandle(const EventHandle& other) = default;
      method current_stream_wait (line 24) | void current_stream_wait() const { at::cuda::getCurrentCUDAStream()....
    function create_event (line 27) | torch::Event create_event(const at::cuda::CUDAStream& s) {
    function stream_wait (line 33) | void stream_wait(const at::cuda::CUDAStream& s_0, const at::cuda::CUDA...
    function stream_wait (line 38) | void stream_wait(const at::cuda::CUDAStream& s, const EventHandle& eve...

FILE: deep_ep/buffer.py
  class Buffer (line 13) | class Buffer:
    method __init__ (line 32) | def __init__(self,
    method destroy (line 138) | def destroy(self):
    method is_sm90_compiled (line 150) | def is_sm90_compiled():
    method set_num_sms (line 154) | def set_num_sms(new_num_sms: int) -> None:
    method capture (line 166) | def capture() -> EventOverlap:
    method get_low_latency_rdma_size_hint (line 176) | def get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank: i...
    method get_comm_stream (line 191) | def get_comm_stream(self) -> torch.Stream:
    method get_local_buffer_tensor (line 201) | def get_local_buffer_tensor(self,
    method _unpack_bias (line 223) | def _unpack_bias(bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.T...
    method get_dispatch_config (line 233) | def get_dispatch_config(num_ranks: int) -> Config:
    method get_combine_config (line 263) | def get_combine_config(num_ranks: int) -> Config:
    method get_dispatch_layout (line 293) | def get_dispatch_layout(self, topk_idx: torch.Tensor, num_experts: int,
    method dispatch (line 322) | def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Te...
    method combine (line 405) | def combine(self, x: torch.Tensor, handle: Tuple,
    method internode_dispatch (line 453) | def internode_dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor...
    method internode_combine (line 504) | def internode_combine(self, x: torch.Tensor, handle: Union[tuple, list],
    method clean_low_latency_buffer (line 533) | def clean_low_latency_buffer(self, num_max_dispatch_tokens_per_rank: i...
    method low_latency_dispatch (line 548) | def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
    method low_latency_combine (line 617) | def low_latency_combine(self, x: torch.Tensor, topk_idx: torch.Tensor,...
    method low_latency_update_mask_buffer (line 663) | def low_latency_update_mask_buffer(self, rank_to_mask: int, mask: bool...
    method low_latency_query_mask_buffer (line 674) | def low_latency_query_mask_buffer(self, mask_status: torch.Tensor):
    method low_latency_clean_mask_buffer (line 684) | def low_latency_clean_mask_buffer(self):
    method get_next_low_latency_combine_buffer (line 691) | def get_next_low_latency_combine_buffer(self, handle: object):

FILE: deep_ep/utils.py
  class EventOverlap (line 10) | class EventOverlap:
    method __init__ (line 19) | def __init__(self, event: Optional[EventHandle] = None, extra_tensors:...
    method current_stream_wait (line 33) | def current_stream_wait(self) -> None:
    method __enter__ (line 40) | def __enter__(self) -> Any:
    method __exit__ (line 54) | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
  function check_nvlink_connections (line 64) | def check_nvlink_connections(group: dist.ProcessGroup):

FILE: setup.py
  function get_nvshmem_host_lib_name (line 11) | def get_nvshmem_host_lib_name(base_dir):

FILE: tests/test_internode.py
  function test_main (line 16) | def test_main(args: argparse.Namespace,
  function test_loop (line 316) | def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Name...

FILE: tests/test_intranode.py
  function test_main (line 15) | def test_main(args: argparse.Namespace, num_sms: int, local_rank: int, n...
  function test_loop (line 266) | def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Name...

FILE: tests/test_low_latency.py
  function simulate_failure_and_skip (line 12) | def simulate_failure_and_skip(rank: int, api: Literal["dispatch", "combi...
  function query_mask_buffer_and_check (line 31) | def query_mask_buffer_and_check(api: Literal["dispatch", "combine", "cle...
  function test_main (line 37) | def test_main(num_tokens: int,
  function test_loop (line 253) | def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Name...

FILE: tests/utils.py
  function init_dist (line 14) | def init_dist(local_rank: int, num_local_ranks: int):
  function calc_diff (line 39) | def calc_diff(x: torch.Tensor, y: torch.Tensor):
  function align_up (line 46) | def align_up(x, y):
  function per_token_cast_to_fp8 (line 50) | def per_token_cast_to_fp8(x: torch.Tensor):
  function per_token_cast_back (line 61) | def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
  function inplace_unique (line 77) | def inplace_unique(x: torch.Tensor, num_slots: int):
  function create_grouped_scores (line 92) | def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor,...
  function bench (line 100) | def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
  class empty_suppress (line 128) | class empty_suppress:
    method __enter__ (line 130) | def __enter__(self):
    method __exit__ (line 133) | def __exit__(self, *_):
  class suppress_stdout_stderr (line 137) | class suppress_stdout_stderr:
    method __enter__ (line 139) | def __enter__(self):
    method __exit__ (line 159) | def __exit__(self, *_):
  function bench_kineto (line 173) | def bench_kineto(fn,
  function hash_tensor (line 241) | def hash_tensor(t: torch.Tensor):