SYMBOL INDEX (89 symbols across 11 files) FILE: csrc/config.hpp type deep_ep (line 6) | namespace deep_ep { function dtype_t (line 9) | dtype_t ceil_div(dtype_t a, dtype_t b) { function dtype_t (line 14) | dtype_t align_up(dtype_t a, dtype_t b) { function dtype_t (line 19) | dtype_t align_down(dtype_t a, dtype_t b) { type Config (line 23) | struct Config { method Config (line 30) | Config(int num_sms, method get_nvl_buffer_size_hint (line 52) | size_t get_nvl_buffer_size_hint(size_t hidden_bytes, int num_ranks) ... method get_rdma_buffer_size_hint (line 76) | size_t get_rdma_buffer_size_hint(int64_t hidden_bytes, int num_ranks... type LowLatencyBuffer (line 107) | struct LowLatencyBuffer { method clean_meta (line 121) | std::pair clean_meta() { type LowLatencyLayout (line 127) | struct LowLatencyLayout { method out_ptr_t (line 132) | out_ptr_t advance(const in_ptr_t& ptr, size_t count) { method LowLatencyLayout (line 136) | LowLatencyLayout(void* rdma_buffer, int num_max_dispatch_tokens_per_... function get_low_latency_rdma_size_hint (line 190) | size_t get_low_latency_rdma_size_hint(int num_max_dispatch_tokens_per_... FILE: csrc/deep_ep.cpp type shared_memory (line 15) | namespace shared_memory { function cu_mem_set_access_all (line 16) | void cu_mem_set_access_all(void* ptr, size_t size) { function cu_mem_free (line 30) | void cu_mem_free(void* ptr) { function get_size_align_to_granularity (line 42) | size_t get_size_align_to_granularity(size_t size_raw, size_t granulari... type deep_ep (line 126) | namespace deep_ep { function is_sm90_compiled (line 1817) | bool is_sm90_compiled() { function PYBIND11_MODULE (line 1846) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { FILE: csrc/deep_ep.hpp type shared_memory (line 24) | namespace shared_memory { type MemHandle (line 31) | struct MemHandle { class SharedMemoryAllocator (line 38) | class SharedMemoryAllocator { type deep_ep (line 52) | namespace deep_ep { FILE: csrc/event.hpp type deep_ep (line 7) | namespace deep_ep { type EventHandle (line 9) | struct EventHandle { method EventHandle (line 12) | EventHandle() { method EventHandle (line 17) | explicit EventHandle(const at::cuda::CUDAStream& stream) { method EventHandle (line 22) | EventHandle(const EventHandle& other) = default; method current_stream_wait (line 24) | void current_stream_wait() const { at::cuda::getCurrentCUDAStream().... function create_event (line 27) | torch::Event create_event(const at::cuda::CUDAStream& s) { function stream_wait (line 33) | void stream_wait(const at::cuda::CUDAStream& s_0, const at::cuda::CUDA... function stream_wait (line 38) | void stream_wait(const at::cuda::CUDAStream& s, const EventHandle& eve... FILE: deep_ep/buffer.py class Buffer (line 13) | class Buffer: method __init__ (line 32) | def __init__(self, method destroy (line 138) | def destroy(self): method is_sm90_compiled (line 150) | def is_sm90_compiled(): method set_num_sms (line 154) | def set_num_sms(new_num_sms: int) -> None: method capture (line 166) | def capture() -> EventOverlap: method get_low_latency_rdma_size_hint (line 176) | def get_low_latency_rdma_size_hint(num_max_dispatch_tokens_per_rank: i... method get_comm_stream (line 191) | def get_comm_stream(self) -> torch.Stream: method get_local_buffer_tensor (line 201) | def get_local_buffer_tensor(self, method _unpack_bias (line 223) | def _unpack_bias(bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.T... method get_dispatch_config (line 233) | def get_dispatch_config(num_ranks: int) -> Config: method get_combine_config (line 263) | def get_combine_config(num_ranks: int) -> Config: method get_dispatch_layout (line 293) | def get_dispatch_layout(self, topk_idx: torch.Tensor, num_experts: int, method dispatch (line 322) | def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Te... method combine (line 405) | def combine(self, x: torch.Tensor, handle: Tuple, method internode_dispatch (line 453) | def internode_dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor... method internode_combine (line 504) | def internode_combine(self, x: torch.Tensor, handle: Union[tuple, list], method clean_low_latency_buffer (line 533) | def clean_low_latency_buffer(self, num_max_dispatch_tokens_per_rank: i... method low_latency_dispatch (line 548) | def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor, method low_latency_combine (line 617) | def low_latency_combine(self, x: torch.Tensor, topk_idx: torch.Tensor,... method low_latency_update_mask_buffer (line 663) | def low_latency_update_mask_buffer(self, rank_to_mask: int, mask: bool... method low_latency_query_mask_buffer (line 674) | def low_latency_query_mask_buffer(self, mask_status: torch.Tensor): method low_latency_clean_mask_buffer (line 684) | def low_latency_clean_mask_buffer(self): method get_next_low_latency_combine_buffer (line 691) | def get_next_low_latency_combine_buffer(self, handle: object): FILE: deep_ep/utils.py class EventOverlap (line 10) | class EventOverlap: method __init__ (line 19) | def __init__(self, event: Optional[EventHandle] = None, extra_tensors:... method current_stream_wait (line 33) | def current_stream_wait(self) -> None: method __enter__ (line 40) | def __enter__(self) -> Any: method __exit__ (line 54) | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: function check_nvlink_connections (line 64) | def check_nvlink_connections(group: dist.ProcessGroup): FILE: setup.py function get_nvshmem_host_lib_name (line 11) | def get_nvshmem_host_lib_name(base_dir): FILE: tests/test_internode.py function test_main (line 16) | def test_main(args: argparse.Namespace, function test_loop (line 316) | def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Name... FILE: tests/test_intranode.py function test_main (line 15) | def test_main(args: argparse.Namespace, num_sms: int, local_rank: int, n... function test_loop (line 266) | def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Name... FILE: tests/test_low_latency.py function simulate_failure_and_skip (line 12) | def simulate_failure_and_skip(rank: int, api: Literal["dispatch", "combi... function query_mask_buffer_and_check (line 31) | def query_mask_buffer_and_check(api: Literal["dispatch", "combine", "cle... function test_main (line 37) | def test_main(num_tokens: int, function test_loop (line 253) | def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Name... FILE: tests/utils.py function init_dist (line 14) | def init_dist(local_rank: int, num_local_ranks: int): function calc_diff (line 39) | def calc_diff(x: torch.Tensor, y: torch.Tensor): function align_up (line 46) | def align_up(x, y): function per_token_cast_to_fp8 (line 50) | def per_token_cast_to_fp8(x: torch.Tensor): function per_token_cast_back (line 61) | def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor): function inplace_unique (line 77) | def inplace_unique(x: torch.Tensor, num_slots: int): function create_grouped_scores (line 92) | def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor,... function bench (line 100) | def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None): class empty_suppress (line 128) | class empty_suppress: method __enter__ (line 130) | def __enter__(self): method __exit__ (line 133) | def __exit__(self, *_): class suppress_stdout_stderr (line 137) | class suppress_stdout_stderr: method __enter__ (line 139) | def __enter__(self): method __exit__ (line 159) | def __exit__(self, *_): function bench_kineto (line 173) | def bench_kineto(fn, function hash_tensor (line 241) | def hash_tensor(t: torch.Tensor):