SYMBOL INDEX (1483 symbols across 133 files)

FILE: csrc/dp_core.cpp
  function argmin (line 13) | inline size_t argmin(const ForwardIterator begin, const ForwardIterator ...
  function argmax (line 19) | inline size_t argmax(const ForwardIterator begin, const ForwardIterator ...
  function dynamic_programming_core (line 24) | std::pair<std::map<int, double>, std::map<int, int> > dynamic_programmin...
  function PYBIND11_MODULE (line 122) | PYBIND11_MODULE(galvatron_dp_core, m) {

FILE: galvatron/core/args_schema.py
  class CoreArgs (line 46) | class CoreArgs(BaseModel):

FILE: galvatron/core/arguments.py
  function _coerce_cli_value (line 15) | def _coerce_cli_value(raw: str) -> Any:
  function _legacy_cli_to_flat_map (line 33) | def _legacy_cli_to_flat_map(tokens: List[str]) -> Dict[str, Any]:
  function _runtime_subsection_for_key (line 52) | def _runtime_subsection_for_key(key: str) -> Optional[str]:
  function _legacy_cli_to_hydra_overrides (line 64) | def _legacy_cli_to_hydra_overrides(tokens: List[str]) -> List[str]:
  function _normalize_runtime_model_dtype (line 88) | def _normalize_runtime_model_dtype(config_dict: Dict[str, Any]) -> None:
  function _normalize_profiler_fields (line 115) | def _normalize_profiler_fields(config_dict: Dict[str, Any]) -> None:
  function load_with_hydra (line 125) | def load_with_hydra(

FILE: galvatron/core/cost_model/components/embedding_lmhead_cost.py
  class EmbeddingLMHeadTimeCostModel (line 9) | class EmbeddingLMHeadTimeCostModel:
    method __init__ (line 18) | def __init__(
    method initialize (line 59) | def initialize(self):
    method estimate_computation_time (line 81) | def estimate_computation_time(self):
    method estimate_dp_communication_time (line 99) | def estimate_dp_communication_time(self):
    method estimate_tp_communication_time (line 125) | def estimate_tp_communication_time(self):
    method get_overlap_time (line 155) | def get_overlap_time(self, forward_comm_time, forward_comp_time, backw...
    method gen_result (line 168) | def gen_result(self) -> Tuple[List[float], List[float]]:
  class EmbeddingLMHeadMemoryCostModel (line 187) | class EmbeddingLMHeadMemoryCostModel:
    method __init__ (line 195) | def __init__(
    method initialize (line 231) | def initialize(self):
    method estimate_model_states_size (line 261) | def estimate_model_states_size(self):
    method estimate_activation_size (line 280) | def estimate_activation_size(self):
    method get_memory_cost (line 302) | def get_memory_cost(self):

FILE: galvatron/core/cost_model/components/layer_cost.py
  class TimeCostModelBase (line 9) | class TimeCostModelBase:
    method __init__ (line 18) | def __init__(
    method initialize (line 58) | def initialize(self):
    method estimate_computation_time (line 88) | def estimate_computation_time(self):
    method estimate_dp_communication_time (line 105) | def estimate_dp_communication_time(self):
    method estimate_tp_communication_time (line 119) | def estimate_tp_communication_time(self): # TODO: split tp and sp to d...
    method estimate_pp_communication_time (line 152) | def estimate_pp_communication_time(self):
    method bct_dp_overlap (line 161) | def bct_dp_overlap(self, dp_message_size, bct):
    method get_result (line 180) | def get_result(self, no_gradient_sync:bool = False):
    method gen_result (line 210) | def gen_result(self) -> tuple[float, float]:
  class MemoryCostModelBase (line 215) | class MemoryCostModelBase:
    method __init__ (line 223) | def __init__(
    method initialize (line 261) | def initialize(self):
    method estimate_parameter_size (line 302) | def estimate_parameter_size(self):
    method estimate_model_states_size (line 306) | def estimate_model_states_size(self):
    method estimate_activation_size (line 313) | def estimate_activation_size(self):
    method get_memory_cost (line 322) | def get_memory_cost(self):

FILE: galvatron/core/cost_model/cost_model_args.py
  class ModelArgs (line 6) | class ModelArgs:
  class TrainArgs (line 13) | class TrainArgs:
  class ParallelArgs (line 20) | class ParallelArgs:
  class ProfileModelArgs (line 29) | class ProfileModelArgs:
  class ProfileHardwareArgs (line 37) | class ProfileHardwareArgs:

FILE: galvatron/core/cost_model/cost_model_handler.py
  function get_time_cost_all_stages (line 8) | def get_time_cost_all_stages(layer_timecosts, pp_stage_division):
  function pipeline_costmodel (line 16) | def pipeline_costmodel(

FILE: galvatron/core/profiler/args_schema.py
  class GalvatronModelProfilerArgs (line 9) | class GalvatronModelProfilerArgs(BaseModel):
  class ProfilerHardwareArgs (line 40) | class ProfilerHardwareArgs(BaseModel):

FILE: galvatron/core/profiler/arguments.py
  function galvatron_profile_args (line 1) | def galvatron_profile_args(parser):
  function galvatron_profile_hardware_args (line 108) | def galvatron_profile_hardware_args(parser):

FILE: galvatron/core/profiler/base_profiler.py
  class BaseProfiler (line 4) | class BaseProfiler():
    method __init__ (line 5) | def __init__(self):
    method set_work_dir (line 13) | def set_work_dir(self, work_dir):
    method set_model_name (line 16) | def set_model_name(self, model_name):
    method set_profile_unit (line 19) | def set_profile_unit(self, profile_unit):
    method set_mixed_precision (line 22) | def set_mixed_precision(self, mixed_precision):
    method set_specific_time_path (line 25) | def set_specific_time_path(self, specific_time_path):
    method set_specific_memory_path (line 28) | def set_specific_memory_path(self, specific_memory_path):
    method memory_profiling_path (line 31) | def memory_profiling_path(self):
    method time_profiling_path (line 48) | def time_profiling_path(self):

FILE: galvatron/core/profiler/hardware_profiler.py
  class HardwareProfiler (line 9) | class HardwareProfiler(BaseProfiler):
    method __init__ (line 12) | def __init__(self, args: ProfilerHardwareArgs):
    method set_path (line 17) | def set_path(self, path: str) -> None:
    method get_env (line 21) | def get_env(self) -> str:
    method generate_script (line 39) | def generate_script(self, num_nodes: int, num_gpus_per_node: int) -> N...
    method generate_sp_script (line 99) | def generate_sp_script(self, num_nodes: int, num_gpus_per_node: int) -...
    method profile_bandwidth (line 156) | def profile_bandwidth(self) -> None:
    method profile_sp_bandwidth (line 161) | def profile_sp_bandwidth(self):
    method write_config (line 166) | def write_config(self, hardware_config_path: str, key: str, bandwidth:...
    method profile_overlap (line 180) | def profile_overlap(self):
  function _halving_tp_degrees (line 196) | def _halving_tp_degrees(world_size: int, max_tp: int) -> list[int]:
  function _halving_batch_sizes (line 206) | def _halving_batch_sizes(start: int = 1024) -> list[int]:
  function _p2p_pp_deg_sweep (line 216) | def _p2p_pp_deg_sweep(world_size: int, max_pp_deg: int) -> list[int]:
  function _shell_int_list (line 226) | def _shell_int_list(xs: list[int]) -> str:

FILE: galvatron/core/profiler/model_profiler.py
  class ModelProfiler (line 15) | class ModelProfiler(BaseProfiler):
    method __init__ (line 18) | def __init__(self, args: GalvatronModelProfilerArgs):
    method set_profiler_launcher (line 42) | def set_profiler_launcher(self, path: str, model_name: Optional[str] =...
    method get_global_batch_size_list (line 60) | def get_global_batch_size_list(self) -> List[int]:
    method get_layernum_tuple_list (line 76) | def get_layernum_tuple_list(self) -> Union[List[Tuple[int]], List[Tupl...
    method get_seq_length_tuple_list (line 95) | def get_seq_length_tuple_list(self) -> Union[List[Tuple[int]], List[Tu...
    method get_basic_overrides_dict (line 138) | def get_basic_overrides_dict(self) -> Dict[str, Any]:
    method get_envs_dict (line 199) | def get_envs_dict(self) -> Dict[str, Any]:
    method dict_to_str (line 208) | def dict_to_str(self, d: dict, sep: str = "=") -> str:
    method launch_profiling_scripts (line 215) | def launch_profiling_scripts(self) -> None:
    method _launch_memory_profiling (line 231) | def _launch_memory_profiling(self) -> None:
    method _launch_computation_profiling (line 343) | def _launch_computation_profiling(self) -> None:
    method process_profiled_data (line 394) | def process_profiled_data(self) -> None:
    method _process_computation_data (line 422) | def _process_computation_data(self, layernum_lists: List[List[int]]) -...
    method _process_memory_data (line 473) | def _process_memory_data(self, world_size: int, layernum_lists: List[L...
    method _process_single_sequence_config (line 520) | def _process_single_sequence_config(
    method key_format (line 806) | def key_format(
    method total_memcost (line 846) | def total_memcost(
    method argval2str (line 883) | def argval2str(self, val: Union[List, Any]) -> str:
    method arg2str (line 896) | def arg2str(self, key: str, val: Union[List, Any]) -> str:
    method args2str (line 908) | def args2str(self, args: Union[Dict, List[Tuple]], exclude_args: List[...
    method env_args (line 929) | def env_args(self) -> Dict[str, Union[str, int]]:
    method launch_scripts (line 952) | def launch_scripts(self, env_args: Dict[str, str]) -> str:

FILE: galvatron/core/profiler/runtime_profiler.py
  class RuntimeProfiler (line 12) | class RuntimeProfiler(BaseProfiler):
    method __init__ (line 15) | def __init__(self, args: GalvatronRuntimeArgs):
    method set_profiler_dist (line 24) | def set_profiler_dist(
    method set_profiler_single (line 64) | def set_profiler_single(self, start_iter=10, end_iter=20):
    method set_model_layer_configs (line 76) | def set_model_layer_configs(self, model_layer_configs: Optional[List[D...
    method set_memory_profiler (line 92) | def set_memory_profiler(self, rank: int, profile_ranks: List[int] = []...
    method profile_memory (line 105) | def profile_memory(self, iter: int, stage: str = "") -> None:
    method post_profile_memory (line 134) | def post_profile_memory(self, iter: int) -> None:
    method set_time_profiler (line 197) | def set_time_profiler(self, start_iter: int, end_iter: int, exit: bool...
    method profile_time_start (line 218) | def profile_time_start(self, iter: int) -> None:
    method profile_time_end (line 233) | def profile_time_end(
    method profile_time_python (line 260) | def profile_time_python(self, iter: int) -> None:
    method _process_time_results (line 290) | def _process_time_results(self) -> None:
    method _filtered_time_samples (line 312) | def _filtered_time_samples(self) -> List[float]:
    method _log_iteration_stats (line 333) | def _log_iteration_stats(

FILE: galvatron/core/profiler/utils.py
  function print_peak_memory (line 8) | def print_peak_memory(prefix, device, type="allocated"):
  function save_profiled_memory (line 22) | def save_profiled_memory(
  function save_profiled_time (line 57) | def save_profiled_time(path, time, bsz, layer_num, seq):

FILE: galvatron/core/runtime/__init__.py
  function _reshard (line 23) | def _reshard(

FILE: galvatron/core/runtime/args_schema.py
  class GalvatronParallelArgs (line 18) | class GalvatronParallelArgs(BaseModel):
  class GalvatronModelArgs (line 51) | class GalvatronModelArgs(BaseModel):
    method model_type (line 174) | def model_type(self):
  class GalvatronProfileArgs (line 178) | class GalvatronProfileArgs(BaseModel):
  class CommonTrainArgs (line 195) | class CommonTrainArgs(BaseModel):
  function _str_to_list (line 262) | def _str_to_list(v):
  class CommonDataArgs (line 271) | class CommonDataArgs(BaseModel):
    method str_to_list (line 298) | def str_to_list(cls, v):
  class CommonCkptArgs (line 323) | class CommonCkptArgs(BaseModel):
  class LoggingConfig (line 335) | class LoggingConfig(BaseModel):
  class GalvatronRuntimeArgs (line 344) | class GalvatronRuntimeArgs(BaseModel):

FILE: galvatron/core/runtime/checkpoint/gpt_adapter.py
  function load_hf_checkpoint (line 18) | def load_hf_checkpoint(load, tp_groups, name, submodule, module):
  function load_gpt_module (line 154) | def load_gpt_module(load, tp_groups, name, submodule, module, distribute...

FILE: galvatron/core/runtime/checkpoint/llama_adapter.py
  function load_distributed_checkpoint (line 30) | def load_distributed_checkpoint(load, tp_groups, name, submodule, module):
  function load_hf_checkpoint (line 51) | def load_hf_checkpoint(load, tp_groups, name, submodule, module):
  function load_llama_module (line 164) | def load_llama_module(load, tp_groups, name, submodule, module, distribu...
  function save_llama_module (line 172) | def save_llama_module(save_path, model, optimizer, opt_param_scheduler, ...

FILE: galvatron/core/runtime/checkpoint/moe_adapter.py
  function _runtime_args (line 37) | def _runtime_args():
  function _load_file (line 45) | def _load_file(path):
  function _copy_module_state (line 49) | def _copy_module_state(checkpoint, name, submodule):
  function load_distributed_checkpoint (line 58) | def load_distributed_checkpoint(load, tp_groups, name, submodule, module...
  function _load_embedding_from_hf (line 102) | def _load_embedding_from_hf(load, tp_groups, submodule):
  function _load_lm_head_from_hf (line 123) | def _load_lm_head_from_hf(load, tp_groups, submodule):
  function _load_attention_from_hf (line 144) | def _load_attention_from_hf(checkpoint, tp_groups, name, submodule):
  function _load_router_from_hf (line 185) | def _load_router_from_hf(checkpoint, submodule):
  function _load_mlp_from_hf (line 192) | def _load_mlp_from_hf(checkpoint, tp_groups, name, submodule, module):
  function load_hf_checkpoint (line 225) | def load_hf_checkpoint(load, tp_groups, name, submodule, module, ep_grou...
  function load_moe_module (line 258) | def load_moe_module(load, tp_groups, name, submodule, module, distribute...
  function save_moe_module (line 266) | def save_moe_module(save_path, model, optimizer, opt_param_scheduler, it...

FILE: galvatron/core/runtime/comm_groups.py
  class CommGroup (line 4) | class CommGroup(object):
    method __init__ (line 5) | def __init__(self, ranks:List[int]):
    method has_rank (line 10) | def has_rank(self, rank):
    method print (line 13) | def print(self):
  function show_groups (line 17) | def show_groups(groups:List[CommGroup]):
  function build_rank_to_parallel_coords (line 26) | def build_rank_to_parallel_coords(world_size, name2size, order='pp-dp-cp...
  function get_groups (line 44) | def get_groups(degree_rank_dict:Dict[int, Dict[str, int]], ignore_keys=[...
  function get_embedding_group (line 66) | def get_embedding_group(pp_size, pp_group:CommGroup, manual_global_rank=...
  function merge_redistributed_group (line 73) | def merge_redistributed_group(split_tp_sp_cp_group:CommGroup, allgather_...
  function gen_comm_groups (line 108) | def gen_comm_groups(

FILE: galvatron/core/runtime/dataloader.py
  class FakeCausalLMDataset (line 35) | class FakeCausalLMDataset(Dataset):
    method __init__ (line 38) | def __init__(self, args, device, dataset_size=2560 * 16):
    method __len__ (line 45) | def __len__(self):
    method __getitem__ (line 48) | def __getitem__(self, idx):
  function random_collate_fn (line 52) | def random_collate_fn(batch):
  function build_pretraining_data_loader (line 73) | def build_pretraining_data_loader(dataset, consumed_samples):
  class MegatronPretrainingSampler (line 113) | class MegatronPretrainingSampler:
    method __init__ (line 115) | def __init__(self, total_samples, consumed_samples, micro_batch_size,
    method __len__ (line 138) | def __len__(self):
    method get_start_end_idx (line 141) | def get_start_end_idx(self):
    method __iter__ (line 146) | def __iter__(self):
  class RandomSeedDataset (line 162) | class RandomSeedDataset(Dataset):
    method __init__ (line 164) | def __init__(self, dataset):
    method __len__ (line 170) | def __len__(self):
    method set_epoch (line 173) | def set_epoch(self, epoch):
    method __getitem__ (line 176) | def __getitem__(self, idx):
  class MegatronPretrainingRandomSampler (line 184) | class MegatronPretrainingRandomSampler:
    method __init__ (line 186) | def __init__(self, dataset, total_samples, consumed_samples, micro_bat...
    method __len__ (line 210) | def __len__(self):
    method __iter__ (line 213) | def __iter__(self):
  function get_blend_and_blend_per_split (line 254) | def get_blend_and_blend_per_split(args):
  function get_train_valid_test_num_samples (line 299) | def get_train_valid_test_num_samples():
  function build_train_valid_test_datasets (line 321) | def build_train_valid_test_datasets(build_train_valid_test_datasets_prov...
  function build_train_valid_test_data_loaders (line 331) | def build_train_valid_test_data_loaders(
  function build_train_valid_test_data_iterators (line 389) | def build_train_valid_test_data_iterators(
  function _build_random_data_iterator (line 442) | def _build_random_data_iterator():
  function get_train_valid_test_data_iterators (line 460) | def get_train_valid_test_data_iterators():
  function get_batch (line 509) | def get_batch(data_iterator):
  function _loss_func (line 541) | def _loss_func(micro_lossmask, label: List, output_tensor: List):

FILE: galvatron/core/runtime/datasets/megatron/blended_dataset.py
  class BlendedDataset (line 24) | class BlendedDataset(torch.utils.data.Dataset):
    method __init__ (line 41) | def __init__(
    method __len__ (line 90) | def __len__(self) -> int:
    method __getitem__ (line 93) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
    method _build_indices (line 98) | def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:

FILE: galvatron/core/runtime/datasets/megatron/blended_megatron_dataset_builder.py
  function need_to_build_dataset (line 28) | def need_to_build_dataset():
  class BlendedMegatronDatasetBuilder (line 39) | class BlendedMegatronDatasetBuilder(object):
    method __init__ (line 54) | def __init__(
    method build (line 94) | def build(self) -> List[Optional[TopLevelDataset]]:
    method _build_blended_dataset_splits (line 186) | def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDatas...
    method _build_megatron_datasets_parallel (line 353) | def _build_megatron_datasets_parallel(
    method _build_megatron_dataset_splits (line 435) | def _build_megatron_dataset_splits(
    method build_generic_dataset (line 502) | def build_generic_dataset(
  function _get_size_per_split_per_dataset (line 561) | def _get_size_per_split_per_dataset(

FILE: galvatron/core/runtime/datasets/megatron/blended_megatron_dataset_config.py
  class BlendedMegatronDatasetConfig (line 16) | class BlendedMegatronDatasetConfig:
    method __post_init__ (line 66) | def __post_init__(self) -> None:
  function parse_and_normalize_split (line 109) | def parse_and_normalize_split(split: str) -> List[float]:
  function convert_split_vector_to_split_matrix (line 129) | def convert_split_vector_to_split_matrix(

FILE: galvatron/core/runtime/datasets/megatron/gpt_dataset.py
  class GPTDatasetConfig (line 26) | class GPTDatasetConfig(BlendedMegatronDatasetConfig):
    method __post_init__ (line 54) | def __post_init__(self) -> None:
  class GPTDataset (line 65) | class GPTDataset(MegatronDataset):
    method __init__ (line 83) | def __init__(
    method numel_low_level_dataset (line 117) | def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
    method build_low_level_dataset (line 132) | def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfi...
    method __len__ (line 152) | def __len__(self) -> int:
    method __getitem__ (line 160) | def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]:
    method _query_document_sample_shuffle_indices (line 233) | def _query_document_sample_shuffle_indices(
    method _build_document_sample_shuffle_indices (line 304) | def _build_document_sample_shuffle_indices(
    method _get_num_tokens_per_epoch (line 525) | def _get_num_tokens_per_epoch(self) -> int:
    method _get_num_epochs (line 533) | def _get_num_epochs(self, num_tokens_per_epoch: int) -> int:
  function _build_document_index (line 556) | def _build_document_index(
  function _build_shuffle_index (line 589) | def _build_shuffle_index(
  function _get_ltor_masks_and_position_ids (line 620) | def _get_ltor_masks_and_position_ids(
  class MockGPTLowLevelDataset (line 697) | class MockGPTLowLevelDataset:
    method __init__ (line 717) | def __init__(self, tokenizer: MegatronTokenizer) -> None:
    method __len__ (line 724) | def __len__(self) -> int:
    method __getitem__ (line 727) | def __getitem__(self, idx: int) -> numpy.number:
    method get (line 734) | def get(self, idx: int, offset: int = 0, length: Optional[int] = None)...
  class MockGPTDataset (line 752) | class MockGPTDataset(GPTDataset):
    method __init__ (line 770) | def __init__(
    method numel_low_level_dataset (line 784) | def numel_low_level_dataset(low_level_dataset: MockGPTLowLevelDataset)...
    method build_low_level_dataset (line 796) | def build_low_level_dataset(

FILE: galvatron/core/runtime/datasets/megatron/helpers.cpp
  function build_exhaustive_blending_indices (line 21) | void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_ind...
  function build_blending_indices (line 75) | void build_blending_indices(py::array_t<int16_t> &dataset_index,
  function build_sample_idx (line 143) | py::array_t<T> build_sample_idx(
  function get_target_sample_len (line 248) | inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
  function build_mapping_impl (line 266) | py::array build_mapping_impl(const py::array_t<int64_t> &docs_,
  function build_mapping (line 526) | py::array build_mapping(const py::array_t<int64_t> &docs_,
  function build_blocks_mapping_impl (line 564) | py::array build_blocks_mapping_impl(const py::array_t<int64_t> &docs_,
  function build_blocks_mapping (line 805) | py::array build_blocks_mapping(const py::array_t<int64_t> &docs_,
  function PYBIND11_MODULE (line 838) | PYBIND11_MODULE(helpers_cpp, m)

FILE: galvatron/core/runtime/datasets/megatron/helpers.py
  function build_sample_idx (line 11) | def build_sample_idx(

FILE: galvatron/core/runtime/datasets/megatron/indexed_dataset.py
  class DType (line 41) | class DType(Enum):
    method code_from_dtype (line 54) | def code_from_dtype(cls, value: Type[numpy.number]) -> int:
    method dtype_from_code (line 66) | def dtype_from_code(cls, value: int) -> Type[numpy.number]:
    method size (line 78) | def size(key: Union[int, Type[numpy.number]]) -> int:
    method optimal_dtype (line 98) | def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]:
  class _IndexWriter (line 113) | class _IndexWriter(object):
    method __init__ (line 122) | def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None:
    method __enter__ (line 126) | def __enter__(self) -> "_IndexWriter":
    method __exit__ (line 141) | def __exit__(
    method write (line 161) | def write(
    method _sequence_pointers (line 206) | def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
  class _IndexReader (line 224) | class _IndexReader(object):
    method __init__ (line 233) | def __init__(self, idx_path: str, multimodal: bool) -> None:
    method __del__ (line 313) | def __del__(self) -> None:
    method __len__ (line 319) | def __len__(self) -> int:
    method __getitem__ (line 328) | def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Opt...
  class _BinReader (line 344) | class _BinReader(ABC):
    method read (line 348) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
  class _MMapBinReader (line 364) | class _MMapBinReader(_BinReader):
    method __init__ (line 371) | def __init__(self, bin_path: str) -> None:
    method read (line 375) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
    method __del__ (line 390) | def __del__(self) -> None:
  class _FileBinReader (line 397) | class _FileBinReader(_BinReader):
    method __init__ (line 404) | def __init__(self, bin_path: str) -> None:
    method read (line 407) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
  class _S3BinReader (line 427) | class _S3BinReader(_BinReader):
    method __init__ (line 436) | def __init__(self, bin_path: str, bin_chunk_nbytes: int) -> None:
    method _extract_from_cache (line 445) | def _extract_from_cache(self, offset: int, size: int) -> bytes:
    method read (line 453) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
    method __del__ (line 501) | def __del__(self) -> None:
  class IndexedDataset (line 506) | class IndexedDataset(torch.utils.data.Dataset):
    method __init__ (line 519) | def __init__(
    method initialize (line 542) | def initialize(
    method __getstate__ (line 582) | def __getstate__(self) -> Tuple[str, bool, bool, Optional[S3Config]]:
    method __setstate__ (line 590) | def __setstate__(self, state: Tuple[str, bool, bool, Optional[S3Config...
    method __del__ (line 599) | def __del__(self) -> None:
    method __len__ (line 604) | def __len__(self) -> int:
    method __getitem__ (line 612) | def __getitem__(
    method get (line 653) | def get(self, idx: int, offset: int = 0, length: Optional[int] = None)...
    method sequence_lengths (line 679) | def sequence_lengths(self) -> numpy.ndarray:
    method document_indices (line 688) | def document_indices(self) -> numpy.ndarray:
    method get_document_indices (line 696) | def get_document_indices(self) -> numpy.ndarray:
    method set_document_indices (line 706) | def set_document_indices(self, document_indices: numpy.ndarray) -> None:
    method sequence_modes (line 717) | def sequence_modes(self) -> numpy.ndarray:
    method exists (line 726) | def exists(path_prefix: str) -> bool:
  class IndexedDatasetBuilder (line 745) | class IndexedDatasetBuilder(object):
    method __init__ (line 756) | def __init__(
    method add_item (line 767) | def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None:
    method add_document (line 781) | def add_document(
    method end_document (line 800) | def end_document(self) -> None:
    method add_index (line 804) | def add_index(self, path_prefix: str) -> None:
    method finalize (line 825) | def finalize(self, idx_path: str) -> None:
  function get_idx_path (line 836) | def get_idx_path(path_prefix: str) -> str:
  function get_bin_path (line 848) | def get_bin_path(path_prefix: str) -> str:

FILE: galvatron/core/runtime/datasets/megatron/megatron_dataset.py
  class MegatronDataset (line 19) | class MegatronDataset(ABC, torch.utils.data.Dataset):
    method __init__ (line 36) | def __init__(
    method numel_low_level_dataset (line 71) | def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int:
    method build_low_level_dataset (line 88) | def build_low_level_dataset(
    method _key_config_attributes (line 109) | def _key_config_attributes() -> List[str]:
    method __len__ (line 121) | def __len__(self) -> int:
    method __getitem__ (line 130) | def __getitem__(self, idx: int) -> Dict[str, Union[torch.Tensor, numpy...

FILE: galvatron/core/runtime/datasets/megatron/megatron_tokenizer.py
  class MegatronTokenizer (line 10) | class MegatronTokenizer(ABC):
    method __init__ (line 22) | def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
    method tokenize (line 35) | def tokenize(self, text: str) -> numpy.ndarray:
    method detokenize (line 46) | def detokenize(self, ids: numpy.ndarray) -> str:
    method offsets (line 60) | def offsets(self, ids: list[int], text: str) -> list[int]:
    method vocab (line 77) | def vocab(self):
    method inv_vocab (line 83) | def inv_vocab(self):
    method vocab_size (line 89) | def vocab_size(self):
    method cls (line 94) | def cls(self):
    method sep (line 103) | def sep(self):
    method pad (line 112) | def pad(self):
    method eod (line 121) | def eod(self):
    method bos (line 130) | def bos(self):
    method eos (line 139) | def eos(self):
    method mask (line 148) | def mask(self):

FILE: galvatron/core/runtime/datasets/megatron/tokenizer.py
  function _vocab_size_with_padding (line 7) | def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True):
  function build_tokenizer (line 23) | def build_tokenizer(args: GalvatronRuntimeArgs, **kwargs):
  class _HuggingFaceTokenizer (line 34) | class _HuggingFaceTokenizer(MegatronTokenizer):
    method __init__ (line 35) | def __init__(self, pretrained_model_name_or_path, **kwargs):
    method vocab_size (line 52) | def vocab_size(self):
    method vocab (line 56) | def vocab(self):
    method inv_vocab (line 61) | def inv_vocab(self):
    method decoder (line 66) | def decoder(self):
    method tokenize (line 69) | def tokenize(self, text, **kwargs):
    method detokenize (line 72) | def detokenize(self, token_ids, **kwargs):
    method offsets (line 75) | def offsets(self, ids: list[int], text: str) -> list[int]:
    method eod (line 88) | def eod(self):

FILE: galvatron/core/runtime/datasets/megatron/utils.py
  class Split (line 15) | class Split(Enum):
  function compile_helpers (line 21) | def compile_helpers():
  function normalize (line 34) | def normalize(weights: List[float]) -> List[float]:
  function get_blend_from_list (line 49) | def get_blend_from_list(

FILE: galvatron/core/runtime/datasets/megatron/utils_s3.py
  class S3Config (line 16) | class S3Config(NamedTuple):
  class S3Client (line 34) | class S3Client(Protocol):
    method download_file (line 37) | def download_file(self, Bucket: str, Key: str, Filename: str) -> None:...
    method upload_file (line 39) | def upload_file(self, Filename: str, Bucket: str, Key: str) -> None: ...
    method head_object (line 41) | def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]: ...
    method get_object (line 43) | def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, A...
    method close (line 45) | def close(self) -> None: ...
  function is_s3_path (line 48) | def is_s3_path(path: str) -> bool:
  function parse_s3_path (line 60) | def parse_s3_path(path: str) -> Tuple[str, str]:
  function object_exists (line 80) | def object_exists(client: S3Client, path: str) -> bool:
  function _download_file (line 103) | def _download_file(client: S3Client, s3_path: str, local_path: str) -> N...
  function maybe_download_file (line 119) | def maybe_download_file(s3_path: str, local_path: str) -> None:

FILE: galvatron/core/runtime/datasets/random_dataset.py
  class RandomTokenDataset (line 11) | class RandomTokenDataset(Dataset):
    method __init__ (line 25) | def __init__(self, vocab_size: int, seq_length: int, size: int = 256):
    method __len__ (line 28) | def __len__(self) -> int:
    method __getitem__ (line 31) | def __getitem__(self, idx: int) -> torch.Tensor:
  function random_collate_fn (line 35) | def random_collate_fn(batch):

FILE: galvatron/core/runtime/hybrid_parallel_config.py
  function get_pp_ranks_enc (line 10) | def get_pp_ranks_enc(pp_divide):
  function get_hybrid_parallel_configs_api (line 18) | def get_hybrid_parallel_configs_api(args:GalvatronRuntimeArgs):
  function check_hp_config (line 186) | def check_hp_config(hp_configs, layernum_list):
  function print_hp_config (line 216) | def print_hp_config(key, val):
  function print_hp_configs (line 223) | def print_hp_configs(hp_configs):
  function hp_config_whole_model (line 229) | def hp_config_whole_model(module_types, hp_configs, vocab_sdp=0, embed_c...
  function get_enc_groups (line 317) | def get_enc_groups(groups_whole, module_types):
  function mixed_precision_dtype (line 326) | def mixed_precision_dtype(mixed_precision):
  function layer_shapes_dtypes_whole_model (line 330) | def layer_shapes_dtypes_whole_model(module_types, layernum_list, layer_s...
  function get_chunks (line 359) | def get_chunks(args):

FILE: galvatron/core/runtime/hybrid_parallel_model.py
  class GalvatronModel (line 42) | class GalvatronModel(nn.Module):
    method __init__ (line 43) | def __init__(self, hp_model: PipelineParallel):
    method forward_backward (line 51) | def forward_backward(self, batch, iter=None, profiler=None, loss_func=...
    method fake_tensor (line 81) | def fake_tensor(self, x):
    method fake_loss_func (line 84) | def fake_loss_func(self, labels, outputs):
    method loss_to_cpu (line 90) | def loss_to_cpu(self, loss):
  function construct_hybrid_parallel_model_api (line 99) | def construct_hybrid_parallel_model_api(

FILE: galvatron/core/runtime/initialize.py
  function init_empty_weights (line 15) | def init_empty_weights(include_buffers: bool = True):
  function init_on_device (line 47) | def init_on_device(device: torch.device, include_buffers: bool = True):
  function _initialize_distributed (line 114) | def _initialize_distributed(args:GalvatronRuntimeArgs):
  function initialize_galvatron (line 142) | def initialize_galvatron(args:GalvatronRuntimeArgs):
  function _compile_dependencies (line 163) | def _compile_dependencies():
  function validate_args (line 190) | def validate_args(args:GalvatronRuntimeArgs):
  function _print_args (line 240) | def _print_args(args:GalvatronRuntimeArgs, title: str = "arguments"):

FILE: galvatron/core/runtime/models/arch.py
  function arch_to_module_types (line 55) | def arch_to_module_types(arch_list: List[str]) -> List[str]:
  class ModelInfo (line 63) | class ModelInfo:
    method __init__ (line 64) | def __init__(self):
    method set_layernums (line 67) | def set_layernums(self, info):
    method set_shapes (line 70) | def set_shapes(self, info):
    method set_dtypes (line 73) | def set_dtypes(self, info):
    method set_module_types (line 76) | def set_module_types(self, info):
    method layernums (line 79) | def layernums(self):
    method shapes (line 82) | def shapes(self):
    method dtypes (line 85) | def dtypes(self):
    method module_types (line 88) | def module_types(self):
  class ArchModelInfo (line 96) | class ArchModelInfo(ModelInfo):
    method __init__ (line 99) | def __init__(self, arch_list: List[str], args:GalvatronRuntimeArgs):
  class BlockNames (line 127) | class BlockNames:

FILE: galvatron/core/runtime/models/builder.py
  function build_sequential_from_arch (line 42) | def build_sequential_from_arch(
  function build_causal_lm_arch (line 111) | def build_causal_lm_arch(args:GalvatronRuntimeArgs) -> List[str]:
  function get_block_names (line 124) | def get_block_names(args:GalvatronRuntimeArgs):
  function build_model (line 158) | def build_model(args:GalvatronRuntimeArgs):
  function get_runtime_profiler (line 190) | def get_runtime_profiler(args, path, start_iter=10, end_iter=20):

FILE: galvatron/core/runtime/models/modules.py
  class GalvatronEmbedding (line 35) | class GalvatronEmbedding(nn.Module):
    method __init__ (line 41) | def __init__(self, args: GalvatronRuntimeArgs, tp_group=None, sp_group...
    method forward (line 77) | def forward(self, input_ids, position_ids=None, attention_mask=None, l...
  class GalvatronAttention (line 103) | class GalvatronAttention(nn.Module):
    method __init__ (line 106) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, tp_group=Non...
    method _get_rotary_pos_emb (line 163) | def _get_rotary_pos_emb(self, hidden_states):
    method forward (line 178) | def forward(self, hidden_states, position_ids, attention_mask, rotary_...
  class GalvatronMLP (line 192) | class GalvatronMLP(nn.Module):
    method __init__ (line 195) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, tp_group=Non...
    method forward (line 210) | def forward(self, hidden_states):
  class GalvatronDecoderLayer (line 223) | class GalvatronDecoderLayer(nn.Module):
    method __init__ (line 226) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, tp_group=Non...
    method forward (line 232) | def forward(self, hidden_states, position_ids=None, attention_mask=Non...
  class GalvatronFinalNorm (line 242) | class GalvatronFinalNorm(nn.Module):
    method __init__ (line 245) | def __init__(self, args: GalvatronRuntimeArgs):
    method forward (line 250) | def forward(self, hidden_states, position_ids=None, attention_mask=Non...
  class _LMHeadLinear (line 258) | class _LMHeadLinear(nn.Module):
    method __init__ (line 261) | def __init__(self, config, sequence_parallel, tp_group):
    method forward (line 278) | def forward(self, hidden_states):
  class GalvatronCausalLMHead (line 290) | class GalvatronCausalLMHead(nn.Module):
    method __init__ (line 293) | def __init__(self, args: GalvatronRuntimeArgs, tp_group=None, sp_group...
    method forward (line 315) | def forward(self, hidden_states, position_ids=None, attention_mask=Non...

FILE: galvatron/core/runtime/models/moe_modules.py
  class GalvatronMoEAttention (line 19) | class GalvatronMoEAttention(nn.Module):
    method __init__ (line 20) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, tp_group=Non...
    method forward (line 26) | def forward(self, hidden_states, position_ids=None, attention_mask=Non...
  class GalvatronMoERouter (line 33) | class GalvatronMoERouter(nn.Module):
    method __init__ (line 34) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx):
    method reset_parameters (line 43) | def reset_parameters(self):
    method forward (line 50) | def forward(self, hidden_states):
  class GalvatronMoEMLP (line 56) | class GalvatronMoEMLP(nn.Module):
    method __init__ (line 57) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, ep_group=Non...
    method forward (line 121) | def forward(self, hidden_states, mlp_residual, probs, routing_map):
  class GalvatronMoEDecoderLayer (line 131) | class GalvatronMoEDecoderLayer(nn.Module):
    method __init__ (line 134) | def __init__(
    method forward (line 151) | def forward(self, hidden_states, position_ids=None, attention_mask=Non...

FILE: galvatron/core/runtime/moe/fused_a2a.py
  function get_hidden_bytes (line 18) | def get_hidden_bytes(x: torch.Tensor) -> int:
  function get_buffer (line 30) | def get_buffer(group: torch.distributed.ProcessGroup, hidden_bytes: int):
  class FusedDispatch (line 66) | class FusedDispatch(torch.autograd.Function):
    method forward (line 70) | def forward(ctx, x, token_indices, token_probs, num_experts, group, pr...
    method backward (line 119) | def backward(
  class FusedCombine (line 137) | class FusedCombine(torch.autograd.Function):
    method forward (line 141) | def forward(ctx, x, group, handle, previous_event=None):
    method backward (line 153) | def backward(ctx, grad_output, previous_event=None):
  function fused_dispatch (line 168) | def fused_dispatch(x, token_indices, token_probs, num_experts, group, pr...
  function fused_combine (line 186) | def fused_combine(x, group, handle, previous_event=None):

FILE: galvatron/core/runtime/moe/fused_kernels.py
  function moe_unpermute (line 10) | def moe_unpermute(
  class _moe_unpermute_mask_map (line 55) | class _moe_unpermute_mask_map(torch.autograd.Function):
    method forward (line 59) | def forward(
    method backward (line 105) | def backward(ctx, unpermuted_act_grad):
  function triton_unpermute_with_mask_map (line 147) | def triton_unpermute_with_mask_map(
  function _unpermute_kernel (line 199) | def _unpermute_kernel(
  function triton_unpermute_with_mask_map_bwd_with_merging_probs (line 266) | def triton_unpermute_with_mask_map_bwd_with_merging_probs(
  function _unpermute_bwd_with_merging_probs_kernel (line 317) | def _unpermute_bwd_with_merging_probs_kernel(
  function moe_permute (line 398) | def moe_permute(
  class _moe_permute_mask_map (line 440) | class _moe_permute_mask_map(torch.autograd.Function):
    method forward (line 444) | def forward(
    method backward (line 487) | def backward(
  function triton_make_row_id_map (line 514) | def triton_make_row_id_map(
  function _row_id_map_pass_1_kernel (line 545) | def _row_id_map_pass_1_kernel(
  function _row_id_map_pass_2_kernel (line 576) | def _row_id_map_pass_2_kernel(
  function triton_permute_with_mask_map (line 607) | def triton_permute_with_mask_map(
  function _permute_kernel (line 654) | def _permute_kernel(
  class _moe_chunk_sort (line 698) | class _moe_chunk_sort(torch.autograd.Function):
    method forward (line 702) | def forward(
    method backward (line 737) | def backward(
  function moe_sort_chunks_by_index (line 762) | def moe_sort_chunks_by_index(
  function _sort_chunks_by_idxs_kernel (line 796) | def _sort_chunks_by_idxs_kernel(
  function sort_chunks_by_idx (line 874) | def sort_chunks_by_idx(
  function _sort_chunks_by_map (line 924) | def _sort_chunks_by_map(
  function sort_chunks_by_map (line 962) | def sort_chunks_by_map(

FILE: galvatron/core/runtime/moe/grouped_gemm_util.py
  function grouped_gemm_is_available (line 9) | def grouped_gemm_is_available():
  function assert_grouped_gemm_is_available (line 14) | def assert_grouped_gemm_is_available():

FILE: galvatron/core/runtime/moe/mlp.py
  class GroupedMLP (line 26) | class GroupedMLP(torch.nn.Module):
    method __init__ (line 32) | def __init__(
    method forward (line 99) | def forward(self, permuted_local_hidden_states: torch.Tensor, tokens_p...
  class SequentialMLP (line 128) | class SequentialMLP(torch.nn.Module):
    method __init__ (line 134) | def __init__(
    method _pad_tensor_for_fp8 (line 164) | def _pad_tensor_for_fp8(self, hidden):
    method forward (line 176) | def forward(self, permuted_local_hidden_states: torch.Tensor, tokens_p...
  class SharedExpertMLP (line 215) | class SharedExpertMLP(MLP):
    method __init__ (line 224) | def __init__(self, config: GalvatronModelArgs, submodules: MLPSubmodul...
    method forward (line 271) | def forward(self, hidden_states):
    method pre_forward_comm (line 280) | def pre_forward_comm(self, input):
    method linear_fc1_forward_and_act (line 301) | def linear_fc1_forward_and_act(self, overlapped_comm_output=None):
    method linear_fc2_forward (line 348) | def linear_fc2_forward(self, overlapped_comm_output=None):
    method post_forward_comm (line 363) | def post_forward_comm(self):
    method get_output (line 383) | def get_output(self):
  function set_tensor_grad_fn_sequence_sr (line 403) | def set_tensor_grad_fn_sequence_sr(tensor, value):

FILE: galvatron/core/runtime/moe/moe_utils.py
  function switch_load_balancing_loss_func (line 14) | def switch_load_balancing_loss_func(
  function sequence_load_balancing_loss_func (line 62) | def sequence_load_balancing_loss_func(
  function z_loss_func (line 115) | def z_loss_func(logits, z_loss_coeff):
  function sinkhorn (line 130) | def sinkhorn(cost: torch.Tensor, tol: float = 0.0001):
  function get_capacity (line 147) | def get_capacity(num_tokens: int, num_experts: int, capacity_factor: flo...
  class MoEAuxLossAutoScaler (line 166) | class MoEAuxLossAutoScaler(torch.autograd.Function):
    method forward (line 172) | def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor):
    method backward (line 186) | def backward(ctx, grad_output: torch.Tensor):
    method set_loss_scale (line 206) | def set_loss_scale(scale: torch.Tensor):
  function permute (line 219) | def permute(
  function unpermute (line 280) | def unpermute(
  function sort_chunks_by_idxs (line 356) | def sort_chunks_by_idxs(
  function group_limited_topk (line 372) | def group_limited_topk(
  function topk_softmax_with_capacity (line 430) | def topk_softmax_with_capacity(
  function save_to_aux_losses_tracker (line 547) | def save_to_aux_losses_tracker(
  function clear_aux_losses_tracker (line 577) | def clear_aux_losses_tracker():
  function reduce_aux_losses_tracker_across_ranks (line 586) | def reduce_aux_losses_tracker_across_ranks():
  function track_moe_metrics (line 604) | def track_moe_metrics(
  function get_updated_expert_bias (line 645) | def get_updated_expert_bias(tokens_per_expert, expert_bias, expert_bias_...
  function maybe_move_tensor_to_cpu (line 665) | def maybe_move_tensor_to_cpu(tensor, as_numpy=False, record_stream=False):

FILE: galvatron/core/runtime/moe/router.py
  class Router (line 22) | class Router(ABC, torch.nn.Module):
    method __init__ (line 25) | def __init__(self, config: GalvatronModelArgs) -> None:
    method gating (line 49) | def gating(self, input: torch.Tensor):
    method routing (line 71) | def routing(self, logits: torch.Tensor):
    method forward (line 84) | def forward(self, input: torch.Tensor):
    method set_layer_idx (line 93) | def set_layer_idx(self, layer_idx: int):
  class TopKRouter (line 98) | class TopKRouter(Router):
    method __init__ (line 101) | def __init__(self, config: GalvatronModelArgs) -> None:
    method _maintain_float32_expert_bias (line 129) | def _maintain_float32_expert_bias(self):
    method sinkhorn_load_balancing (line 140) | def sinkhorn_load_balancing(self, logits: torch.Tensor):
    method compute_routing_scores_for_aux_loss (line 173) | def compute_routing_scores_for_aux_loss(self, logits: torch.Tensor) ->...
    method aux_loss_load_balancing (line 193) | def aux_loss_load_balancing(self, logits: torch.Tensor):
    method seq_aux_loss_load_balancing (line 233) | def seq_aux_loss_load_balancing(self, logits: torch.Tensor, bsz: int, ...
    method apply_load_balancing_loss (line 278) | def apply_load_balancing_loss(
    method apply_z_loss (line 316) | def apply_z_loss(self, logits):
    method apply_input_jitter (line 350) | def apply_input_jitter(self, input: torch.Tensor):
    method routing (line 371) | def routing(self, logits: torch.Tensor):
    method forward (line 423) | def forward(self, input: torch.Tensor):

FILE: galvatron/core/runtime/moe/token_dispatcher.py
  class MoETokenDispatcher (line 37) | class MoETokenDispatcher:
    method __init__ (line 42) | def __init__(
    method ep_group (line 62) | def ep_group(self):
    method tp_group (line 67) | def tp_group(self):
    method tp_rank (line 72) | def tp_rank(self):
    method tp_ep_group (line 77) | def tp_ep_group(self):
    method token_permutation (line 82) | def token_permutation(
    method token_unpermutation (line 98) | def token_unpermutation(self, expert_output: torch.Tensor, bias: torch...
    method set_shared_experts (line 110) | def set_shared_experts(self, shared_experts):
  class MoEAllGatherTokenDispatcher (line 116) | class MoEAllGatherTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 122) | def __init__(
    method token_permutation (line 150) | def token_permutation(
    method token_unpermutation (line 216) | def token_unpermutation(self, hidden_states: torch.Tensor, bias: torch...
  class MoEAlltoAllTokenDispatcher (line 287) | class MoEAlltoAllTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 297) | def __init__(
    method preprocess (line 379) | def preprocess(self, routing_map: torch.Tensor) -> torch.Tensor:
    method token_permutation (line 510) | def token_permutation(
    method token_unpermutation (line 606) | def token_unpermutation(
    method _maybe_update_cuda_sync_point (line 691) | def _maybe_update_cuda_sync_point(self, point: str):
    method _maybe_dtoh_and_synchronize (line 702) | def _maybe_dtoh_and_synchronize(
  class _DispatchManager (line 743) | class _DispatchManager(ABC):
    method setup_metadata (line 756) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor):
    method dispatch (line 761) | def dispatch(self, hidden_states: torch.Tensor) -> torch.Tensor:
    method combine (line 766) | def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
    method get_dispached_metadata (line 771) | def get_dispached_metadata(self) -> torch.Tensor:
    method get_permuted_hidden_states_by_experts (line 776) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T...
    method get_restored_hidden_states_by_experts (line 781) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T...
  class _DeepepManager (line 786) | class _DeepepManager(_DispatchManager):
    method __init__ (line 808) | def __init__(
    method setup_metadata (line 838) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor):
    method dispatch (line 850) | def dispatch(self, hidden_states: torch.Tensor) -> torch.Tensor:
    method _indices_to_multihot (line 868) | def _indices_to_multihot(self, indices, probs):
    method get_dispached_metadata (line 899) | def get_dispached_metadata(self) -> torch.Tensor:
    method get_number_of_tokens_per_expert (line 902) | def get_number_of_tokens_per_expert(self) -> torch.Tensor:
    method combine (line 908) | def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
    method get_permuted_hidden_states_by_experts (line 914) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T...
    method get_restored_hidden_states_by_experts (line 927) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T...
  class MoEFlexTokenDispatcher (line 942) | class MoEFlexTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 947) | def __init__(
    method set_shared_experts (line 980) | def set_shared_experts(self, shared_experts):
    method _initialize_metadata (line 985) | def _initialize_metadata(self, routing_map: torch.Tensor, probs: torch...
    method token_permutation (line 1012) | def token_permutation(
    method token_unpermutation (line 1030) | def token_unpermutation(

FILE: galvatron/core/runtime/optimizer/clip_grads.py
  function local_multi_tensor_applier (line 11) | def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args):
  function local_multi_tensor_l2_norm (line 18) | def local_multi_tensor_l2_norm(chunk_size, noop_flag, tensor_lists, per_...
  function local_multi_tensor_scale (line 30) | def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale):
  function get_grad_norm_fp32 (line 66) | def get_grad_norm_fp32(
  function clip_grad_by_total_norm_fp32 (line 154) | def clip_grad_by_total_norm_fp32(

FILE: galvatron/core/runtime/optimizer/num_microbatches_calculator.py
  function get_num_microbatches (line 17) | def get_num_microbatches() -> int:
  function get_current_global_batch_size (line 22) | def get_current_global_batch_size() -> int:
  function get_micro_batch_size (line 27) | def get_micro_batch_size() -> int:
  function get_current_running_global_batch_size (line 32) | def get_current_running_global_batch_size() -> int:
  function update_num_microbatches (line 38) | def update_num_microbatches(
  function unset_num_microbatches_calculator (line 54) | def unset_num_microbatches_calculator():
  function init_num_microbatches_calculator (line 64) | def init_num_microbatches_calculator(
  function destroy_num_microbatches_calculator (line 101) | def destroy_num_microbatches_calculator():
  function reconfigure_num_microbatches_calculator (line 107) | def reconfigure_num_microbatches_calculator(
  function _configure_global_num_microbatches_calculator (line 144) | def _configure_global_num_microbatches_calculator(
  function _build_num_microbatches_calculator (line 191) | def _build_num_microbatches_calculator(
  function _round (line 261) | def _round(batch_size: int, divisor: int) -> int:
  class NumMicroBatchesCalculator (line 266) | class NumMicroBatchesCalculator(ABC):
    method __init__ (line 269) | def __init__(self) -> None:
    method get (line 275) | def get(self) -> int:
    method get_current_global_batch_size (line 279) | def get_current_global_batch_size(self) -> int:
    method get_micro_batch_size (line 283) | def get_micro_batch_size(self) -> int:
    method get_current_running_global_batch_size (line 287) | def get_current_running_global_batch_size(self) -> int:
    method update (line 293) | def update(self, consumed_samples, consistency_check, verbose=False) -...
  class ConstantNumMicroBatchesCalculator (line 298) | class ConstantNumMicroBatchesCalculator(NumMicroBatchesCalculator):
    method __init__ (line 315) | def __init__(
    method update (line 356) | def update(self, consumed_samples, consistency_check, verbose=False) -...
  class RampupBatchsizeNumMicroBatchesCalculator (line 360) | class RampupBatchsizeNumMicroBatchesCalculator(NumMicroBatchesCalculator):
    method __init__ (line 387) | def __init__(
    method update (line 441) | def update(self, consumed_samples: int, consistency_check: bool, verbo...

FILE: galvatron/core/runtime/optimizer/param_scheduler.py
  function update_train_iters (line 11) | def update_train_iters(args):
  function get_optimizer_param_scheduler (line 45) | def get_optimizer_param_scheduler(optimizer):
  class OptimizerParamScheduler (line 102) | class OptimizerParamScheduler:
    method __init__ (line 127) | def __init__(
    method get_wd (line 186) | def get_wd(self) -> float:
    method get_lr (line 209) | def get_lr(self, param_group: dict) -> float:
    method step (line 270) | def step(self, increment: int) -> None:
    method state_dict (line 283) | def state_dict(self) -> dict:
    method _check_and_set (line 299) | def _check_and_set(self, cls_value: float, sd_value: float, name: str)...
    method load_state_dict (line 322) | def load_state_dict(self, state_dict: dict) -> None:

FILE: galvatron/core/runtime/optimizer/utils.py
  function clip_grad_norm (line 14) | def clip_grad_norm(model, max_norm, norm_type=2):
  function get_optimizer_and_param_scheduler (line 43) | def get_optimizer_and_param_scheduler(model, args):

FILE: galvatron/core/runtime/parallel.py
  function _get_modules_to_materialize (line 19) | def _get_modules_to_materialize(root_module: nn.Module) -> List[nn.Module]:
  function wrap_data_parallel (line 41) | def wrap_data_parallel(
  function param_init_fn (line 87) | def param_init_fn(all_block_name, load, distributed_checkpoint, tp_group...
  function wrap_module_fsdp_manually (line 100) | def wrap_module_fsdp_manually(
  function apply_fsdp (line 192) | def apply_fsdp(model, fsdp_args, wrap_block_name, need_ignore=False):
  function apply_ckpt (line 213) | def apply_ckpt(model, checkpoint_wrapper_fn, wrap_block_name):
  function wrap_modules_checkpoint (line 226) | def wrap_modules_checkpoint(module_list, checkpoint_flags, wrap_block_na...
  function wrap_model_checkpoint (line 240) | def wrap_model_checkpoint(model, wrap_block_names=[]):
  function relocate_activations (line 246) | def relocate_activations(input, allgather_cp_group, allgather_tp_sp_cp_g...
  class Module_with_relocation (line 272) | class Module_with_relocation(nn.Module):
    method __init__ (line 273) | def __init__(self, module, allgather_cp_group, allgather_tp_sp_cp_group,
    method forward (line 292) | def forward(self, *inputs, **kwargs):
  function wrap_modules_data_parallel (line 307) | def wrap_modules_data_parallel(
  function modules_to_devices (line 390) | def modules_to_devices(module_list, pp_devices):
  function wrap_modules_relocation (line 396) | def wrap_modules_relocation(module_list, allgather_cp_groups, allgather_...

FILE: galvatron/core/runtime/parallel_state.py
  function _ensure_var_is_initialized (line 12) | def _ensure_var_is_initialized(var, name):
  function _ensure_var_is_not_initialized (line 17) | def _ensure_var_is_not_initialized(var, name):
  function get_parallel_world_size (line 23) | def get_parallel_world_size(group:torch.distributed.ProcessGroup):
  function get_parallel_rank (line 27) | def get_parallel_rank(group:torch.distributed.ProcessGroup):
  function set_global_memory_buffer (line 34) | def set_global_memory_buffer():
  function get_global_memory_buffer (line 41) | def get_global_memory_buffer():
  function destroy_global_memory_buffer (line 47) | def destroy_global_memory_buffer():
  function set_args (line 56) | def set_args(args:GalvatronRuntimeArgs):
  function get_args (line 62) | def get_args():
  function _build_tokenizer (line 71) | def _build_tokenizer(args:GalvatronRuntimeArgs):
  function get_tokenizer (line 79) | def get_tokenizer():
  function _set_tensorboard_writer (line 88) | def _set_tensorboard_writer(args:GalvatronRuntimeArgs):
  function _set_wandb_writer (line 110) | def _set_wandb_writer(args:GalvatronRuntimeArgs):
  function set_global_variables (line 135) | def set_global_variables(args:GalvatronRuntimeArgs):
  function set_pp_comm_group (line 146) | def set_pp_comm_group(comm_group:CommGroup):
  function get_pp_comm_group (line 152) | def get_pp_comm_group():
  function get_pp_world_size (line 158) | def get_pp_world_size():
  function get_pp_rank (line 164) | def get_pp_rank():
  function is_pipeline_first_stage (line 170) | def is_pipeline_first_stage():
  function is_pipeline_last_stage (line 174) | def is_pipeline_last_stage():
  function get_virtual_pipeline_model_parallel_rank (line 179) | def get_virtual_pipeline_model_parallel_rank():
  function set_vocab_tp_sp_comm_group (line 190) | def set_vocab_tp_sp_comm_group(comm_group:CommGroup):
  function set_vocab_cp_comm_group (line 196) | def set_vocab_cp_comm_group(comm_group:CommGroup):
  function set_vocab_dp_comm_group (line 202) | def set_vocab_dp_comm_group(comm_group:CommGroup):
  function set_vocab_tp_sp_src_rank (line 208) | def set_vocab_tp_sp_src_rank(rank:int):
  function get_vocab_tp_sp_comm_group (line 214) | def get_vocab_tp_sp_comm_group():
  function get_vocab_cp_comm_group (line 220) | def get_vocab_cp_comm_group():
  function get_vocab_dp_comm_group (line 226) | def get_vocab_dp_comm_group():
  function get_vocab_tp_sp_src_rank (line 232) | def get_vocab_tp_sp_src_rank():
  function get_vocab_tp_sp_world_size (line 238) | def get_vocab_tp_sp_world_size():
  function get_vocab_tp_sp_rank (line 244) | def get_vocab_tp_sp_rank():
  function get_vocab_dp_world_size (line 250) | def get_vocab_dp_world_size():
  function get_vocab_dp_rank (line 256) | def get_vocab_dp_rank():
  function get_vocab_cp_world_size (line 262) | def get_vocab_cp_world_size():
  function get_vocab_cp_rank (line 268) | def get_vocab_cp_rank():
  function _set_vocab_tp_sp_cp_group (line 274) | def _set_vocab_tp_sp_cp_group():
  function get_vocab_tp_sp_cp_group (line 288) | def get_vocab_tp_sp_cp_group():
  function get_vocab_tp_sp_cp_world_size (line 294) | def get_vocab_tp_sp_cp_world_size():
  function get_vocab_tp_sp_cp_rank (line 301) | def get_vocab_tp_sp_cp_rank():
  function set_tp_whole_comm_group (line 315) | def set_tp_whole_comm_group(whole_comm_group:List[CommGroup]):
  function set_sp_whole_comm_group (line 321) | def set_sp_whole_comm_group(whole_comm_group:List[CommGroup]):
  function set_dp_whole_comm_group (line 327) | def set_dp_whole_comm_group(whole_comm_group:List[CommGroup]):
  function set_cp_whole_comm_group (line 333) | def set_cp_whole_comm_group(whole_comm_group:List[CommGroup]):
  function set_sdp_whole_comm_group (line 339) | def set_sdp_whole_comm_group(whole_comm_group:List[CommGroup]):
  function get_tp_whole_comm_group (line 345) | def get_tp_whole_comm_group():
  function get_sp_whole_comm_group (line 351) | def get_sp_whole_comm_group():
  function get_dp_whole_comm_group (line 357) | def get_dp_whole_comm_group():
  function get_cp_whole_comm_group (line 363) | def get_cp_whole_comm_group():
  function get_sdp_whole_comm_group (line 369) | def get_sdp_whole_comm_group():
  function get_moe_layer_wise_logging_tracker (line 378) | def get_moe_layer_wise_logging_tracker():

FILE: galvatron/core/runtime/pipeline/grad_reduce.py
  function _send_backward_hook (line 36) | def _send_backward_hook(
  function fsdp_reduce_gradients (line 48) | def fsdp_reduce_gradients(model):
  function _allreduce_word_embedding_no_pipeline (line 69) | def _allreduce_word_embedding_no_pipeline(wte_model, wte_attr_name, lmhe...
  function _allreduce_word_embedding (line 87) | def _allreduce_word_embedding(module, tied_wte_attr_name, group):
  function _allreduce_word_embedding_grads_no_pipeline (line 99) | def _allreduce_word_embedding_grads_no_pipeline(wte_model, wte_attr_name...
  function _allreduce_word_embedding_grads (line 117) | def _allreduce_word_embedding_grads(module, tied_wte_attr_name, group):
  function enter_no_sync_context (line 128) | def enter_no_sync_context(model):
  function exit_no_sync_context (line 141) | def exit_no_sync_context(model):
  function _register_post_backward_hook_bf16 (line 152) | def _register_post_backward_hook_bf16(
  function _finalize_params_bf16 (line 199) | def _finalize_params_bf16(

FILE: galvatron/core/runtime/pipeline/pipeline.py
  function forward_step_function (line 32) | def forward_step_function(loss_func, **kwargs):
  class PipelineParallel (line 43) | class PipelineParallel(nn.Module):
    method __init__ (line 44) | def __init__(
    method check_tensor_dtype (line 155) | def check_tensor_dtype(self, layer_output_tensor_shapes, layer_output_...
    method get_default_tensor_dtype (line 161) | def get_default_tensor_dtype(self, layer_output_tensor_shapes):
    method wrap_pipeline_modules_data_parallel (line 170) | def wrap_pipeline_modules_data_parallel(
    method wrap_pipeline_modules_checkpoint (line 227) | def wrap_pipeline_modules_checkpoint(self, checkpoint_flags, wrap_bloc...
    method sync_embedding (line 237) | def sync_embedding(self):
    method gen_sp_layernorm_info (line 255) | def gen_sp_layernorm_info(self, layer_module_types, layer_tp_groups, l...
    method set_last_batch (line 269) | def set_last_batch(self, state):
    method update_tensor_shape (line 276) | def update_tensor_shape(self, microbatches, dp_size_input, dp_size, tp...
    method no_pipeline_forward_backward (line 307) | def no_pipeline_forward_backward(
    method pipedream_flush_forward_backward (line 387) | def pipedream_flush_forward_backward(
    method gpipe_forward_backward (line 715) | def gpipe_forward_backward(
    method gpipe_forward (line 730) | def gpipe_forward(
    method gpipe_backward (line 837) | def gpipe_backward(self):
    method to_list (line 897) | def to_list(self, tensor):
    method forward_step (line 907) | def forward_step(self, forward_step_func, batch, model, input_tensor, ...
    method check_finish_backward (line 939) | def check_finish_backward(self, require_grad_param_num):
    method backward_step (line 943) | def backward_step(self, input_tensor, output_tensor, output_tensor_grad):
    method finalize_wte_grads_func (line 1043) | def finalize_wte_grads_func(self):
    method get_pipeline_model_parallel_first_rank (line 1063) | def get_pipeline_model_parallel_first_rank(self):
    method get_pipeline_model_parallel_last_rank (line 1066) | def get_pipeline_model_parallel_last_rank(self):
    method get_pipeline_model_parallel_next_rank (line 1070) | def get_pipeline_model_parallel_next_rank(self):
    method get_pipeline_model_parallel_prev_rank (line 1075) | def get_pipeline_model_parallel_prev_rank(self):
    method is_pipeline_first_stage (line 1080) | def is_pipeline_first_stage(self):
    method is_pipeline_last_stage (line 1084) | def is_pipeline_last_stage(self):
    method _run_p2pops (line 1092) | def _run_p2pops(
    method _communicate (line 1141) | def _communicate(
    method recv_forward (line 1271) | def recv_forward(
    method recv_backward (line 1292) | def recv_backward(
    method send_forward (line 1311) | def send_forward(
    method send_backward (line 1332) | def send_backward(
    method send_forward_recv_backward (line 1351) | def send_forward_recv_backward(
    method send_backward_recv_forward (line 1371) | def send_backward_recv_forward(
    method send_forward_recv_forward (line 1391) | def send_forward_recv_forward(
    method send_backward_recv_backward (line 1410) | def send_backward_recv_backward(
    method send_forward_backward_recv_forward_backward (line 1429) | def send_forward_backward_recv_forward_backward(
    method recv_forward_multi (line 1454) | def recv_forward_multi(
    method recv_backward_multi (line 1473) | def recv_backward_multi(
    method send_forward_multi (line 1491) | def send_forward_multi(
    method send_backward_multi (line 1512) | def send_backward_multi(
    method send_forward_recv_backward_multi (line 1534) | def send_forward_recv_backward_multi(
    method send_backward_recv_forward_multi (line 1563) | def send_backward_recv_forward_multi(
  class PipeSequential (line 1593) | class PipeSequential(nn.Sequential):
    method forward (line 1598) | def forward(self, *inputs, **kwargs):

FILE: galvatron/core/runtime/pipeline/sp_grad_reduce.py
  function _post_backward_hook_sp (line 48) | def _post_backward_hook_sp(

FILE: galvatron/core/runtime/pipeline/utils.py
  function listify_model (line 6) | def listify_model(model: Union[torch.nn.Module, List[torch.nn.Module]]) ...
  function chunk_batch (line 12) | def chunk_batch(inputs, chunks):
  function chunk_dict (line 45) | def chunk_dict(kwargs, chunks):

FILE: galvatron/core/runtime/redistribute.py
  function _zigzag_transformation (line 5) | def _zigzag_transformation(input_, cp_world_size):
  function _reverse_zigzag_transformation (line 26) | def _reverse_zigzag_transformation(input_, cp_world_size):
  function _split_along_first_dim_with_sequence_parallel (line 43) | def _split_along_first_dim_with_sequence_parallel(input_, split_cp_group...
  function _gather_along_first_dim_with_sequence_parallel (line 85) | def _gather_along_first_dim_with_sequence_parallel(input_, allgather_cp_...
  function _split_along_first_dim (line 129) | def _split_along_first_dim(input_, split_tp_sp_cp_group):
  function _gather_along_first_dim (line 150) | def _gather_along_first_dim(input_, allgather_tp_sp_cp_group):
  class _Split (line 166) | class _Split(torch.autograd.Function):
    method forward (line 174) | def forward(ctx, input_, split_cp_group, split_tp_sp_cp_group, is_input):
    method backward (line 184) | def backward(ctx, grad_output):
  class _Gather (line 191) | class _Gather(torch.autograd.Function):
    method forward (line 199) | def forward(ctx, input_, allgather_cp_group, allgather_tp_sp_cp_group,...
    method backward (line 209) | def backward(ctx, grad_output):
  function split_to_group (line 216) | def split_to_group(input_, split_cp_group, split_tp_sp_cp_group, is_input):
  function gather_from_group (line 220) | def gather_from_group(input_, allgather_cp_group, allgather_tp_sp_cp_gro...
  function _fused_split_allgather_along_first_dim (line 223) | def _fused_split_allgather_along_first_dim(
  function _fused_split_allgather_along_first_dim_with_sequence_parallel (line 261) | def _fused_split_allgather_along_first_dim_with_sequence_parallel(
  class _Fused_split_allgather (line 345) | class _Fused_split_allgather(torch.autograd.Function):
    method forward (line 348) | def forward(ctx, input_, is_input, allgather_cp_group, allgather_tp_sp...
    method backward (line 372) | def backward(ctx, grad_output):
  function fused_split_allgather (line 408) | def fused_split_allgather(input_, is_input, allgather_cp_group, allgathe...

FILE: galvatron/core/runtime/tensor_parallel/layers.py
  function set_tensor_model_parallel_attributes (line 48) | def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
  class VocabParallelEmbedding (line 59) | class VocabParallelEmbedding(torch.nn.Module):
    method __init__ (line 78) | def __init__(
    method forward (line 120) | def forward(self, input_):
  class LinearWithFrozenWeight (line 150) | class LinearWithFrozenWeight(torch.autograd.Function):
    method forward (line 161) | def forward(ctx, input, weight, bias, allreduce_dgrad, tp_group):
    method backward (line 173) | def backward(ctx, grad_output):
  function linear_with_frozen_weight (line 186) | def linear_with_frozen_weight(
  class LinearWithGradAccumulationAndAsyncCommunication (line 262) | class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Fun...
    method forward (line 267) | def forward(
    method backward (line 307) | def backward(ctx, grad_output):
  function linear_with_grad_accumulation_and_async_allreduce (line 430) | def linear_with_grad_accumulation_and_async_allreduce(
  class ColumnParallelLinear (line 547) | class ColumnParallelLinear(torch.nn.Module):
    method __init__ (line 596) | def __init__(
    method forward (line 708) | def forward(
    method __repr__ (line 810) | def __repr__(self):
  class RowParallelLinear (line 819) | class RowParallelLinear(torch.nn.Module):
    method __init__ (line 855) | def __init__(
    method forward (line 925) | def forward(self, input_):
    method __repr__ (line 982) | def __repr__(self):

FILE: galvatron/core/runtime/tensor_parallel/mappings.py
  function _reduce (line 18) | def _reduce(input_, group):
  function split_tensor_along_last_dim (line 31) | def split_tensor_along_last_dim(
  function _split_along_last_dim (line 57) | def _split_along_last_dim(input_, group):
  function _split_along_first_dim (line 76) | def _split_along_first_dim(input_, group):
  function _gather_along_last_dim (line 99) | def _gather_along_last_dim(input_, group):
  function _reduce_scatter_along_last_dim (line 120) | def _reduce_scatter_along_last_dim(input_, group):
  function _gather_along_first_dim (line 134) | def _gather_along_first_dim(input_, group, output_split_sizes=None, use_...
  function _reduce_scatter_along_first_dim (line 174) | def _reduce_scatter_along_first_dim(
  class _CopyToModelParallelRegion (line 217) | class _CopyToModelParallelRegion(torch.autograd.Function):
    method symbolic (line 221) | def symbolic(graph, input_, group):
    method forward (line 226) | def forward(ctx, input_, group):
    method backward (line 232) | def backward(ctx, grad_output):
  class _ReduceFromModelParallelRegion (line 237) | class _ReduceFromModelParallelRegion(torch.autograd.Function):
    method symbolic (line 241) | def symbolic(graph, input_, group):
    method forward (line 246) | def forward(ctx, input_, group):
    method backward (line 251) | def backward(ctx, grad_output):
  class _ScatterToModelParallelRegion (line 256) | class _ScatterToModelParallelRegion(torch.autograd.Function):
    method symbolic (line 260) | def symbolic(graph, input_, group):
    method forward (line 265) | def forward(ctx, input_, group):
    method backward (line 271) | def backward(ctx, grad_output):
  class _GatherFromModelParallelRegion (line 276) | class _GatherFromModelParallelRegion(torch.autograd.Function):
    method symbolic (line 280) | def symbolic(graph, input_, group=None):
    method forward (line 285) | def forward(ctx, input_, group=None):
    method backward (line 291) | def backward(ctx, grad_output):
  class _ScatterToSequenceParallelRegion (line 296) | class _ScatterToSequenceParallelRegion(torch.autograd.Function):
    method symbolic (line 300) | def symbolic(graph, input_, group):
    method forward (line 305) | def forward(ctx, input_, group):
    method backward (line 311) | def backward(ctx, grad_output):
  class _GatherFromSequenceParallelRegion (line 316) | class _GatherFromSequenceParallelRegion(torch.autograd.Function):
    method symbolic (line 320) | def symbolic(
    method forward (line 332) | def forward(
    method backward (line 348) | def backward(ctx, grad_output):
  class _ReduceScatterToSequenceParallelRegion (line 371) | class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
    method symbolic (line 375) | def symbolic(graph, input_, group, input_split_sizes=None, use_global_...
    method forward (line 380) | def forward(ctx, input_, group, input_split_sizes=None, use_global_buf...
    method backward (line 388) | def backward(ctx, grad_output):
  class _AllGatherFromTensorParallelRegion (line 400) | class _AllGatherFromTensorParallelRegion(torch.autograd.Function):
    method symbolic (line 404) | def symbolic(graph, input_, group):
    method forward (line 409) | def forward(ctx, input_, group):
    method backward (line 415) | def backward(ctx, grad_output):
  class _ReduceScatterToTensorParallelRegion (line 420) | class _ReduceScatterToTensorParallelRegion(torch.autograd.Function):
    method symbolic (line 424) | def symbolic(graph, input_, group):
    method forward (line 429) | def forward(ctx, input_, group):
    method backward (line 435) | def backward(ctx, grad_output):
  class _AllToAll (line 440) | class _AllToAll(torch.autograd.Function):
    method forward (line 442) | def forward(ctx, group, input, output_split_sizes, input_split_sizes):
    method backward (line 474) | def backward(ctx, *grad_output):
  function copy_to_tensor_model_parallel_region (line 489) | def copy_to_tensor_model_parallel_region(input_, group):
  function reduce_from_tensor_model_parallel_region (line 494) | def reduce_from_tensor_model_parallel_region(input_, group):
  function scatter_to_tensor_model_parallel_region (line 499) | def scatter_to_tensor_model_parallel_region(input_, group):
  function gather_from_tensor_model_parallel_region (line 504) | def gather_from_tensor_model_parallel_region(input_, group):
  function scatter_to_sequence_parallel_region (line 509) | def scatter_to_sequence_parallel_region(input_, group):
  function gather_from_sequence_parallel_region (line 514) | def gather_from_sequence_parallel_region(
  function reduce_scatter_to_sequence_parallel_region (line 527) | def reduce_scatter_to_sequence_parallel_region(
  function all_gather_last_dim_from_tensor_parallel_region (line 536) | def all_gather_last_dim_from_tensor_parallel_region(input_, group):
  function reduce_scatter_last_dim_to_tensor_parallel_region (line 541) | def reduce_scatter_last_dim_to_tensor_parallel_region(input_, group):
  function all_to_all (line 546) | def all_to_all(group, input_, output_split_sizes_=None, input_split_size...

FILE: galvatron/core/runtime/tensor_parallel/random.py
  function _get_cuda_rng_state (line 23) | def _get_cuda_rng_state(
  function _set_cuda_rng_state (line 54) | def _set_cuda_rng_state(new_state: torch.Tensor, device: int = -1, graph...
  function get_expert_parallel_rng_tracker_name (line 96) | def get_expert_parallel_rng_tracker_name(group=None):
  function get_tensor_parallel_rng_tracker_name (line 104) | def get_tensor_parallel_rng_tracker_name(group=None):
  function get_data_parallel_rng_tracker_name (line 114) | def get_data_parallel_rng_tracker_name():
  class CudaRNGStatesTracker (line 120) | class CudaRNGStatesTracker:
    method __init__ (line 129) | def __init__(self, use_cudagraphable_rng=False, is_inference_rng_track...
    method is_initialized (line 142) | def is_initialized(self):
    method reset (line 146) | def reset(self):
    method get_states (line 158) | def get_states(self):
    method set_states (line 166) | def set_states(self, states):
    method check (line 172) | def check(self, name):
    method add (line 177) | def add(self, name, seed):
    method fork (line 203) | def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
  function initialize_rng_tracker (line 233) | def initialize_rng_tracker(
  function set_seed_with_group (line 279) | def set_seed_with_group(
  function get_cuda_rng_tracker (line 319) | def get_cuda_rng_tracker(

FILE: galvatron/core/runtime/tensor_parallel/reset.py
  function colummn_row_reset_parameters (line 11) | def colummn_row_reset_parameters(self):
  function router_reset_parameters (line 25) | def router_reset_parameters(self):
  function init_reset_parameter (line 31) | def init_reset_parameter():

FILE: galvatron/core/runtime/tensor_parallel/triton_cross_entropy.py
  function _tiled_max_kernel (line 22) | def _tiled_max_kernel(
  function _tiled_cross_entropy_forward_kernel (line 58) | def _tiled_cross_entropy_forward_kernel(
  function _tiled_cross_entropy_backward_kernel (line 103) | def _tiled_cross_entropy_backward_kernel(
  function tiled_max_reduction (line 150) | def tiled_max_reduction(
  function tiled_cross_entropy_forward (line 167) | def tiled_cross_entropy_forward(
  function tiled_cross_entropy_backward (line 191) | def tiled_cross_entropy_backward(
  class _VocabParallelCrossEntropyTritonFused (line 219) | class _VocabParallelCrossEntropyTritonFused(torch.autograd.Function):
    method forward (line 221) | def forward(ctx, vocab_parallel_logits, target, tp_group):
    method backward (line 245) | def backward(ctx, grad_output):
  function triton_fused_vocab_parallel_cross_entropy (line 256) | def triton_fused_vocab_parallel_cross_entropy(

FILE: galvatron/core/runtime/tensor_parallel/utils.py
  function init_method_normal (line 9) | def init_method_normal(sigma):
  function scaled_init_method_normal (line 18) | def scaled_init_method_normal(sigma, num_layers):
  function ensure_divisibility (line 27) | def ensure_divisibility(numerator, denominator):
  function divide (line 32) | def divide(numerator, denominator):
  class VocabUtility (line 39) | class VocabUtility:
    method vocab_range_from_per_partition_vocab_size (line 47) | def vocab_range_from_per_partition_vocab_size(
    method vocab_range_from_global_vocab_size (line 56) | def vocab_range_from_global_vocab_size(
  function prepare_input_tensors_for_wgrad_compute (line 66) | def prepare_input_tensors_for_wgrad_compute(grad_output, all_gathered_in...

FILE: galvatron/core/runtime/transformer/attention.py
  class SelfAttentionSubmodules (line 56) | class SelfAttentionSubmodules:
  class CrossAttentionSubmodules (line 72) | class CrossAttentionSubmodules:
  class PackedSeqParams (line 86) | class PackedSeqParams:
  class AttnMaskType (line 101) | class AttnMaskType(enum.Enum):
  class Attention (line 111) | class Attention(torch.nn.Module, ABC):
    method __init__ (line 118) | def __init__(
    method _allocate_memory (line 241) | def _allocate_memory(self, inference_max_sequence_length, batch_size, ...
    method _adjust_key_value_for_inference (line 253) | def _adjust_key_value_for_inference(
    method get_query_key_value_tensors (line 392) | def get_query_key_value_tensors(self, hidden_states, key_value_states):
    method flash_decode (line 398) | def flash_decode(
    method flash_decode_and_prefill (line 443) | def flash_decode_and_prefill(
    method forward (line 515) | def forward(
  class SelfAttention (line 736) | class SelfAttention(Attention):
    method __init__ (line 743) | def __init__(
    method run_realtime_tests (line 805) | def run_realtime_tests(self):
    method get_query_key_value_tensors (line 876) | def get_query_key_value_tensors(self, hidden_states, key_value_states=...
  class CrossAttention (line 929) | class CrossAttention(Attention):
    method __init__ (line 936) | def __init__(
    method get_query_key_value_tensors (line 989) | def get_query_key_value_tensors(self, hidden_states, key_value_states):

FILE: galvatron/core/runtime/transformer/attention_impl.py
  class FlashSelfOrCrossAttention (line 29) | class FlashSelfOrCrossAttention(torch.nn.Module):
    method __init__ (line 40) | def __init__(self, causal=False, softmax_scale=None, attention_dropout...
    method forward (line 55) | def forward(self, q, k, v):
  function post_all2all (line 115) | def post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, seq_len...
  function single_all_to_all (line 139) | def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, gro...
  class _SeqAllToAll (line 201) | class _SeqAllToAll(torch.autograd.Function):
    method forward (line 204) | def forward(
    method backward (line 253) | def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, No...
  class DistributedAttention (line 278) | class DistributedAttention(torch.nn.Module):
    method __init__ (line 288) | def __init__(
    method layer_sync (line 312) | def layer_sync(self, layer):
    method forward (line 316) | def forward(self, query: Tensor, key: Tensor, value: Tensor, batch_dim...
  function _get_default_args (line 420) | def _get_default_args(func):
  function get_default_args (line 429) | def get_default_args(func):
  function _update_out_and_lse (line 438) | def _update_out_and_lse(
  function update_out_and_lse (line 458) | def update_out_and_lse(
  class RingComm (line 481) | class RingComm:
    method __init__ (line 482) | def __init__(self, process_group: dist.ProcessGroup, batch_comm = True):
    method send_recv (line 500) | def send_recv(
    method commit (line 525) | def commit(self):
    method wait (line 533) | def wait(self):
    method send_recv_kv (line 547) | def send_recv_kv(
  function zigzag_ring_flash_attn_forward (line 564) | def zigzag_ring_flash_attn_forward(
  function zigzag_ring_flash_attn_backward (line 652) | def zigzag_ring_flash_attn_backward(
  class ZigZagRingFlashAttnFunc (line 783) | class ZigZagRingFlashAttnFunc(torch.autograd.Function):
    method forward (line 785) | def forward(
    method backward (line 832) | def backward(ctx, dout, *args):
  function zigzag_ring_flash_attn_func (line 855) | def zigzag_ring_flash_attn_func(
  class ZigzagRingFlashAttention (line 885) | class ZigzagRingFlashAttention(torch.nn.Module):
    method __init__ (line 886) | def __init__(self, attention_dropout, cp_group, cp_ranks, softmax_scal...
    method forward (line 894) | def forward(self, q, k, v):

FILE: galvatron/core/runtime/transformer/fused_kernels.py
  function geglu (line 20) | def geglu(y):
  function bias_geglu (line 26) | def bias_geglu(bias, y):
  function geglu_back (line 35) | def geglu_back(g, y):
  function bias_geglu_back (line 46) | def bias_geglu_back(g, y, bias):
  class BiasGeGLUFunction (line 51) | class BiasGeGLUFunction(torch.autograd.Function):
    method forward (line 54) | def forward(ctx, input, bias):
    method backward (line 59) | def backward(ctx, grad_output):
  class GeGLUFunction (line 65) | class GeGLUFunction(torch.autograd.Function):
    method forward (line 68) | def forward(ctx, input):
    method backward (line 73) | def backward(ctx, grad_output):
  function bias_geglu_impl (line 79) | def bias_geglu_impl(input, bias):
  function bias_gelu (line 101) | def bias_gelu(bias, y):
  function bias_gelu_back (line 110) | def bias_gelu_back(g, bias, y):
  class GeLUFunction (line 120) | class GeLUFunction(torch.autograd.Function):
    method forward (line 123) | def forward(ctx, input, bias):
    method backward (line 128) | def backward(ctx, grad_output):
    method apply (line 135) | def apply(cls, *args, **kwargs):
  function swiglu (line 143) | def swiglu(y):
  function bias_swiglu (line 149) | def bias_swiglu(y, bias):
  function swiglu_back (line 158) | def swiglu_back(g, y):
  function bias_swiglu_back (line 166) | def bias_swiglu_back(g, y, bias):
  class BiasSwiGLUFunction (line 171) | class BiasSwiGLUFunction(torch.autograd.Function):
    method forward (line 174) | def forward(ctx, input, bias, fp8_input_store):
    method backward (line 182) | def backward(ctx, grad_output):
  class SwiGLUFunction (line 189) | class SwiGLUFunction(torch.autograd.Function):
    method forward (line 192) | def forward(ctx, input, fp8_input_store):
    method backward (line 200) | def backward(ctx, grad_output):
  function bias_swiglu_impl (line 207) | def bias_swiglu_impl(input, bias, fp8_input_store=False):
  function fused_apply_rotary_pos_emb (line 227) | def fused_apply_rotary_pos_emb(
  function fused_apply_rotary_pos_emb_thd (line 237) | def fused_apply_rotary_pos_emb_thd(
  class VocabParallelCrossEntropy (line 259) | class VocabParallelCrossEntropy:
    method calculate_logits_max (line 266) | def calculate_logits_max(
    method calculate_predicted_logits (line 280) | def calculate_predicted_logits(
    method calculate_cross_entropy_loss (line 316) | def calculate_cross_entropy_loss(
    method prepare_gradient_calculation_operands (line 330) | def prepare_gradient_calculation_operands(
    method calculate_gradients (line 349) | def calculate_gradients(
  function calculate_logits_max (line 368) | def calculate_logits_max(vocab_parallel_logits: torch.Tensor, half_entro...
  function calculate_predicted_logits (line 381) | def calculate_predicted_logits(
  function calculate_cross_entropy_loss (line 403) | def calculate_cross_entropy_loss(
  function calculate_gradients (line 420) | def calculate_gradients(
  class _VocabParallelCrossEntropy (line 442) | class _VocabParallelCrossEntropy(torch.autograd.Function):
    method forward (line 444) | def forward(ctx, vocab_parallel_logits, target, half_entropy, tp_group):
    method backward (line 479) | def backward(ctx, grad_output):
  function fused_vocab_parallel_cross_entropy (line 491) | def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target, ha...
  class _VocabParallelCrossEntropyNonFused (line 508) | class _VocabParallelCrossEntropyNonFused(torch.autograd.Function):
    method forward (line 516) | def forward(ctx, vocab_parallel_logits, target, tp_group):
    method backward (line 543) | def backward(ctx, grad_output):
  function vocab_parallel_cross_entropy (line 554) | def vocab_parallel_cross_entropy(vocab_parallel_logits, target, tp_group):

FILE: galvatron/core/runtime/transformer/inference.py
  class BaseInferenceContext (line 6) | class BaseInferenceContext(abc.ABC):
    method is_static_batching (line 14) | def is_static_batching(self) -> bool:
    method is_dynamic_batching (line 18) | def is_dynamic_batching(self) -> bool:

FILE: galvatron/core/runtime/transformer/mlp.py
  class MLPSubmodules (line 18) | class MLPSubmodules:
  class MLP (line 23) | class MLP(torch.nn.Module):
    method __init__ (line 40) | def __init__(
    method forward (line 98) | def forward(self, hidden_states):

FILE: galvatron/core/runtime/transformer/norm.py
  class GalvatronNorm (line 6) | class GalvatronNorm:
    method __new__ (line 12) | def __new__(cls, config: GalvatronModelArgs, hidden_size: int, eps: fl...

FILE: galvatron/core/runtime/transformer/rope_utils.py
  function get_pos_emb_on_this_cp_rank (line 47) | def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor:
  function _rotate_half (line 67) | def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor:
  function _apply_rotary_pos_emb_bshd (line 86) | def _apply_rotary_pos_emb_bshd(
  function _get_thd_freqs_on_this_cp_rank (line 123) | def _get_thd_freqs_on_this_cp_rank(cp_rank: int, cp_size: int, x: Tensor...
  function _apply_rotary_pos_emb_thd (line 137) | def _apply_rotary_pos_emb_thd(
  function apply_rotary_pos_emb (line 176) | def apply_rotary_pos_emb(
  function apply_rotary_pos_emb_with_cos_sin (line 237) | def apply_rotary_pos_emb_with_cos_sin(

FILE: galvatron/core/runtime/transformer/rotary_pos_embedding.py
  function get_pos_emb_on_this_cp_sp_rank_galvatron (line 34) | def get_pos_emb_on_this_cp_sp_rank_galvatron(cp_group, sp_group, pos_emb...
  function get_pos_emb_on_this_cp_rank (line 59) | def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim):
  class RotaryEmbedding (line 73) | class RotaryEmbedding(nn.Module):
    method __init__ (line 93) | def __init__(
    method _apply_scaling (line 124) | def _apply_scaling(
    method get_freqs_non_repeated (line 159) | def get_freqs_non_repeated(self, max_seq_len: int, offset: int = 0) ->...
    method get_cos_sin (line 174) | def get_cos_sin(self, max_seq_len: int, offset: int = 0) -> (Tensor, T...
    method forward (line 183) | def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool ...
    method _load_from_state_dict (line 217) | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
    method get_rotary_seq_len (line 221) | def get_rotary_seq_len(
  class MultimodalRotaryEmbedding (line 267) | class MultimodalRotaryEmbedding(nn.Module):
    method __init__ (line 286) | def __init__(
    method forward (line 310) | def forward(self, position_ids: torch.Tensor, mrope_section: List[int]...

FILE: galvatron/core/runtime/transformer/spec_utils.py
  class ModuleSpec (line 9) | class ModuleSpec:
  function import_module (line 30) | def import_module(module_path: Tuple[str]):
  function get_module (line 45) | def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwa...
  function build_module (line 58) | def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs):

FILE: galvatron/core/runtime/transformer/utils.py
  function deprecate_inference_params (line 4) | def deprecate_inference_params(inference_context, inference_params):

FILE: galvatron/core/runtime/utils/rerun_state_machine.py
  class Caller (line 43) | class Caller(NamedTuple):
  class Call (line 51) | class Call(NamedTuple):
  class RerunDiagnostic (line 58) | class RerunDiagnostic(str, Enum):
  class RerunMode (line 72) | class RerunMode(str, Enum):
  class RerunState (line 80) | class RerunState(Enum):
  class RerunValidationStatus (line 112) | class RerunValidationStatus(str, Enum):
  class RerunStateMachine (line 127) | class RerunStateMachine:
    method __init__ (line 183) | def __init__(
    method set_mode (line 239) | def set_mode(self, mode: RerunMode) -> None:
    method get_mode (line 246) | def get_mode(self) -> RerunMode:
    method should_run_forward_backward (line 251) | def should_run_forward_backward(self, data_iterator: DataIteratorArgTy...
    method should_checkpoint_and_exit (line 374) | def should_checkpoint_and_exit(self) -> Tuple[bool, bool, int]:
    method validate_result (line 434) | def validate_result(
    method is_unexpectedly_large (line 651) | def is_unexpectedly_large(
    method _sanitize_data_iterators (line 841) | def _sanitize_data_iterators(
    method _get_validation_call_info (line 858) | def _get_validation_call_info(self) -> Call:
    method _save_state (line 871) | def _save_state(self) -> None:
    method _restore_state (line 892) | def _restore_state(self) -> None:
    method _maybe_report_stats (line 903) | def _maybe_report_stats(self) -> None:
    method _log_validation_error_to_file (line 930) | def _log_validation_error_to_file(
    method get_skipped_iterations_from_tracker_file (line 951) | def get_skipped_iterations_from_tracker_file(cls, tracker_file_name: s...
  class RerunDataIterator (line 989) | class RerunDataIterator:
    method __init__ (line 1008) | def __init__(self, iterable: Iterable[Any]) -> None:
    method __next__ (line 1014) | def __next__(self) -> Any:
    method rewind (line 1029) | def rewind(self) -> None:
    method advance (line 1035) | def advance(self) -> None:
    method state_dict (line 1041) | def state_dict(self) -> SerializableStateType:
    method load_state_dict (line 1050) | def load_state_dict(self, state_dict: SerializableStateType) -> None:
  class QuickStats (line 1058) | class QuickStats:
    method __init__ (line 1065) | def __init__(self, max_size: int = 100000) -> None:
    method record (line 1072) | def record(self, data: float) -> None:
    method combine (line 1086) | def combine(self, others: list["QuickStats"]) -> None:
    method reset (line 1099) | def reset(self) -> None:
    method print_stats (line 1107) | def print_stats(self) -> str:
    method __getstate_ (line 1129) | def __getstate_(self) -> Any:
    method __setstate (line 1134) | def __setstate(self, state: Any) -> Any:
  class RerunErrorInjector (line 1143) | class RerunErrorInjector:
    method __init__ (line 1152) | def __init__(
    method maybe_inject (line 1167) | def maybe_inject(self) -> bool:
    method maybe_miscompare (line 1185) | def maybe_miscompare(
    method state_dict (line 1222) | def state_dict(self) -> SerializableStateType:
    method load_state_dict (line 1232) | def load_state_dict(self, state_dict: SerializableStateType) -> None:
  function initialize_rerun_state_machine (line 1241) | def initialize_rerun_state_machine(**kwargs) -> None:
  function destroy_rerun_state_machine (line 1251) | def destroy_rerun_state_machine() -> None:
  function get_rerun_state_machine (line 1258) | def get_rerun_state_machine() -> RerunStateMachine:
  function _set_rerun_state_machine (line 1267) | def _set_rerun_state_machine(rerun_state_machine) -> None:
  function _safe_get_rank (line 1275) | def _safe_get_rank() -> int:
  function _compare_floats (line 1288) | def _compare_floats(a: torch.Tensor, b: torch.Tensor) -> float:

FILE: galvatron/core/runtime/utils/utils.py
  function rgetattr (line 28) | def rgetattr(obj, attr):
  function rsetattr (line 41) | def rsetattr(obj, attr, val):
  function rhasattr (line 46) | def rhasattr(obj, attr):
  function log_single_rank (line 54) | def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, *...
  class GlobalMemoryBuffer (line 73) | class GlobalMemoryBuffer:
    method __init__ (line 78) | def __init__(self):
    method get_tensor (line 81) | def get_tensor(self, tensor_shape, dtype, name):
  function get_torch_version (line 97) | def get_torch_version():
  function is_torch_min_version (line 114) | def is_torch_min_version(version, check_equality=True):
  function get_te_version (line 121) | def get_te_version():
  function is_te_min_version (line 138) | def is_te_min_version(version, check_equality=True):
  function print_rank_0 (line 145) | def print_rank_0(message):
  function set_megatron_args_for_dataset (line 154) | def set_megatron_args_for_dataset(args:GalvatronRuntimeArgs):
  function get_layernorm_offset (line 170) | def get_layernorm_offset(model, layernorm_name=[]):
  function get_batch_on_this_tp_rank (line 194) | def get_batch_on_this_tp_rank(data_iterator):
  function get_batch_on_this_cp_rank (line 295) | def get_batch_on_this_cp_rank(batch: Dict[str, Any]):
  function average_losses_across_data_parallel_group (line 328) | def average_losses_across_data_parallel_group(losses):

FILE: galvatron/core/search_engine/args_schema.py
  class SearchEngineBatchSizeArgs (line 12) | class SearchEngineBatchSizeArgs(BaseModel):
  class SearchEngineHardwareInfoArgs (line 21) | class SearchEngineHardwareInfoArgs(BaseModel):
  class SearchEngineSearchSpaceArgs (line 26) | class SearchEngineSearchSpaceArgs(BaseModel):
  class SearchEngineProfilingArgs (line 42) | class SearchEngineProfilingArgs(BaseModel):
  class SearchEngineOptionsArgs (line 53) | class SearchEngineOptionsArgs(BaseModel):
  class SearchEngineDebugArgs (line 61) | class SearchEngineDebugArgs(BaseModel):
  class GalvatronSearchArgs (line 65) | class GalvatronSearchArgs(BaseModel):

FILE: galvatron/core/search_engine/dynamic_programming.py
  class DPAlg (line 12) | class DPAlg():
    method __init__ (line 13) | def __init__(self, max_mem=8200, other_mem_cost=None, other_time_cost ...
    method set_v_and_cost (line 32) | def set_v_and_cost(self, v: np.ndarray, intra_layer_cost: np.ndarray, ...
    method fit (line 50) | def fit(self):
  class DpOnModel (line 117) | class DpOnModel:
    method __init__ (line 118) | def __init__(
    method match_strategy (line 161) | def match_strategy(self, former:LayerStrategy, latter:LayerStrategy, d...
    method _build_dp_and_run_multi_layer_type (line 212) | def _build_dp_and_run_multi_layer_type(
    method log (line 612) | def log(self, msg) -> None:
    method fit (line 618) | def fit(

FILE: galvatron/core/search_engine/search_engine.py
  class GalvatronSearchEngine (line 21) | class GalvatronSearchEngine():
    method __init__ (line 22) | def __init__(self, args: GalvatronSearchArgs):
    method set_search_engine_info (line 39) | def set_search_engine_info(self, path, model_layer_configs, model_name):
    method set_path (line 46) | def set_path(self, path):
    method set_model_type (line 49) | def set_model_type(self, model_type):
    method set_model_name (line 52) | def set_model_name(self, name):
    method memory_profiling_path (line 55) | def memory_profiling_path(self): # TODO: add split mode profile path
    method time_profiling_path (line 68) | def time_profiling_path(self): # TODO: add split mode profile path
    method set_model_layer_configs (line 82) | def set_model_layer_configs(self, model_layer_configs):
    method initialize_search_engine (line 93) | def initialize_search_engine(self, show_all_strategy_list=False):
    method generate_strategy_list (line 106) | def generate_strategy_list(self) -> None:
    method filter_strategy_list (line 183) | def filter_strategy_list(self, disable_pp=None, disable_tp=None, disab...
    method show_all_strategy_list (line 257) | def show_all_strategy_list(self):
    method convert_keys_to_int (line 275) | def convert_keys_to_int(self, d):
    method get_profiled_model_configs (line 286) | def get_profiled_model_configs(self): # TODO: add split mode profile c...
    method get_profiled_hardware_configs (line 419) | def get_profiled_hardware_configs(self):
    method set_cost_models (line 464) | def set_cost_models(self): # TODO: add split mode cost models
    method get_pp_size_range (line 512) | def get_pp_size_range(self) -> None:
    method parallelism_optimization (line 520) | def parallelism_optimization(self):
    method search_for_single_task (line 646) | def search_for_single_task(self, gbsz, chunks, pp_size, global_buffer_...
    method set_searching_bsz (line 729) | def set_searching_bsz(self):
    method save_results (line 749) | def save_results(self, optimal, optimal_bsz, chunk):
    method check_cost_model (line 788) | def check_cost_model(self, gbsz, chunks, specific_strategy_list:List[L...
    method show_search_info (line 902) | def show_search_info(self):
  function pp_division_memory_balanced (line 954) | def pp_division_memory_balanced(model_args_list, train_args_list, parall...
  function get_pp_stage_for_bsz (line 1060) | def get_pp_stage_for_bsz(strategies:List[LayerStrategy], model_args_list...
  function get_cost_all_stages (line 1072) | def get_cost_all_stages(layer_memcosts, pp_stage_division):
  function get_layer_costs (line 1088) | def get_layer_costs(layernum_list, layer_costs):
  function pp_division_even (line 1094) | def pp_division_even(layernum_list, pp_deg):

FILE: galvatron/core/search_engine/utils.py
  function ensure_log_dir (line 4) | def ensure_log_dir(log_dir='logs'):
  function get_thread_logger_single_task (line 8) | def get_thread_logger_single_task(gbsz, chunks, pp_size, global_buffer_t...
  function remove_all_galvatron_loggers (line 32) | def remove_all_galvatron_loggers(prefix='galvatron'):

FILE: galvatron/models/gpt/train_dist.py
  function train (line 21) | def train(args):

FILE: galvatron/models/moe/train_dist.py
  function train (line 22) | def train(args):

FILE: galvatron/profile_hardware/profile_all2all.py
  function single_all_to_all (line 20) | def single_all_to_all(input_tensor, group):
  function set_seed (line 28) | def set_seed(rank):
  function _profile_all2all_one (line 34) | def _profile_all2all_one(
  function train (line 93) | def train(args):

FILE: galvatron/profile_hardware/profile_allreduce.py
  function single_all_reduce (line 20) | def single_all_reduce(input_tensor, group):
  function set_seed (line 26) | def set_seed(rank):
  function bandwidth_jobs_from_tp_degrees (line 32) | def bandwidth_jobs_from_tp_degrees(world_size, tp_degrees: list[int]):
  function allreduce_work_items (line 45) | def allreduce_work_items(
  function _profile_allreduce_one (line 84) | def _profile_allreduce_one(
  function train (line 162) | def train(args):

FILE: galvatron/profile_hardware/profile_overlap.py
  function profile (line 10) | def profile(args):

FILE: galvatron/profile_hardware/profile_p2p.py
  function single_p2p_send_recv (line 19) | def single_p2p_send_recv(input_tensor, prev_rank, next_rank, rank, pp_ra...
  function set_seed (line 53) | def set_seed(rank):
  function _profile_p2p_one (line 59) | def _profile_p2p_one(
  function train (line 149) | def train(args):

FILE: galvatron/tools/args_schema.py
  class CheckpointConvertH2GArgs (line 5) | class CheckpointConvertH2GArgs(BaseModel):
  class CheckpointConvertG2HArgs (line 13) | class CheckpointConvertG2HArgs(BaseModel):

FILE: galvatron/tools/checkpoint_convert_g2h.py
  function convert_checkpoints_llama (line 11) | def convert_checkpoints_llama(input_checkpoint_path, output_dir, load_it...
  function convert_checkpoints_bert_mlm (line 111) | def convert_checkpoints_bert_mlm(input_checkpoint_path, output_dir, load...
  function main (line 253) | def main():

FILE: galvatron/tools/checkpoint_convert_h2g.py
  function convert_checkpoints_gpt (line 9) | def convert_checkpoints_gpt(input_checkpoint_path, output_dir):
  function convert_checkpoints_llama (line 47) | def convert_checkpoints_llama(input_checkpoint_path, output_dir):
  function convert_checkpoints_mixtral (line 89) | def convert_checkpoints_mixtral(input_checkpoint_path, output_dir):
  function convert_checkpoints_bert_mlm (line 93) | def convert_checkpoints_bert_mlm(input_checkpoint_path, output_dir):
  function main (line 140) | def main():

FILE: galvatron/utils/config_utils.py
  function str2array (line 8) | def str2array(s):
  function array2str (line 11) | def array2str(a):
  function read_json_config (line 14) | def read_json_config(path):
  function write_json_config (line 18) | def write_json_config(config, path):
  function config2strategy (line 24) | def config2strategy(config):
  function read_allreduce_bandwidth_config (line 48) | def read_allreduce_bandwidth_config(config_path, gpu_num):
  function read_p2p_bandwidth_config (line 77) | def read_p2p_bandwidth_config(config_path):
  function num2str (line 90) | def num2str(num, name):
  function dict_join_dirname (line 103) | def dict_join_dirname(dic, dirname):
  function remap_config (line 108) | def remap_config(config, op):
  function print_single_rank (line 140) | def print_single_rank(message, rank=0):
  function remap_config_for_latency (line 147) | def remap_config_for_latency(config, op):

FILE: galvatron/utils/hf_config_adapter.py
  function _get_model_args (line 39) | def _get_model_args(args: Union[GalvatronRuntimeArgs, GalvatronSearchArg...
  function _get_train_args (line 47) | def _get_train_args(args: Union[GalvatronRuntimeArgs, GalvatronSearchArg...
  function get_hf_attr (line 73) | def get_hf_attr(config, canonical_name: str, default=None):
  function set_hf_attr (line 82) | def set_hf_attr(config, canonical_name: str, value):
  function _detect_normalization (line 104) | def _detect_normalization(hf_config) -> str:
  function _detect_activation (line 110) | def _detect_activation(hf_config) -> tuple:
  function _detect_position_embedding_type (line 117) | def _detect_position_embedding_type(hf_config) -> str:
  function _load_yaml_model_config (line 154) | def _load_yaml_model_config(yaml_path: str) -> dict:
  function _apply_yaml_to_model_args (line 165) | def _apply_yaml_to_model_args(args: Union[GalvatronRuntimeArgs, Galvatro...
  function populate_model_args_from_hf (line 196) | def populate_model_args_from_hf(args: Union[GalvatronRuntimeArgs, Galvat...
  function _fill_model_args_from_hf (line 212) | def _fill_model_args_from_hf(args: Union[GalvatronRuntimeArgs, Galvatron...
  function resolve_model_config (line 285) | def resolve_model_config(args: Union[GalvatronRuntimeArgs, GalvatronSear...
  function create_hf_config (line 333) | def create_hf_config(args: Union[GalvatronRuntimeArgs, GalvatronSearchAr...
  function model_name (line 372) | def model_name(args: Union[GalvatronRuntimeArgs, GalvatronSearchArgs]) -...
  function model_layer_configs (line 384) | def model_layer_configs(args: Union[GalvatronRuntimeArgs, GalvatronSearc...

FILE: galvatron/utils/memory_utils.py
  function print_peak_memory (line 3) | def print_peak_memory(prefix, device, type='allocated'):
  function print_param_num (line 16) | def print_param_num(model):

FILE: galvatron/utils/print_utils.py
  class ColorSet (line 7) | class ColorSet:
  function print_args_rank0 (line 15) | def print_args_rank0(args: pydantic.BaseModel, title: str = "arguments"):
  function print_single_rank (line 25) | def print_single_rank(message, rank=0):

FILE: galvatron/utils/strategy_utils.py
  function is_power_of_two (line 11) | def is_power_of_two(n: int) -> bool:
  class DPType (line 14) | class DPType(Enum):
    method values (line 20) | def values(cls):
    method contains (line 24) | def contains(cls, value) -> bool:
    method __lt__ (line 27) | def __lt__(self, other):
  class StrategyBase (line 33) | class StrategyBase:
  class EmbeddingLMHeadStrategy (line 37) | class EmbeddingLMHeadStrategy(StrategyBase):
    method __post_init__ (line 45) | def __post_init__(self):
    method _check_and_fix_sdp (line 49) | def _check_and_fix_sdp(self):
    method _check_tp_sp (line 54) | def _check_tp_sp(self):
    method world_size (line 58) | def world_size(self):
    method sdp_size (line 62) | def sdp_size(self):
    method tp_sp_size (line 66) | def tp_sp_size(self):
    method to_string (line 69) | def to_string(self):
    method to_simple_string (line 72) | def to_simple_string(self):
    method __eq__ (line 93) | def __eq__(self, other):
    method __lt__ (line 101) | def __lt__(self, other):
    method __hash__ (line 111) | def __hash__(self):
    method __str__ (line 115) | def __str__(self):
  class AttentionStrategy (line 119) | class AttentionStrategy(EmbeddingLMHeadStrategy):
    method __hash__ (line 122) | def __hash__(self):
    method to_embedding_lmhead_strategy (line 126) | def to_embedding_lmhead_strategy(self):
    method to_ffn_strategy (line 136) | def to_ffn_strategy(self):
    method to_layer_strategy (line 147) | def to_layer_strategy(self):
  class FFNStrategy (line 160) | class FFNStrategy(EmbeddingLMHeadStrategy):
    method __hash__ (line 163) | def __hash__(self):
    method to_embedding_lmhead_strategy (line 167) | def to_embedding_lmhead_strategy(self):
  class LayerStrategy (line 178) | class LayerStrategy(EmbeddingLMHeadStrategy):
    method __hash__ (line 181) | def __hash__(self):
    method to_embedding_lmhead_strategy (line 185) | def to_embedding_lmhead_strategy(self):
  class MoEFFNStrategy (line 196) | class MoEFFNStrategy(StrategyBase):
    method __post_init__ (line 204) | def __post_init__(self):
    method _check_and_fix_dp (line 207) | def _check_and_fix_dp(self):
    method world_size (line 215) | def world_size(self):
    method sdp_size (line 219) | def sdp_size(self):
    method __eq__ (line 222) | def __eq__(self, other):
    method __lt__ (line 230) | def __lt__(self, other):
    method __hash__ (line 240) | def __hash__(self):
    method __str__ (line 244) | def __str__(self):
  function old_version_strategy_to_new_version_strategy (line 248) | def old_version_strategy_to_new_version_strategy(strategy:list, default_...
  function new_version_strategy_to_old_version_strategy (line 277) | def new_version_strategy_to_old_version_strategy(strategy:StrategyBase):
  function print_strategy_list (line 300) | def print_strategy_list(strategy_list:Union[List[LayerStrategy], List[Em...
  function strategy_list2config (line 308) | def strategy_list2config(strategy_list:List[LayerStrategy]):
  function config2strategy (line 332) | def config2strategy(config:dict, default_dp_type:str='zero2') -> List[La...

FILE: galvatron/utils/training_utils.py
  function set_seed (line 7) | def set_seed(seed = 1234):
  function distributed_dataloader (line 13) | def distributed_dataloader(dataset, global_bsz, shuffle = True, args = N...
  function print_loss (line 25) | def print_loss(args, loss, ep, iter):
  function gen_profiling_groups (line 43) | def gen_profiling_groups(group_size, consecutive):

FILE: setup.py
  class CustomInstall (line 18) | class CustomInstall(install):
    method run (line 19) | def run(self):
  class CustomDevelop (line 29) | class CustomDevelop(develop):
    method run (line 30) | def run(self):
  class CustomBuildExt (line 41) | class CustomBuildExt(build_ext):
    method run (line 42) | def run(self):

FILE: tests/conftest.py
  function _pick_free_port (line 19) | def _pick_free_port() -> int:
  function small_model_config (line 25) | def small_model_config():
  function device (line 36) | def device():
  function seed (line 41) | def seed():
  function _terminate_process (line 45) | def _terminate_process(p: subprocess.Popen, grace: float = 5.0) -> None:
  function run_distributed (line 81) | def run_distributed():
  function checkpoint_dir (line 194) | def checkpoint_dir():
  function base_config_dirs (line 203) | def base_config_dirs(tmp_path: Path) -> Tuple[Path, Path, Path]:
  function profiler_model_configs_dir (line 211) | def profiler_model_configs_dir(tmp_path: Path) -> Path:
  function profiler_hardware_configs_dir (line 218) | def profiler_hardware_configs_dir(tmp_path: Path) -> Path:
  function base_log_dirs (line 227) | def base_log_dirs(tmp_path: Path) -> str:

FILE: tests/core/test_ep.py
  class _PytestMarkStub (line 10) | class _PytestMarkStub:
    method skipif (line 11) | def skipif(self, *args, **kwargs):
    method parametrize (line 14) | def parametrize(self, *args, **kwargs):
    method __getattr__ (line 19) | def __getattr__(self, _name):
  class _PytestStub (line 24) | class _PytestStub:
  function _ep_parallel_config (line 58) | def _ep_parallel_config(
  function _run_test (line 95) | def _run_test(test_args: Dict[str, Any]):
  function test_ep_correctness (line 245) | def test_ep_correctness(run_distributed, ep_size, dispatcher, checkpoint...

FILE: tests/core/test_fsdp.py
  function _run_test (line 25) | def _run_test(test_args: Dict[str, Any]):
  function test_dp_correctness (line 185) | def test_dp_correctness(

FILE: tests/core/test_hybrid.py
  function _run_test (line 20) | def _run_test(test_args: Dict[str, Any]):
  function test_hybrid_correctness (line 180) | def test_hybrid_correctness(

FILE: tests/core/test_mixed_precision.py
  function _dp_parallel_config (line 25) | def _dp_parallel_config(batch: int, chunks: int) -> Dict[str, Any]:
  function _run_test (line 45) | def _run_test(test_args: Dict[str, Any]):
  function test_dp_correctness (line 162) | def test_dp_correctness(run_distributed, mixed_precision, use_flash_attn...

FILE: tests/core/test_pp.py
  function _pp_parallel_config (line 25) | def _pp_parallel_config(pp_size: int, batch: int, chunks: int, pipeline_...
  function _run_test (line 52) | def _run_test(test_args: Dict[str, Any]):
  function test_pp (line 171) | def test_pp(run_distributed, world_size, pp_size, pipeline_type, chunks,...

FILE: tests/core/test_redistributed.py
  function _run_test (line 22) | def _run_test(test_args: Dict[str, Any]):
  function test_redistributed (line 183) | def test_redistributed(run_distributed, model_type, world_size, tp_size,...

FILE: tests/core/test_tp.py
  function _tp_parallel_config (line 25) | def _tp_parallel_config(
  function _run_test (line 71) | def _run_test(test_args: Dict[str, Any]):
  function test_tp (line 193) | def test_tp(run_distributed, world_size, tp_size, sp, chunks, checkpoint...

FILE: tests/core/test_utils.py
  class DummyModule (line 7) | class DummyModule(nn.Module):
    method __init__ (line 8) | def __init__(self):
  function dummy_module (line 14) | def dummy_module():
  function test_rgetattr (line 17) | def test_rgetattr(dummy_module):
  function test_rsetattr (line 26) | def test_rsetattr(dummy_module):
  function test_rhasattr (line 32) | def test_rhasattr(dummy_module):

FILE: tests/kernels/test_triton_cross_entropy.py
  function non_fused_ce (line 39) | def non_fused_ce(logits, target, tp_group):
  function jit_fused_ce (line 44) | def jit_fused_ce(logits, target, tp_group):
  function triton_fused_ce (line 49) | def triton_fused_ce(logits, target, tp_group):
  function print_rank0 (line 54) | def print_rank0(rank, msg):
  function run_test_forward_backward (line 63) | def run_test_forward_backward(ce_func, logits_cpu, target_cpu, tp_group,...
  function benchmark_performance (line 95) | def benchmark_performance(ce_func, logits_cpu, target_cpu, tp_group, dev...
  function compare_results (line 125) | def compare_results(name1, name2, loss1, grad1, loss2, grad2, rank):
  function _run_test (line 163) | def _run_test(args):
  function test_triton_cross_entropy (line 270) | def test_triton_cross_entropy(run_distributed, tp_size, seq_len, batch_s...

FILE: tests/kernels/test_triton_cross_entropy_debug.py
  function non_fused_ce (line 24) | def non_fused_ce(logits, target, tp_group):
  function jit_fused_ce (line 28) | def jit_fused_ce(logits, target, tp_group):
  function triton_fused_ce (line 32) | def triton_fused_ce(logits, target, tp_group):
  function print_rank0 (line 36) | def print_rank0(rank, msg):
  function run_test_forward_backward (line 41) | def run_test_forward_backward(ce_func, logits_cpu, target_cpu, tp_group,...
  function benchmark_performance (line 73) | def benchmark_performance(ce_func, logits_cpu, target_cpu, tp_group, dev...
  function compare_results (line 103) | def compare_results(name1, name2, loss1, grad1, loss2, grad2, rank):
  function test_triton_cross_entropy (line 141) | def test_triton_cross_entropy():

FILE: tests/kernels/test_triton_cross_entropy_kernels.py
  function device (line 77) | def device():
  function reset_seed (line 85) | def reset_seed():
  function check_precision (line 90) | def check_precision(triton_val, torch_val, name, rtol=1e-2, atol=1e-3):
  function test_max_reduction (line 121) | def test_max_reduction(device, seq_len, batch_size, vocab_size, model_co...
  function test_forward (line 143) | def test_forward(device, seq_len, batch_size, vocab_size, model_config):
  function test_backward (line 176) | def test_backward(device, seq_len, batch_size, vocab_size, model_config):
  function test_edge_cases_max (line 222) | def test_edge_cases_max(device, case_name, seq_len, batch_size, vocab_si...
  function test_boundary_targets (line 246) | def test_boundary_targets(device):

FILE: tests/kernels/test_triton_cross_entropy_kernels_debug.py
  function check_precision (line 23) | def check_precision(triton_val, torch_val, name, rtol=1e-2, atol=1e-3):
  function test_max_reduction (line 41) | def test_max_reduction():
  function test_forward (line 69) | def test_forward():
  function test_backward (line 105) | def test_backward():
  function test_edge_cases (line 153) | def test_edge_cases():
  function main (line 204) | def main():

FILE: tests/models/test_checkpoint_convert.py
  function test_convert_checkpoints_bert_mlm (line 8) | def test_convert_checkpoints_bert_mlm(checkpoint_dir):

FILE: tests/models/test_dataloader.py
  function _run_test (line 17) | def _run_test(args: dict):
  function test_distributed_dataloader_with_groups (line 106) | def test_distributed_dataloader_with_groups(run_distributed, small_model...

FILE: tests/models/test_model_correctness.py
  function _dp_parallel_config (line 28) | def _dp_parallel_config(num_layers: int, batch: int, chunks: int) -> Dic...
  function _run_test (line 49) | def _run_test(test_args: Dict[str, Any]):
  function test_dp_correctness (line 229) | def test_dp_correctness(run_distributed, hf_arch, dp_size, checkpoint_dir):

FILE: tests/models/test_moe_correctness.py
  class _PytestMarkStub (line 10) | class _PytestMarkStub:
    method skipif (line 11) | def skipif(self, *args, **kwargs):
    method parametrize (line 14) | def parametrize(self, *args, **kwargs):
    method __getattr__ (line 19) | def __getattr__(self, _name):
  class _PytestStub (line 24) | class _PytestStub:
  function _dp_parallel_config (line 58) | def _dp_parallel_config(num_layers: int, batch: int, chunks: int) -> Dic...
  function _run_test (line 81) | def _run_test(test_args: Dict[str, Any]):
  function test_dp_correctness (line 226) | def test_dp_correctness(run_distributed, dp_size, checkpoint_dir):

FILE: tests/profiler/test_hardware_profile.py
  function base_profiler (line 9) | def base_profiler(profiler_hardware_configs_dir):
  function _count_torchrun_blocks (line 15) | def _count_torchrun_blocks(scripts_dir: str, filename: str) -> int:
  function test_torch_hardware_profile (line 32) | def test_torch_hardware_profile(

FILE: tests/profiler/test_model_profile.py
  function _reset_profiler_caches (line 19) | def _reset_profiler_caches(profiler):
  function base_profiler (line 27) | def base_profiler(profiler_model_configs_dir):
  function test_get_seq_list (line 42) | def test_get_seq_list(base_profiler, mode, expected_seq_list, config):
  function test_get_bsz_list (line 60) | def test_get_bsz_list(base_profiler, mode, expected_bsz_list, config):
  function test_launch_profiling_scripts (line 89) | def test_launch_profiling_scripts(base_profiler, profile_type, profile_m...
  function test_process_computation_profiled_data (line 132) | def test_process_computation_profiled_data(base_profiler, profiler_model...
  function test_process_memory_profiled_data (line 171) | def test_process_memory_profiled_data(base_profiler, profiler_model_conf...

FILE: tests/profiler/test_runtime_profile.py
  function mock_distributed (line 8) | def mock_distributed():
  function base_profiler (line 16) | def base_profiler(profiler_model_configs_dir):
  function test_profile_memory_stages (line 28) | def test_profile_memory_stages(base_profiler, stage, expected_keys):
  function test_post_profile_memory (line 56) | def test_post_profile_memory(base_profiler, pipeline_type, expected_keys):
  function test_post_profile_memory_with_save (line 83) | def test_post_profile_memory_with_save(base_profiler):
  class MockCUDAEvent (line 114) | class MockCUDAEvent:
    method __init__ (line 119) | def __init__(self):
    method record (line 122) | def record(self):
    method elapsed_time (line 126) | def elapsed_time(self, end):
  function test_profile_time_start_normal (line 130) | def test_profile_time_start_normal(base_profiler):
  function test_profile_time_start_with_save (line 149) | def test_profile_time_start_with_save(base_profiler):
  function test_profile_time_end_with_loss (line 169) | def test_profile_time_end_with_loss(base_profiler):
  function test_profile_time_python (line 205) | def test_profile_time_python(base_profiler):

FILE: tests/search_engine/test_bsz_utils.py
  function base_engine (line 8) | def base_engine():
  function test_settle_bsz (line 20) | def test_settle_bsz(base_engine):
  function test_normal_bsz_range (line 31) | def test_normal_bsz_range(base_engine):
  function test_bsz_range_with_different_scales (line 46) | def test_bsz_range_with_different_scales(base_engine, min_bsz, max_bsz, ...
  function test_max_bsz_adjustment (line 70) | def test_max_bsz_adjustment(base_engine):
  function test_min_bsz_smaller_than_scale (line 80) | def test_min_bsz_smaller_than_scale(base_engine):

FILE: tests/search_engine/test_generate_strategies.py
  function test_generate_strategies (line 10) | def test_generate_strategies(model_type, tmp_path, disables, capsys):

FILE: tests/search_engine/test_get_configs.py
  function _build_hf_test_args (line 15) | def _build_hf_test_args(config_json, time_mode):
  function _promote_profile_filenames_to_all (line 30) | def _promote_profile_filenames_to_all(configs_dir: Path, precision: str,...
  function test_config_loading (line 52) | def test_config_loading(base_config_dirs, model_type, time_mode, memory_...
  function test_hardware_config_loading (line 120) | def test_hardware_config_loading(base_config_dirs, num_nodes, gpus_per_n...

FILE: tests/search_engine/test_initialize.py
  function test_set_cost_models (line 15) | def test_set_cost_models(base_config_dirs, base_log_dirs, model_type, ti...

FILE: tests/search_engine/test_parallelsim_optimization.py
  function test_basic_search_flow (line 15) | def test_basic_search_flow(base_config_dirs, base_log_dirs, idx, model_t...

FILE: tests/search_engine/test_strategy_utils.py
  class TestDPType (line 38) | class TestDPType:
    method test_enum_values (line 39) | def test_enum_values(self):
    method test_values_returns_all_members (line 44) | def test_values_returns_all_members(self):
    method test_contains_true (line 48) | def test_contains_true(self):
    method test_contains_false (line 52) | def test_contains_false(self):
    method test_lt_ordering (line 55) | def test_lt_ordering(self):
    method test_lt_type_error (line 61) | def test_lt_type_error(self):
  class TestColorSet (line 69) | class TestColorSet:
    method test_ansi_codes_exist (line 70) | def test_ansi_codes_exist(self):
  class TestEmbeddingLMHeadStrategy (line 81) | class TestEmbeddingLMHeadStrategy:
    method test_default_values (line 82) | def test_default_values(self):
    method test_auto_reset_dp_type_when_sdp_is_1 (line 92) | def test_auto_reset_dp_type_when_sdp_is_1(self):
    method test_dp_type_preserved_when_sdp_gt_1 (line 97) | def test_dp_type_preserved_when_sdp_gt_1(self):
    method test_tp_and_sp_mutual_exclusion (line 101) | def test_tp_and_sp_mutual_exclusion(self):
    method test_world_size (line 105) | def test_world_size(self):
    method test_sdp_size (line 109) | def test_sdp_size(self):
    method test_tp_sp_size_with_tp (line 113) | def test_tp_sp_size_with_tp(self):
    method test_tp_sp_size_with_sp (line 117) | def test_tp_sp_size_with_sp(self):
    method test_equality_same (line 121) | def test_equality_same(self):
    method test_equality_different (line 126) | def test_equality_different(self):
    method test_equality_different_type (line 131) | def test_equality_different_type(self):
    method test_hash_consistency (line 135) | def test_hash_consistency(self):
    method test_hash_usable_in_set (line 140) | def test_hash_usable_in_set(self):
    method test_lt (line 145) | def test_lt(self):
    method test_lt_not_implemented_for_different_types (line 151) | def test_lt_not_implemented_for_different_types(self):
    method test_to_string (line 155) | def test_to_string(self):
    method test_str (line 161) | def test_str(self):
    method test_to_simple_string_basic (line 166) | def test_to_simple_string_basic(self):
    method test_to_simple_string_with_tp (line 171) | def test_to_simple_string_with_tp(self):
    method test_to_simple_string_zero3 (line 176) | def test_to_simple_string_zero3(self):
    method test_to_simple_string_with_sp (line 181) | def test_to_simple_string_with_sp(self):
  class TestAttentionStrategy (line 191) | class TestAttentionStrategy:
    method test_default_checkpoint_false (line 192) | def test_default_checkpoint_false(self):
    method test_inherits_embedding_fields (line 196) | def test_inherits_embedding_fields(self):
    method test_to_embedding_lmhead_strategy (line 201) | def test_to_embedding_lmhead_strategy(self):
    method test_to_ffn_strategy (line 209) | def test_to_ffn_strategy(self):
    method test_to_layer_strategy (line 216) | def test_to_layer_strategy(self):
    method test_hash (line 222) | def test_hash(self):
    method test_to_simple_string_with_checkpoint (line 227) | def test_to_simple_string_with_checkpoint(self):
  class TestFFNStrategy (line 236) | class TestFFNStrategy:
    method test_default_checkpoint (line 237) | def test_default_checkpoint(self):
    method test_to_embedding_lmhead_strategy (line 241) | def test_to_embedding_lmhead_strategy(self):
    method test_hash (line 247) | def test_hash(self):
  class TestLayerStrategy (line 256) | class TestLayerStrategy:
    method test_default_checkpoint (line 257) | def test_default_checkpoint(self):
    method test_to_embedding_lmhead_strategy (line 261) | def test_to_embedding_lmhead_strategy(self):
    method test_hash (line 267) | def test_hash(self):
  class TestMoEFFNStrategy (line 277) | class TestMoEFFNStrategy:
    method test_default_values (line 278) | def test_default_values(self):
    method test_auto_reset_dp_type_when_dp_is_1 (line 288) | def test_auto_reset_dp_type_when_dp_is_1(self):
    method test_dp_type_preserved_when_dp_gt_1 (line 292) | def test_dp_type_preserved_when_dp_gt_1(self):
    method test_world_size (line 296) | def test_world_size(self):
    method test_sdp_size (line 300) | def test_sdp_size(self):
    method test_equality (line 304) | def test_equality(self):
    method test_inequality (line 309) | def test_inequality(self):
    method test_equality_different_type (line 314) | def test_equality_different_type(self):
    method test_lt (line 318) | def test_lt(self):
    method test_lt_not_implemented (line 323) | def test_lt_not_implemented(self):
    method test_hash (line 327) | def test_hash(self):
    method test_str (line 332) | def test_str(self):
  class TestIsPowerOfTwo (line 341) | class TestIsPowerOfTwo:
    method test_powers_of_two (line 343) | def test_powers_of_two(self, n):
    method test_not_powers_of_two (line 347) | def test_not_powers_of_two(self, n):
  class TestConstants (line 351) | class TestConstants:
    method test_byte_to_MB (line 352) | def test_byte_to_MB(self):
    method test_model_states_ratio (line 355) | def test_model_states_ratio(self):
  class TestOldToNewVersionStrategy (line 362) | class TestOldToNewVersionStrategy:
    method test_basic_ddp (line 363) | def test_basic_ddp(self):
    method test_with_fsdp (line 376) | def test_with_fsdp(self):
    method test_with_checkpoint (line 382) | def test_with_checkpoint(self):
    method test_with_sp (line 387) | def test_with_sp(self):
    method test_default_zero2 (line 393) | def test_default_zero2(self):
    method test_dp_size_1_forces_ddp (line 398) | def test_dp_size_1_forces_ddp(self):
  class TestNewToOldVersionStrategy (line 404) | class TestNewToOldVersionStrategy:
    method test_basic_roundtrip_ddp (line 405) | def test_basic_roundtrip_ddp(self):
    method test_fsdp_flag (line 413) | def test_fsdp_flag(self):
    method test_tp_flag (line 418) | def test_tp_flag(self):
    method test_sp_flag (line 425) | def test_sp_flag(self):
    method test_checkpoint_flag (line 431) | def test_checkpoint_flag(self):
  class TestPrintStrategyList (line 440) | class TestPrintStrategyList:
    method test_none_input (line 441) | def test_none_input(self, capsys):
    method test_prints_strategies (line 447) | def test_prints_strategies(self, capsys):
    method test_with_logger (line 457) | def test_with_logger(self):
  class TestStrategyList2Config (line 476) | class TestStrategyList2Config:
    method test_empty_list (line 477) | def test_empty_list(self):
    method test_single_layer (line 480) | def test_single_layer(self):
    method test_multiple_layers (line 492) | def test_multiple_layers(self):
    method test_all_zero3 (line 506) | def test_all_zero3(self):

FILE: tests/test_arguments.py
  function test_load_with_hydra_train_dist_runtime_matches_yaml (line 26) | def test_load_with_hydra_train_dist_runtime_matches_yaml():
  function test_load_with_hydra_train_dist_overrides (line 62) | def test_load_with_hydra_train_dist_overrides():
  function test_profiler_args_defaults (line 74) | def test_profiler_args_defaults():
  function test_profiler_hardware_args_defaults (line 89) | def test_profiler_hardware_args_defaults():
  function test_search_engine_args_defaults (line 105) | def test_search_engine_args_defaults():

FILE: tests/utils.py
  function init_dist_env (line 3) | def init_dist_env():

FILE: tests/utils/init_dist.py
  function init_dist_env (line 5) | def init_dist_env():

FILE: tests/utils/model_utils.py
  class ModelFactory (line 7) | class ModelFactory:
    method _get_yaml_dir (line 32) | def _get_yaml_dir() -> str:
    method _resolve_yaml_path (line 36) | def _resolve_yaml_path(model_type: str) -> str:
    method resolve_model_config (line 45) | def resolve_model_config(args: Union[GalvatronRuntimeArgs, GalvatronSe...
    method get_test_config (line 60) | def get_test_config(model_type: str) -> Dict[str, Any]:
    method get_model_layer_configs (line 84) | def get_model_layer_configs(args: Union[GalvatronRuntimeArgs, Galvatro...
    method get_model_name (line 90) | def get_model_name(args: Union[GalvatronRuntimeArgs, GalvatronSearchAr...
    method get_model_layer_configs_func (line 96) | def get_model_layer_configs_func() -> Callable:
    method get_model_name_func (line 102) | def get_model_name_func() -> Callable:

FILE: tests/utils/parallel_config.py
  class ParallelConfig (line 6) | class ParallelConfig:
    method to_dict (line 21) | def to_dict(self):

FILE: tests/utils/profiler_configs.py
  function create_computation_static_config (line 5) | def create_computation_static_config() -> Dict[str, float]:
  function create_computation_batch_config (line 12) | def create_computation_batch_config() -> Dict[str, float]:
  function create_computation_sequence_config (line 37) | def create_computation_sequence_config() -> Dict[str, float]:
  function create_memory_static_config (line 58) | def create_memory_static_config() -> Dict:
  function create_memory_static_config_sp (line 239) | def create_memory_static_config_sp() -> Dict:
  function create_memory_sequence_config_sp (line 420) | def create_memory_sequence_config_sp() -> Dict:
  function save_profiler_configs (line 613) | def save_profiler_configs(

FILE: tests/utils/profiler_utils.py
  function initialize_model_profile_profiler (line 7) | def initialize_model_profile_profiler(profiler_model_configs_dir, model_...
  function initialize_hardware_profile_profiler (line 33) | def initialize_hardware_profile_profiler(profiler_hardware_configs_dir):
  function initialize_runtime_profile_profiler (line 41) | def initialize_runtime_profile_profiler(profiler_model_configs_dir, mode...

FILE: tests/utils/runtime_args.py
  class TestRuntimeArgs (line 15) | class TestRuntimeArgs(GalvatronRuntimeArgs):
    method padded_vocab_size (line 22) | def padded_vocab_size(self):
    method hidden_size (line 26) | def hidden_size(self):
    method num_attention_heads (line 30) | def num_attention_heads(self):
    method seq_length (line 34) | def seq_length(self):
    method kv_channels (line 38) | def kv_channels(self):
    method group_query_attention (line 42) | def group_query_attention(self):
    method num_query_groups (line 47) | def num_query_groups(self):
  function _ensure_config_path (line 56) | def _ensure_config_path(config):
  function make_test_args (line 68) | def make_test_args(

FILE: tests/utils/search_args.py
  class SearchArgs (line 4) | class SearchArgs:
    method __init__ (line 6) | def __init__(self):

FILE: tests/utils/search_configs.py
  function create_static_time_config (line 10) | def create_static_time_config() -> Dict[str, float]:
  function create_batch_time_config (line 17) | def create_batch_time_config() -> Dict[str, float]:
  function create_sequence_time_config (line 42) | def create_sequence_time_config() -> Dict[str, float]:
  function create_static_memory_config (line 63) | def create_static_memory_config():
  function create_static_memory_config_sp (line 124) | def create_static_memory_config_sp():
  function create_sequence_memory_config_sp (line 189) | def create_sequence_memory_config_sp():
  function create_hardware_configs (line 462) | def create_hardware_configs():
  function write_time_config (line 550) | def write_time_config(
  function write_memory_config (line 569) | def write_memory_config(
  function write_hardware_config (line 587) | def write_hardware_config(
  function _auto_update_nested_args (line 612) | def _auto_update_nested_args(model: BaseModel, flat_updates: Dict) -> Ba...
  function initialize_search_engine (line 647) | def initialize_search_engine(base_config_dirs, base_log_dirs, model_type...