SYMBOL INDEX (1483 symbols across 133 files) FILE: csrc/dp_core.cpp function argmin (line 13) | inline size_t argmin(const ForwardIterator begin, const ForwardIterator ... function argmax (line 19) | inline size_t argmax(const ForwardIterator begin, const ForwardIterator ... function dynamic_programming_core (line 24) | std::pair, std::map > dynamic_programmin... function PYBIND11_MODULE (line 122) | PYBIND11_MODULE(galvatron_dp_core, m) { FILE: galvatron/core/args_schema.py class CoreArgs (line 46) | class CoreArgs(BaseModel): FILE: galvatron/core/arguments.py function _coerce_cli_value (line 15) | def _coerce_cli_value(raw: str) -> Any: function _legacy_cli_to_flat_map (line 33) | def _legacy_cli_to_flat_map(tokens: List[str]) -> Dict[str, Any]: function _runtime_subsection_for_key (line 52) | def _runtime_subsection_for_key(key: str) -> Optional[str]: function _legacy_cli_to_hydra_overrides (line 64) | def _legacy_cli_to_hydra_overrides(tokens: List[str]) -> List[str]: function _normalize_runtime_model_dtype (line 88) | def _normalize_runtime_model_dtype(config_dict: Dict[str, Any]) -> None: function _normalize_profiler_fields (line 115) | def _normalize_profiler_fields(config_dict: Dict[str, Any]) -> None: function load_with_hydra (line 125) | def load_with_hydra( FILE: galvatron/core/cost_model/components/embedding_lmhead_cost.py class EmbeddingLMHeadTimeCostModel (line 9) | class EmbeddingLMHeadTimeCostModel: method __init__ (line 18) | def __init__( method initialize (line 59) | def initialize(self): method estimate_computation_time (line 81) | def estimate_computation_time(self): method estimate_dp_communication_time (line 99) | def estimate_dp_communication_time(self): method estimate_tp_communication_time (line 125) | def estimate_tp_communication_time(self): method get_overlap_time (line 155) | def get_overlap_time(self, forward_comm_time, forward_comp_time, backw... method gen_result (line 168) | def gen_result(self) -> Tuple[List[float], List[float]]: class EmbeddingLMHeadMemoryCostModel (line 187) | class EmbeddingLMHeadMemoryCostModel: method __init__ (line 195) | def __init__( method initialize (line 231) | def initialize(self): method estimate_model_states_size (line 261) | def estimate_model_states_size(self): method estimate_activation_size (line 280) | def estimate_activation_size(self): method get_memory_cost (line 302) | def get_memory_cost(self): FILE: galvatron/core/cost_model/components/layer_cost.py class TimeCostModelBase (line 9) | class TimeCostModelBase: method __init__ (line 18) | def __init__( method initialize (line 58) | def initialize(self): method estimate_computation_time (line 88) | def estimate_computation_time(self): method estimate_dp_communication_time (line 105) | def estimate_dp_communication_time(self): method estimate_tp_communication_time (line 119) | def estimate_tp_communication_time(self): # TODO: split tp and sp to d... method estimate_pp_communication_time (line 152) | def estimate_pp_communication_time(self): method bct_dp_overlap (line 161) | def bct_dp_overlap(self, dp_message_size, bct): method get_result (line 180) | def get_result(self, no_gradient_sync:bool = False): method gen_result (line 210) | def gen_result(self) -> tuple[float, float]: class MemoryCostModelBase (line 215) | class MemoryCostModelBase: method __init__ (line 223) | def __init__( method initialize (line 261) | def initialize(self): method estimate_parameter_size (line 302) | def estimate_parameter_size(self): method estimate_model_states_size (line 306) | def estimate_model_states_size(self): method estimate_activation_size (line 313) | def estimate_activation_size(self): method get_memory_cost (line 322) | def get_memory_cost(self): FILE: galvatron/core/cost_model/cost_model_args.py class ModelArgs (line 6) | class ModelArgs: class TrainArgs (line 13) | class TrainArgs: class ParallelArgs (line 20) | class ParallelArgs: class ProfileModelArgs (line 29) | class ProfileModelArgs: class ProfileHardwareArgs (line 37) | class ProfileHardwareArgs: FILE: galvatron/core/cost_model/cost_model_handler.py function get_time_cost_all_stages (line 8) | def get_time_cost_all_stages(layer_timecosts, pp_stage_division): function pipeline_costmodel (line 16) | def pipeline_costmodel( FILE: galvatron/core/profiler/args_schema.py class GalvatronModelProfilerArgs (line 9) | class GalvatronModelProfilerArgs(BaseModel): class ProfilerHardwareArgs (line 40) | class ProfilerHardwareArgs(BaseModel): FILE: galvatron/core/profiler/arguments.py function galvatron_profile_args (line 1) | def galvatron_profile_args(parser): function galvatron_profile_hardware_args (line 108) | def galvatron_profile_hardware_args(parser): FILE: galvatron/core/profiler/base_profiler.py class BaseProfiler (line 4) | class BaseProfiler(): method __init__ (line 5) | def __init__(self): method set_work_dir (line 13) | def set_work_dir(self, work_dir): method set_model_name (line 16) | def set_model_name(self, model_name): method set_profile_unit (line 19) | def set_profile_unit(self, profile_unit): method set_mixed_precision (line 22) | def set_mixed_precision(self, mixed_precision): method set_specific_time_path (line 25) | def set_specific_time_path(self, specific_time_path): method set_specific_memory_path (line 28) | def set_specific_memory_path(self, specific_memory_path): method memory_profiling_path (line 31) | def memory_profiling_path(self): method time_profiling_path (line 48) | def time_profiling_path(self): FILE: galvatron/core/profiler/hardware_profiler.py class HardwareProfiler (line 9) | class HardwareProfiler(BaseProfiler): method __init__ (line 12) | def __init__(self, args: ProfilerHardwareArgs): method set_path (line 17) | def set_path(self, path: str) -> None: method get_env (line 21) | def get_env(self) -> str: method generate_script (line 39) | def generate_script(self, num_nodes: int, num_gpus_per_node: int) -> N... method generate_sp_script (line 99) | def generate_sp_script(self, num_nodes: int, num_gpus_per_node: int) -... method profile_bandwidth (line 156) | def profile_bandwidth(self) -> None: method profile_sp_bandwidth (line 161) | def profile_sp_bandwidth(self): method write_config (line 166) | def write_config(self, hardware_config_path: str, key: str, bandwidth:... method profile_overlap (line 180) | def profile_overlap(self): function _halving_tp_degrees (line 196) | def _halving_tp_degrees(world_size: int, max_tp: int) -> list[int]: function _halving_batch_sizes (line 206) | def _halving_batch_sizes(start: int = 1024) -> list[int]: function _p2p_pp_deg_sweep (line 216) | def _p2p_pp_deg_sweep(world_size: int, max_pp_deg: int) -> list[int]: function _shell_int_list (line 226) | def _shell_int_list(xs: list[int]) -> str: FILE: galvatron/core/profiler/model_profiler.py class ModelProfiler (line 15) | class ModelProfiler(BaseProfiler): method __init__ (line 18) | def __init__(self, args: GalvatronModelProfilerArgs): method set_profiler_launcher (line 42) | def set_profiler_launcher(self, path: str, model_name: Optional[str] =... method get_global_batch_size_list (line 60) | def get_global_batch_size_list(self) -> List[int]: method get_layernum_tuple_list (line 76) | def get_layernum_tuple_list(self) -> Union[List[Tuple[int]], List[Tupl... method get_seq_length_tuple_list (line 95) | def get_seq_length_tuple_list(self) -> Union[List[Tuple[int]], List[Tu... method get_basic_overrides_dict (line 138) | def get_basic_overrides_dict(self) -> Dict[str, Any]: method get_envs_dict (line 199) | def get_envs_dict(self) -> Dict[str, Any]: method dict_to_str (line 208) | def dict_to_str(self, d: dict, sep: str = "=") -> str: method launch_profiling_scripts (line 215) | def launch_profiling_scripts(self) -> None: method _launch_memory_profiling (line 231) | def _launch_memory_profiling(self) -> None: method _launch_computation_profiling (line 343) | def _launch_computation_profiling(self) -> None: method process_profiled_data (line 394) | def process_profiled_data(self) -> None: method _process_computation_data (line 422) | def _process_computation_data(self, layernum_lists: List[List[int]]) -... method _process_memory_data (line 473) | def _process_memory_data(self, world_size: int, layernum_lists: List[L... method _process_single_sequence_config (line 520) | def _process_single_sequence_config( method key_format (line 806) | def key_format( method total_memcost (line 846) | def total_memcost( method argval2str (line 883) | def argval2str(self, val: Union[List, Any]) -> str: method arg2str (line 896) | def arg2str(self, key: str, val: Union[List, Any]) -> str: method args2str (line 908) | def args2str(self, args: Union[Dict, List[Tuple]], exclude_args: List[... method env_args (line 929) | def env_args(self) -> Dict[str, Union[str, int]]: method launch_scripts (line 952) | def launch_scripts(self, env_args: Dict[str, str]) -> str: FILE: galvatron/core/profiler/runtime_profiler.py class RuntimeProfiler (line 12) | class RuntimeProfiler(BaseProfiler): method __init__ (line 15) | def __init__(self, args: GalvatronRuntimeArgs): method set_profiler_dist (line 24) | def set_profiler_dist( method set_profiler_single (line 64) | def set_profiler_single(self, start_iter=10, end_iter=20): method set_model_layer_configs (line 76) | def set_model_layer_configs(self, model_layer_configs: Optional[List[D... method set_memory_profiler (line 92) | def set_memory_profiler(self, rank: int, profile_ranks: List[int] = []... method profile_memory (line 105) | def profile_memory(self, iter: int, stage: str = "") -> None: method post_profile_memory (line 134) | def post_profile_memory(self, iter: int) -> None: method set_time_profiler (line 197) | def set_time_profiler(self, start_iter: int, end_iter: int, exit: bool... method profile_time_start (line 218) | def profile_time_start(self, iter: int) -> None: method profile_time_end (line 233) | def profile_time_end( method profile_time_python (line 260) | def profile_time_python(self, iter: int) -> None: method _process_time_results (line 290) | def _process_time_results(self) -> None: method _filtered_time_samples (line 312) | def _filtered_time_samples(self) -> List[float]: method _log_iteration_stats (line 333) | def _log_iteration_stats( FILE: galvatron/core/profiler/utils.py function print_peak_memory (line 8) | def print_peak_memory(prefix, device, type="allocated"): function save_profiled_memory (line 22) | def save_profiled_memory( function save_profiled_time (line 57) | def save_profiled_time(path, time, bsz, layer_num, seq): FILE: galvatron/core/runtime/__init__.py function _reshard (line 23) | def _reshard( FILE: galvatron/core/runtime/args_schema.py class GalvatronParallelArgs (line 18) | class GalvatronParallelArgs(BaseModel): class GalvatronModelArgs (line 51) | class GalvatronModelArgs(BaseModel): method model_type (line 174) | def model_type(self): class GalvatronProfileArgs (line 178) | class GalvatronProfileArgs(BaseModel): class CommonTrainArgs (line 195) | class CommonTrainArgs(BaseModel): function _str_to_list (line 262) | def _str_to_list(v): class CommonDataArgs (line 271) | class CommonDataArgs(BaseModel): method str_to_list (line 298) | def str_to_list(cls, v): class CommonCkptArgs (line 323) | class CommonCkptArgs(BaseModel): class LoggingConfig (line 335) | class LoggingConfig(BaseModel): class GalvatronRuntimeArgs (line 344) | class GalvatronRuntimeArgs(BaseModel): FILE: galvatron/core/runtime/checkpoint/gpt_adapter.py function load_hf_checkpoint (line 18) | def load_hf_checkpoint(load, tp_groups, name, submodule, module): function load_gpt_module (line 154) | def load_gpt_module(load, tp_groups, name, submodule, module, distribute... FILE: galvatron/core/runtime/checkpoint/llama_adapter.py function load_distributed_checkpoint (line 30) | def load_distributed_checkpoint(load, tp_groups, name, submodule, module): function load_hf_checkpoint (line 51) | def load_hf_checkpoint(load, tp_groups, name, submodule, module): function load_llama_module (line 164) | def load_llama_module(load, tp_groups, name, submodule, module, distribu... function save_llama_module (line 172) | def save_llama_module(save_path, model, optimizer, opt_param_scheduler, ... FILE: galvatron/core/runtime/checkpoint/moe_adapter.py function _runtime_args (line 37) | def _runtime_args(): function _load_file (line 45) | def _load_file(path): function _copy_module_state (line 49) | def _copy_module_state(checkpoint, name, submodule): function load_distributed_checkpoint (line 58) | def load_distributed_checkpoint(load, tp_groups, name, submodule, module... function _load_embedding_from_hf (line 102) | def _load_embedding_from_hf(load, tp_groups, submodule): function _load_lm_head_from_hf (line 123) | def _load_lm_head_from_hf(load, tp_groups, submodule): function _load_attention_from_hf (line 144) | def _load_attention_from_hf(checkpoint, tp_groups, name, submodule): function _load_router_from_hf (line 185) | def _load_router_from_hf(checkpoint, submodule): function _load_mlp_from_hf (line 192) | def _load_mlp_from_hf(checkpoint, tp_groups, name, submodule, module): function load_hf_checkpoint (line 225) | def load_hf_checkpoint(load, tp_groups, name, submodule, module, ep_grou... function load_moe_module (line 258) | def load_moe_module(load, tp_groups, name, submodule, module, distribute... function save_moe_module (line 266) | def save_moe_module(save_path, model, optimizer, opt_param_scheduler, it... FILE: galvatron/core/runtime/comm_groups.py class CommGroup (line 4) | class CommGroup(object): method __init__ (line 5) | def __init__(self, ranks:List[int]): method has_rank (line 10) | def has_rank(self, rank): method print (line 13) | def print(self): function show_groups (line 17) | def show_groups(groups:List[CommGroup]): function build_rank_to_parallel_coords (line 26) | def build_rank_to_parallel_coords(world_size, name2size, order='pp-dp-cp... function get_groups (line 44) | def get_groups(degree_rank_dict:Dict[int, Dict[str, int]], ignore_keys=[... function get_embedding_group (line 66) | def get_embedding_group(pp_size, pp_group:CommGroup, manual_global_rank=... function merge_redistributed_group (line 73) | def merge_redistributed_group(split_tp_sp_cp_group:CommGroup, allgather_... function gen_comm_groups (line 108) | def gen_comm_groups( FILE: galvatron/core/runtime/dataloader.py class FakeCausalLMDataset (line 35) | class FakeCausalLMDataset(Dataset): method __init__ (line 38) | def __init__(self, args, device, dataset_size=2560 * 16): method __len__ (line 45) | def __len__(self): method __getitem__ (line 48) | def __getitem__(self, idx): function random_collate_fn (line 52) | def random_collate_fn(batch): function build_pretraining_data_loader (line 73) | def build_pretraining_data_loader(dataset, consumed_samples): class MegatronPretrainingSampler (line 113) | class MegatronPretrainingSampler: method __init__ (line 115) | def __init__(self, total_samples, consumed_samples, micro_batch_size, method __len__ (line 138) | def __len__(self): method get_start_end_idx (line 141) | def get_start_end_idx(self): method __iter__ (line 146) | def __iter__(self): class RandomSeedDataset (line 162) | class RandomSeedDataset(Dataset): method __init__ (line 164) | def __init__(self, dataset): method __len__ (line 170) | def __len__(self): method set_epoch (line 173) | def set_epoch(self, epoch): method __getitem__ (line 176) | def __getitem__(self, idx): class MegatronPretrainingRandomSampler (line 184) | class MegatronPretrainingRandomSampler: method __init__ (line 186) | def __init__(self, dataset, total_samples, consumed_samples, micro_bat... method __len__ (line 210) | def __len__(self): method __iter__ (line 213) | def __iter__(self): function get_blend_and_blend_per_split (line 254) | def get_blend_and_blend_per_split(args): function get_train_valid_test_num_samples (line 299) | def get_train_valid_test_num_samples(): function build_train_valid_test_datasets (line 321) | def build_train_valid_test_datasets(build_train_valid_test_datasets_prov... function build_train_valid_test_data_loaders (line 331) | def build_train_valid_test_data_loaders( function build_train_valid_test_data_iterators (line 389) | def build_train_valid_test_data_iterators( function _build_random_data_iterator (line 442) | def _build_random_data_iterator(): function get_train_valid_test_data_iterators (line 460) | def get_train_valid_test_data_iterators(): function get_batch (line 509) | def get_batch(data_iterator): function _loss_func (line 541) | def _loss_func(micro_lossmask, label: List, output_tensor: List): FILE: galvatron/core/runtime/datasets/megatron/blended_dataset.py class BlendedDataset (line 24) | class BlendedDataset(torch.utils.data.Dataset): method __init__ (line 41) | def __init__( method __len__ (line 90) | def __len__(self) -> int: method __getitem__ (line 93) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: method _build_indices (line 98) | def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: FILE: galvatron/core/runtime/datasets/megatron/blended_megatron_dataset_builder.py function need_to_build_dataset (line 28) | def need_to_build_dataset(): class BlendedMegatronDatasetBuilder (line 39) | class BlendedMegatronDatasetBuilder(object): method __init__ (line 54) | def __init__( method build (line 94) | def build(self) -> List[Optional[TopLevelDataset]]: method _build_blended_dataset_splits (line 186) | def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDatas... method _build_megatron_datasets_parallel (line 353) | def _build_megatron_datasets_parallel( method _build_megatron_dataset_splits (line 435) | def _build_megatron_dataset_splits( method build_generic_dataset (line 502) | def build_generic_dataset( function _get_size_per_split_per_dataset (line 561) | def _get_size_per_split_per_dataset( FILE: galvatron/core/runtime/datasets/megatron/blended_megatron_dataset_config.py class BlendedMegatronDatasetConfig (line 16) | class BlendedMegatronDatasetConfig: method __post_init__ (line 66) | def __post_init__(self) -> None: function parse_and_normalize_split (line 109) | def parse_and_normalize_split(split: str) -> List[float]: function convert_split_vector_to_split_matrix (line 129) | def convert_split_vector_to_split_matrix( FILE: galvatron/core/runtime/datasets/megatron/gpt_dataset.py class GPTDatasetConfig (line 26) | class GPTDatasetConfig(BlendedMegatronDatasetConfig): method __post_init__ (line 54) | def __post_init__(self) -> None: class GPTDataset (line 65) | class GPTDataset(MegatronDataset): method __init__ (line 83) | def __init__( method numel_low_level_dataset (line 117) | def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int: method build_low_level_dataset (line 132) | def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfi... method __len__ (line 152) | def __len__(self) -> int: method __getitem__ (line 160) | def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]: method _query_document_sample_shuffle_indices (line 233) | def _query_document_sample_shuffle_indices( method _build_document_sample_shuffle_indices (line 304) | def _build_document_sample_shuffle_indices( method _get_num_tokens_per_epoch (line 525) | def _get_num_tokens_per_epoch(self) -> int: method _get_num_epochs (line 533) | def _get_num_epochs(self, num_tokens_per_epoch: int) -> int: function _build_document_index (line 556) | def _build_document_index( function _build_shuffle_index (line 589) | def _build_shuffle_index( function _get_ltor_masks_and_position_ids (line 620) | def _get_ltor_masks_and_position_ids( class MockGPTLowLevelDataset (line 697) | class MockGPTLowLevelDataset: method __init__ (line 717) | def __init__(self, tokenizer: MegatronTokenizer) -> None: method __len__ (line 724) | def __len__(self) -> int: method __getitem__ (line 727) | def __getitem__(self, idx: int) -> numpy.number: method get (line 734) | def get(self, idx: int, offset: int = 0, length: Optional[int] = None)... class MockGPTDataset (line 752) | class MockGPTDataset(GPTDataset): method __init__ (line 770) | def __init__( method numel_low_level_dataset (line 784) | def numel_low_level_dataset(low_level_dataset: MockGPTLowLevelDataset)... method build_low_level_dataset (line 796) | def build_low_level_dataset( FILE: galvatron/core/runtime/datasets/megatron/helpers.cpp function build_exhaustive_blending_indices (line 21) | void build_exhaustive_blending_indices(py::array_t &dataset_ind... function build_blending_indices (line 75) | void build_blending_indices(py::array_t &dataset_index, function build_sample_idx (line 143) | py::array_t build_sample_idx( function get_target_sample_len (line 248) | inline int32_t get_target_sample_len(const int32_t short_seq_ratio, function build_mapping_impl (line 266) | py::array build_mapping_impl(const py::array_t &docs_, function build_mapping (line 526) | py::array build_mapping(const py::array_t &docs_, function build_blocks_mapping_impl (line 564) | py::array build_blocks_mapping_impl(const py::array_t &docs_, function build_blocks_mapping (line 805) | py::array build_blocks_mapping(const py::array_t &docs_, function PYBIND11_MODULE (line 838) | PYBIND11_MODULE(helpers_cpp, m) FILE: galvatron/core/runtime/datasets/megatron/helpers.py function build_sample_idx (line 11) | def build_sample_idx( FILE: galvatron/core/runtime/datasets/megatron/indexed_dataset.py class DType (line 41) | class DType(Enum): method code_from_dtype (line 54) | def code_from_dtype(cls, value: Type[numpy.number]) -> int: method dtype_from_code (line 66) | def dtype_from_code(cls, value: int) -> Type[numpy.number]: method size (line 78) | def size(key: Union[int, Type[numpy.number]]) -> int: method optimal_dtype (line 98) | def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]: class _IndexWriter (line 113) | class _IndexWriter(object): method __init__ (line 122) | def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None: method __enter__ (line 126) | def __enter__(self) -> "_IndexWriter": method __exit__ (line 141) | def __exit__( method write (line 161) | def write( method _sequence_pointers (line 206) | def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]: class _IndexReader (line 224) | class _IndexReader(object): method __init__ (line 233) | def __init__(self, idx_path: str, multimodal: bool) -> None: method __del__ (line 313) | def __del__(self) -> None: method __len__ (line 319) | def __len__(self) -> int: method __getitem__ (line 328) | def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Opt... class _BinReader (line 344) | class _BinReader(ABC): method read (line 348) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... class _MMapBinReader (line 364) | class _MMapBinReader(_BinReader): method __init__ (line 371) | def __init__(self, bin_path: str) -> None: method read (line 375) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... method __del__ (line 390) | def __del__(self) -> None: class _FileBinReader (line 397) | class _FileBinReader(_BinReader): method __init__ (line 404) | def __init__(self, bin_path: str) -> None: method read (line 407) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... class _S3BinReader (line 427) | class _S3BinReader(_BinReader): method __init__ (line 436) | def __init__(self, bin_path: str, bin_chunk_nbytes: int) -> None: method _extract_from_cache (line 445) | def _extract_from_cache(self, offset: int, size: int) -> bytes: method read (line 453) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... method __del__ (line 501) | def __del__(self) -> None: class IndexedDataset (line 506) | class IndexedDataset(torch.utils.data.Dataset): method __init__ (line 519) | def __init__( method initialize (line 542) | def initialize( method __getstate__ (line 582) | def __getstate__(self) -> Tuple[str, bool, bool, Optional[S3Config]]: method __setstate__ (line 590) | def __setstate__(self, state: Tuple[str, bool, bool, Optional[S3Config... method __del__ (line 599) | def __del__(self) -> None: method __len__ (line 604) | def __len__(self) -> int: method __getitem__ (line 612) | def __getitem__( method get (line 653) | def get(self, idx: int, offset: int = 0, length: Optional[int] = None)... method sequence_lengths (line 679) | def sequence_lengths(self) -> numpy.ndarray: method document_indices (line 688) | def document_indices(self) -> numpy.ndarray: method get_document_indices (line 696) | def get_document_indices(self) -> numpy.ndarray: method set_document_indices (line 706) | def set_document_indices(self, document_indices: numpy.ndarray) -> None: method sequence_modes (line 717) | def sequence_modes(self) -> numpy.ndarray: method exists (line 726) | def exists(path_prefix: str) -> bool: class IndexedDatasetBuilder (line 745) | class IndexedDatasetBuilder(object): method __init__ (line 756) | def __init__( method add_item (line 767) | def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None: method add_document (line 781) | def add_document( method end_document (line 800) | def end_document(self) -> None: method add_index (line 804) | def add_index(self, path_prefix: str) -> None: method finalize (line 825) | def finalize(self, idx_path: str) -> None: function get_idx_path (line 836) | def get_idx_path(path_prefix: str) -> str: function get_bin_path (line 848) | def get_bin_path(path_prefix: str) -> str: FILE: galvatron/core/runtime/datasets/megatron/megatron_dataset.py class MegatronDataset (line 19) | class MegatronDataset(ABC, torch.utils.data.Dataset): method __init__ (line 36) | def __init__( method numel_low_level_dataset (line 71) | def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: method build_low_level_dataset (line 88) | def build_low_level_dataset( method _key_config_attributes (line 109) | def _key_config_attributes() -> List[str]: method __len__ (line 121) | def __len__(self) -> int: method __getitem__ (line 130) | def __getitem__(self, idx: int) -> Dict[str, Union[torch.Tensor, numpy... FILE: galvatron/core/runtime/datasets/megatron/megatron_tokenizer.py class MegatronTokenizer (line 10) | class MegatronTokenizer(ABC): method __init__ (line 22) | def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any): method tokenize (line 35) | def tokenize(self, text: str) -> numpy.ndarray: method detokenize (line 46) | def detokenize(self, ids: numpy.ndarray) -> str: method offsets (line 60) | def offsets(self, ids: list[int], text: str) -> list[int]: method vocab (line 77) | def vocab(self): method inv_vocab (line 83) | def inv_vocab(self): method vocab_size (line 89) | def vocab_size(self): method cls (line 94) | def cls(self): method sep (line 103) | def sep(self): method pad (line 112) | def pad(self): method eod (line 121) | def eod(self): method bos (line 130) | def bos(self): method eos (line 139) | def eos(self): method mask (line 148) | def mask(self): FILE: galvatron/core/runtime/datasets/megatron/tokenizer.py function _vocab_size_with_padding (line 7) | def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True): function build_tokenizer (line 23) | def build_tokenizer(args: GalvatronRuntimeArgs, **kwargs): class _HuggingFaceTokenizer (line 34) | class _HuggingFaceTokenizer(MegatronTokenizer): method __init__ (line 35) | def __init__(self, pretrained_model_name_or_path, **kwargs): method vocab_size (line 52) | def vocab_size(self): method vocab (line 56) | def vocab(self): method inv_vocab (line 61) | def inv_vocab(self): method decoder (line 66) | def decoder(self): method tokenize (line 69) | def tokenize(self, text, **kwargs): method detokenize (line 72) | def detokenize(self, token_ids, **kwargs): method offsets (line 75) | def offsets(self, ids: list[int], text: str) -> list[int]: method eod (line 88) | def eod(self): FILE: galvatron/core/runtime/datasets/megatron/utils.py class Split (line 15) | class Split(Enum): function compile_helpers (line 21) | def compile_helpers(): function normalize (line 34) | def normalize(weights: List[float]) -> List[float]: function get_blend_from_list (line 49) | def get_blend_from_list( FILE: galvatron/core/runtime/datasets/megatron/utils_s3.py class S3Config (line 16) | class S3Config(NamedTuple): class S3Client (line 34) | class S3Client(Protocol): method download_file (line 37) | def download_file(self, Bucket: str, Key: str, Filename: str) -> None:... method upload_file (line 39) | def upload_file(self, Filename: str, Bucket: str, Key: str) -> None: ... method head_object (line 41) | def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]: ... method get_object (line 43) | def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, A... method close (line 45) | def close(self) -> None: ... function is_s3_path (line 48) | def is_s3_path(path: str) -> bool: function parse_s3_path (line 60) | def parse_s3_path(path: str) -> Tuple[str, str]: function object_exists (line 80) | def object_exists(client: S3Client, path: str) -> bool: function _download_file (line 103) | def _download_file(client: S3Client, s3_path: str, local_path: str) -> N... function maybe_download_file (line 119) | def maybe_download_file(s3_path: str, local_path: str) -> None: FILE: galvatron/core/runtime/datasets/random_dataset.py class RandomTokenDataset (line 11) | class RandomTokenDataset(Dataset): method __init__ (line 25) | def __init__(self, vocab_size: int, seq_length: int, size: int = 256): method __len__ (line 28) | def __len__(self) -> int: method __getitem__ (line 31) | def __getitem__(self, idx: int) -> torch.Tensor: function random_collate_fn (line 35) | def random_collate_fn(batch): FILE: galvatron/core/runtime/hybrid_parallel_config.py function get_pp_ranks_enc (line 10) | def get_pp_ranks_enc(pp_divide): function get_hybrid_parallel_configs_api (line 18) | def get_hybrid_parallel_configs_api(args:GalvatronRuntimeArgs): function check_hp_config (line 186) | def check_hp_config(hp_configs, layernum_list): function print_hp_config (line 216) | def print_hp_config(key, val): function print_hp_configs (line 223) | def print_hp_configs(hp_configs): function hp_config_whole_model (line 229) | def hp_config_whole_model(module_types, hp_configs, vocab_sdp=0, embed_c... function get_enc_groups (line 317) | def get_enc_groups(groups_whole, module_types): function mixed_precision_dtype (line 326) | def mixed_precision_dtype(mixed_precision): function layer_shapes_dtypes_whole_model (line 330) | def layer_shapes_dtypes_whole_model(module_types, layernum_list, layer_s... function get_chunks (line 359) | def get_chunks(args): FILE: galvatron/core/runtime/hybrid_parallel_model.py class GalvatronModel (line 42) | class GalvatronModel(nn.Module): method __init__ (line 43) | def __init__(self, hp_model: PipelineParallel): method forward_backward (line 51) | def forward_backward(self, batch, iter=None, profiler=None, loss_func=... method fake_tensor (line 81) | def fake_tensor(self, x): method fake_loss_func (line 84) | def fake_loss_func(self, labels, outputs): method loss_to_cpu (line 90) | def loss_to_cpu(self, loss): function construct_hybrid_parallel_model_api (line 99) | def construct_hybrid_parallel_model_api( FILE: galvatron/core/runtime/initialize.py function init_empty_weights (line 15) | def init_empty_weights(include_buffers: bool = True): function init_on_device (line 47) | def init_on_device(device: torch.device, include_buffers: bool = True): function _initialize_distributed (line 114) | def _initialize_distributed(args:GalvatronRuntimeArgs): function initialize_galvatron (line 142) | def initialize_galvatron(args:GalvatronRuntimeArgs): function _compile_dependencies (line 163) | def _compile_dependencies(): function validate_args (line 190) | def validate_args(args:GalvatronRuntimeArgs): function _print_args (line 240) | def _print_args(args:GalvatronRuntimeArgs, title: str = "arguments"): FILE: galvatron/core/runtime/models/arch.py function arch_to_module_types (line 55) | def arch_to_module_types(arch_list: List[str]) -> List[str]: class ModelInfo (line 63) | class ModelInfo: method __init__ (line 64) | def __init__(self): method set_layernums (line 67) | def set_layernums(self, info): method set_shapes (line 70) | def set_shapes(self, info): method set_dtypes (line 73) | def set_dtypes(self, info): method set_module_types (line 76) | def set_module_types(self, info): method layernums (line 79) | def layernums(self): method shapes (line 82) | def shapes(self): method dtypes (line 85) | def dtypes(self): method module_types (line 88) | def module_types(self): class ArchModelInfo (line 96) | class ArchModelInfo(ModelInfo): method __init__ (line 99) | def __init__(self, arch_list: List[str], args:GalvatronRuntimeArgs): class BlockNames (line 127) | class BlockNames: FILE: galvatron/core/runtime/models/builder.py function build_sequential_from_arch (line 42) | def build_sequential_from_arch( function build_causal_lm_arch (line 111) | def build_causal_lm_arch(args:GalvatronRuntimeArgs) -> List[str]: function get_block_names (line 124) | def get_block_names(args:GalvatronRuntimeArgs): function build_model (line 158) | def build_model(args:GalvatronRuntimeArgs): function get_runtime_profiler (line 190) | def get_runtime_profiler(args, path, start_iter=10, end_iter=20): FILE: galvatron/core/runtime/models/modules.py class GalvatronEmbedding (line 35) | class GalvatronEmbedding(nn.Module): method __init__ (line 41) | def __init__(self, args: GalvatronRuntimeArgs, tp_group=None, sp_group... method forward (line 77) | def forward(self, input_ids, position_ids=None, attention_mask=None, l... class GalvatronAttention (line 103) | class GalvatronAttention(nn.Module): method __init__ (line 106) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, tp_group=Non... method _get_rotary_pos_emb (line 163) | def _get_rotary_pos_emb(self, hidden_states): method forward (line 178) | def forward(self, hidden_states, position_ids, attention_mask, rotary_... class GalvatronMLP (line 192) | class GalvatronMLP(nn.Module): method __init__ (line 195) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, tp_group=Non... method forward (line 210) | def forward(self, hidden_states): class GalvatronDecoderLayer (line 223) | class GalvatronDecoderLayer(nn.Module): method __init__ (line 226) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, tp_group=Non... method forward (line 232) | def forward(self, hidden_states, position_ids=None, attention_mask=Non... class GalvatronFinalNorm (line 242) | class GalvatronFinalNorm(nn.Module): method __init__ (line 245) | def __init__(self, args: GalvatronRuntimeArgs): method forward (line 250) | def forward(self, hidden_states, position_ids=None, attention_mask=Non... class _LMHeadLinear (line 258) | class _LMHeadLinear(nn.Module): method __init__ (line 261) | def __init__(self, config, sequence_parallel, tp_group): method forward (line 278) | def forward(self, hidden_states): class GalvatronCausalLMHead (line 290) | class GalvatronCausalLMHead(nn.Module): method __init__ (line 293) | def __init__(self, args: GalvatronRuntimeArgs, tp_group=None, sp_group... method forward (line 315) | def forward(self, hidden_states, position_ids=None, attention_mask=Non... FILE: galvatron/core/runtime/models/moe_modules.py class GalvatronMoEAttention (line 19) | class GalvatronMoEAttention(nn.Module): method __init__ (line 20) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, tp_group=Non... method forward (line 26) | def forward(self, hidden_states, position_ids=None, attention_mask=Non... class GalvatronMoERouter (line 33) | class GalvatronMoERouter(nn.Module): method __init__ (line 34) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx): method reset_parameters (line 43) | def reset_parameters(self): method forward (line 50) | def forward(self, hidden_states): class GalvatronMoEMLP (line 56) | class GalvatronMoEMLP(nn.Module): method __init__ (line 57) | def __init__(self, args: GalvatronRuntimeArgs, layer_idx, ep_group=Non... method forward (line 121) | def forward(self, hidden_states, mlp_residual, probs, routing_map): class GalvatronMoEDecoderLayer (line 131) | class GalvatronMoEDecoderLayer(nn.Module): method __init__ (line 134) | def __init__( method forward (line 151) | def forward(self, hidden_states, position_ids=None, attention_mask=Non... FILE: galvatron/core/runtime/moe/fused_a2a.py function get_hidden_bytes (line 18) | def get_hidden_bytes(x: torch.Tensor) -> int: function get_buffer (line 30) | def get_buffer(group: torch.distributed.ProcessGroup, hidden_bytes: int): class FusedDispatch (line 66) | class FusedDispatch(torch.autograd.Function): method forward (line 70) | def forward(ctx, x, token_indices, token_probs, num_experts, group, pr... method backward (line 119) | def backward( class FusedCombine (line 137) | class FusedCombine(torch.autograd.Function): method forward (line 141) | def forward(ctx, x, group, handle, previous_event=None): method backward (line 153) | def backward(ctx, grad_output, previous_event=None): function fused_dispatch (line 168) | def fused_dispatch(x, token_indices, token_probs, num_experts, group, pr... function fused_combine (line 186) | def fused_combine(x, group, handle, previous_event=None): FILE: galvatron/core/runtime/moe/fused_kernels.py function moe_unpermute (line 10) | def moe_unpermute( class _moe_unpermute_mask_map (line 55) | class _moe_unpermute_mask_map(torch.autograd.Function): method forward (line 59) | def forward( method backward (line 105) | def backward(ctx, unpermuted_act_grad): function triton_unpermute_with_mask_map (line 147) | def triton_unpermute_with_mask_map( function _unpermute_kernel (line 199) | def _unpermute_kernel( function triton_unpermute_with_mask_map_bwd_with_merging_probs (line 266) | def triton_unpermute_with_mask_map_bwd_with_merging_probs( function _unpermute_bwd_with_merging_probs_kernel (line 317) | def _unpermute_bwd_with_merging_probs_kernel( function moe_permute (line 398) | def moe_permute( class _moe_permute_mask_map (line 440) | class _moe_permute_mask_map(torch.autograd.Function): method forward (line 444) | def forward( method backward (line 487) | def backward( function triton_make_row_id_map (line 514) | def triton_make_row_id_map( function _row_id_map_pass_1_kernel (line 545) | def _row_id_map_pass_1_kernel( function _row_id_map_pass_2_kernel (line 576) | def _row_id_map_pass_2_kernel( function triton_permute_with_mask_map (line 607) | def triton_permute_with_mask_map( function _permute_kernel (line 654) | def _permute_kernel( class _moe_chunk_sort (line 698) | class _moe_chunk_sort(torch.autograd.Function): method forward (line 702) | def forward( method backward (line 737) | def backward( function moe_sort_chunks_by_index (line 762) | def moe_sort_chunks_by_index( function _sort_chunks_by_idxs_kernel (line 796) | def _sort_chunks_by_idxs_kernel( function sort_chunks_by_idx (line 874) | def sort_chunks_by_idx( function _sort_chunks_by_map (line 924) | def _sort_chunks_by_map( function sort_chunks_by_map (line 962) | def sort_chunks_by_map( FILE: galvatron/core/runtime/moe/grouped_gemm_util.py function grouped_gemm_is_available (line 9) | def grouped_gemm_is_available(): function assert_grouped_gemm_is_available (line 14) | def assert_grouped_gemm_is_available(): FILE: galvatron/core/runtime/moe/mlp.py class GroupedMLP (line 26) | class GroupedMLP(torch.nn.Module): method __init__ (line 32) | def __init__( method forward (line 99) | def forward(self, permuted_local_hidden_states: torch.Tensor, tokens_p... class SequentialMLP (line 128) | class SequentialMLP(torch.nn.Module): method __init__ (line 134) | def __init__( method _pad_tensor_for_fp8 (line 164) | def _pad_tensor_for_fp8(self, hidden): method forward (line 176) | def forward(self, permuted_local_hidden_states: torch.Tensor, tokens_p... class SharedExpertMLP (line 215) | class SharedExpertMLP(MLP): method __init__ (line 224) | def __init__(self, config: GalvatronModelArgs, submodules: MLPSubmodul... method forward (line 271) | def forward(self, hidden_states): method pre_forward_comm (line 280) | def pre_forward_comm(self, input): method linear_fc1_forward_and_act (line 301) | def linear_fc1_forward_and_act(self, overlapped_comm_output=None): method linear_fc2_forward (line 348) | def linear_fc2_forward(self, overlapped_comm_output=None): method post_forward_comm (line 363) | def post_forward_comm(self): method get_output (line 383) | def get_output(self): function set_tensor_grad_fn_sequence_sr (line 403) | def set_tensor_grad_fn_sequence_sr(tensor, value): FILE: galvatron/core/runtime/moe/moe_utils.py function switch_load_balancing_loss_func (line 14) | def switch_load_balancing_loss_func( function sequence_load_balancing_loss_func (line 62) | def sequence_load_balancing_loss_func( function z_loss_func (line 115) | def z_loss_func(logits, z_loss_coeff): function sinkhorn (line 130) | def sinkhorn(cost: torch.Tensor, tol: float = 0.0001): function get_capacity (line 147) | def get_capacity(num_tokens: int, num_experts: int, capacity_factor: flo... class MoEAuxLossAutoScaler (line 166) | class MoEAuxLossAutoScaler(torch.autograd.Function): method forward (line 172) | def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor): method backward (line 186) | def backward(ctx, grad_output: torch.Tensor): method set_loss_scale (line 206) | def set_loss_scale(scale: torch.Tensor): function permute (line 219) | def permute( function unpermute (line 280) | def unpermute( function sort_chunks_by_idxs (line 356) | def sort_chunks_by_idxs( function group_limited_topk (line 372) | def group_limited_topk( function topk_softmax_with_capacity (line 430) | def topk_softmax_with_capacity( function save_to_aux_losses_tracker (line 547) | def save_to_aux_losses_tracker( function clear_aux_losses_tracker (line 577) | def clear_aux_losses_tracker(): function reduce_aux_losses_tracker_across_ranks (line 586) | def reduce_aux_losses_tracker_across_ranks(): function track_moe_metrics (line 604) | def track_moe_metrics( function get_updated_expert_bias (line 645) | def get_updated_expert_bias(tokens_per_expert, expert_bias, expert_bias_... function maybe_move_tensor_to_cpu (line 665) | def maybe_move_tensor_to_cpu(tensor, as_numpy=False, record_stream=False): FILE: galvatron/core/runtime/moe/router.py class Router (line 22) | class Router(ABC, torch.nn.Module): method __init__ (line 25) | def __init__(self, config: GalvatronModelArgs) -> None: method gating (line 49) | def gating(self, input: torch.Tensor): method routing (line 71) | def routing(self, logits: torch.Tensor): method forward (line 84) | def forward(self, input: torch.Tensor): method set_layer_idx (line 93) | def set_layer_idx(self, layer_idx: int): class TopKRouter (line 98) | class TopKRouter(Router): method __init__ (line 101) | def __init__(self, config: GalvatronModelArgs) -> None: method _maintain_float32_expert_bias (line 129) | def _maintain_float32_expert_bias(self): method sinkhorn_load_balancing (line 140) | def sinkhorn_load_balancing(self, logits: torch.Tensor): method compute_routing_scores_for_aux_loss (line 173) | def compute_routing_scores_for_aux_loss(self, logits: torch.Tensor) ->... method aux_loss_load_balancing (line 193) | def aux_loss_load_balancing(self, logits: torch.Tensor): method seq_aux_loss_load_balancing (line 233) | def seq_aux_loss_load_balancing(self, logits: torch.Tensor, bsz: int, ... method apply_load_balancing_loss (line 278) | def apply_load_balancing_loss( method apply_z_loss (line 316) | def apply_z_loss(self, logits): method apply_input_jitter (line 350) | def apply_input_jitter(self, input: torch.Tensor): method routing (line 371) | def routing(self, logits: torch.Tensor): method forward (line 423) | def forward(self, input: torch.Tensor): FILE: galvatron/core/runtime/moe/token_dispatcher.py class MoETokenDispatcher (line 37) | class MoETokenDispatcher: method __init__ (line 42) | def __init__( method ep_group (line 62) | def ep_group(self): method tp_group (line 67) | def tp_group(self): method tp_rank (line 72) | def tp_rank(self): method tp_ep_group (line 77) | def tp_ep_group(self): method token_permutation (line 82) | def token_permutation( method token_unpermutation (line 98) | def token_unpermutation(self, expert_output: torch.Tensor, bias: torch... method set_shared_experts (line 110) | def set_shared_experts(self, shared_experts): class MoEAllGatherTokenDispatcher (line 116) | class MoEAllGatherTokenDispatcher(MoETokenDispatcher): method __init__ (line 122) | def __init__( method token_permutation (line 150) | def token_permutation( method token_unpermutation (line 216) | def token_unpermutation(self, hidden_states: torch.Tensor, bias: torch... class MoEAlltoAllTokenDispatcher (line 287) | class MoEAlltoAllTokenDispatcher(MoETokenDispatcher): method __init__ (line 297) | def __init__( method preprocess (line 379) | def preprocess(self, routing_map: torch.Tensor) -> torch.Tensor: method token_permutation (line 510) | def token_permutation( method token_unpermutation (line 606) | def token_unpermutation( method _maybe_update_cuda_sync_point (line 691) | def _maybe_update_cuda_sync_point(self, point: str): method _maybe_dtoh_and_synchronize (line 702) | def _maybe_dtoh_and_synchronize( class _DispatchManager (line 743) | class _DispatchManager(ABC): method setup_metadata (line 756) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): method dispatch (line 761) | def dispatch(self, hidden_states: torch.Tensor) -> torch.Tensor: method combine (line 766) | def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: method get_dispached_metadata (line 771) | def get_dispached_metadata(self) -> torch.Tensor: method get_permuted_hidden_states_by_experts (line 776) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T... method get_restored_hidden_states_by_experts (line 781) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T... class _DeepepManager (line 786) | class _DeepepManager(_DispatchManager): method __init__ (line 808) | def __init__( method setup_metadata (line 838) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): method dispatch (line 850) | def dispatch(self, hidden_states: torch.Tensor) -> torch.Tensor: method _indices_to_multihot (line 868) | def _indices_to_multihot(self, indices, probs): method get_dispached_metadata (line 899) | def get_dispached_metadata(self) -> torch.Tensor: method get_number_of_tokens_per_expert (line 902) | def get_number_of_tokens_per_expert(self) -> torch.Tensor: method combine (line 908) | def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: method get_permuted_hidden_states_by_experts (line 914) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T... method get_restored_hidden_states_by_experts (line 927) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T... class MoEFlexTokenDispatcher (line 942) | class MoEFlexTokenDispatcher(MoETokenDispatcher): method __init__ (line 947) | def __init__( method set_shared_experts (line 980) | def set_shared_experts(self, shared_experts): method _initialize_metadata (line 985) | def _initialize_metadata(self, routing_map: torch.Tensor, probs: torch... method token_permutation (line 1012) | def token_permutation( method token_unpermutation (line 1030) | def token_unpermutation( FILE: galvatron/core/runtime/optimizer/clip_grads.py function local_multi_tensor_applier (line 11) | def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): function local_multi_tensor_l2_norm (line 18) | def local_multi_tensor_l2_norm(chunk_size, noop_flag, tensor_lists, per_... function local_multi_tensor_scale (line 30) | def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): function get_grad_norm_fp32 (line 66) | def get_grad_norm_fp32( function clip_grad_by_total_norm_fp32 (line 154) | def clip_grad_by_total_norm_fp32( FILE: galvatron/core/runtime/optimizer/num_microbatches_calculator.py function get_num_microbatches (line 17) | def get_num_microbatches() -> int: function get_current_global_batch_size (line 22) | def get_current_global_batch_size() -> int: function get_micro_batch_size (line 27) | def get_micro_batch_size() -> int: function get_current_running_global_batch_size (line 32) | def get_current_running_global_batch_size() -> int: function update_num_microbatches (line 38) | def update_num_microbatches( function unset_num_microbatches_calculator (line 54) | def unset_num_microbatches_calculator(): function init_num_microbatches_calculator (line 64) | def init_num_microbatches_calculator( function destroy_num_microbatches_calculator (line 101) | def destroy_num_microbatches_calculator(): function reconfigure_num_microbatches_calculator (line 107) | def reconfigure_num_microbatches_calculator( function _configure_global_num_microbatches_calculator (line 144) | def _configure_global_num_microbatches_calculator( function _build_num_microbatches_calculator (line 191) | def _build_num_microbatches_calculator( function _round (line 261) | def _round(batch_size: int, divisor: int) -> int: class NumMicroBatchesCalculator (line 266) | class NumMicroBatchesCalculator(ABC): method __init__ (line 269) | def __init__(self) -> None: method get (line 275) | def get(self) -> int: method get_current_global_batch_size (line 279) | def get_current_global_batch_size(self) -> int: method get_micro_batch_size (line 283) | def get_micro_batch_size(self) -> int: method get_current_running_global_batch_size (line 287) | def get_current_running_global_batch_size(self) -> int: method update (line 293) | def update(self, consumed_samples, consistency_check, verbose=False) -... class ConstantNumMicroBatchesCalculator (line 298) | class ConstantNumMicroBatchesCalculator(NumMicroBatchesCalculator): method __init__ (line 315) | def __init__( method update (line 356) | def update(self, consumed_samples, consistency_check, verbose=False) -... class RampupBatchsizeNumMicroBatchesCalculator (line 360) | class RampupBatchsizeNumMicroBatchesCalculator(NumMicroBatchesCalculator): method __init__ (line 387) | def __init__( method update (line 441) | def update(self, consumed_samples: int, consistency_check: bool, verbo... FILE: galvatron/core/runtime/optimizer/param_scheduler.py function update_train_iters (line 11) | def update_train_iters(args): function get_optimizer_param_scheduler (line 45) | def get_optimizer_param_scheduler(optimizer): class OptimizerParamScheduler (line 102) | class OptimizerParamScheduler: method __init__ (line 127) | def __init__( method get_wd (line 186) | def get_wd(self) -> float: method get_lr (line 209) | def get_lr(self, param_group: dict) -> float: method step (line 270) | def step(self, increment: int) -> None: method state_dict (line 283) | def state_dict(self) -> dict: method _check_and_set (line 299) | def _check_and_set(self, cls_value: float, sd_value: float, name: str)... method load_state_dict (line 322) | def load_state_dict(self, state_dict: dict) -> None: FILE: galvatron/core/runtime/optimizer/utils.py function clip_grad_norm (line 14) | def clip_grad_norm(model, max_norm, norm_type=2): function get_optimizer_and_param_scheduler (line 43) | def get_optimizer_and_param_scheduler(model, args): FILE: galvatron/core/runtime/parallel.py function _get_modules_to_materialize (line 19) | def _get_modules_to_materialize(root_module: nn.Module) -> List[nn.Module]: function wrap_data_parallel (line 41) | def wrap_data_parallel( function param_init_fn (line 87) | def param_init_fn(all_block_name, load, distributed_checkpoint, tp_group... function wrap_module_fsdp_manually (line 100) | def wrap_module_fsdp_manually( function apply_fsdp (line 192) | def apply_fsdp(model, fsdp_args, wrap_block_name, need_ignore=False): function apply_ckpt (line 213) | def apply_ckpt(model, checkpoint_wrapper_fn, wrap_block_name): function wrap_modules_checkpoint (line 226) | def wrap_modules_checkpoint(module_list, checkpoint_flags, wrap_block_na... function wrap_model_checkpoint (line 240) | def wrap_model_checkpoint(model, wrap_block_names=[]): function relocate_activations (line 246) | def relocate_activations(input, allgather_cp_group, allgather_tp_sp_cp_g... class Module_with_relocation (line 272) | class Module_with_relocation(nn.Module): method __init__ (line 273) | def __init__(self, module, allgather_cp_group, allgather_tp_sp_cp_group, method forward (line 292) | def forward(self, *inputs, **kwargs): function wrap_modules_data_parallel (line 307) | def wrap_modules_data_parallel( function modules_to_devices (line 390) | def modules_to_devices(module_list, pp_devices): function wrap_modules_relocation (line 396) | def wrap_modules_relocation(module_list, allgather_cp_groups, allgather_... FILE: galvatron/core/runtime/parallel_state.py function _ensure_var_is_initialized (line 12) | def _ensure_var_is_initialized(var, name): function _ensure_var_is_not_initialized (line 17) | def _ensure_var_is_not_initialized(var, name): function get_parallel_world_size (line 23) | def get_parallel_world_size(group:torch.distributed.ProcessGroup): function get_parallel_rank (line 27) | def get_parallel_rank(group:torch.distributed.ProcessGroup): function set_global_memory_buffer (line 34) | def set_global_memory_buffer(): function get_global_memory_buffer (line 41) | def get_global_memory_buffer(): function destroy_global_memory_buffer (line 47) | def destroy_global_memory_buffer(): function set_args (line 56) | def set_args(args:GalvatronRuntimeArgs): function get_args (line 62) | def get_args(): function _build_tokenizer (line 71) | def _build_tokenizer(args:GalvatronRuntimeArgs): function get_tokenizer (line 79) | def get_tokenizer(): function _set_tensorboard_writer (line 88) | def _set_tensorboard_writer(args:GalvatronRuntimeArgs): function _set_wandb_writer (line 110) | def _set_wandb_writer(args:GalvatronRuntimeArgs): function set_global_variables (line 135) | def set_global_variables(args:GalvatronRuntimeArgs): function set_pp_comm_group (line 146) | def set_pp_comm_group(comm_group:CommGroup): function get_pp_comm_group (line 152) | def get_pp_comm_group(): function get_pp_world_size (line 158) | def get_pp_world_size(): function get_pp_rank (line 164) | def get_pp_rank(): function is_pipeline_first_stage (line 170) | def is_pipeline_first_stage(): function is_pipeline_last_stage (line 174) | def is_pipeline_last_stage(): function get_virtual_pipeline_model_parallel_rank (line 179) | def get_virtual_pipeline_model_parallel_rank(): function set_vocab_tp_sp_comm_group (line 190) | def set_vocab_tp_sp_comm_group(comm_group:CommGroup): function set_vocab_cp_comm_group (line 196) | def set_vocab_cp_comm_group(comm_group:CommGroup): function set_vocab_dp_comm_group (line 202) | def set_vocab_dp_comm_group(comm_group:CommGroup): function set_vocab_tp_sp_src_rank (line 208) | def set_vocab_tp_sp_src_rank(rank:int): function get_vocab_tp_sp_comm_group (line 214) | def get_vocab_tp_sp_comm_group(): function get_vocab_cp_comm_group (line 220) | def get_vocab_cp_comm_group(): function get_vocab_dp_comm_group (line 226) | def get_vocab_dp_comm_group(): function get_vocab_tp_sp_src_rank (line 232) | def get_vocab_tp_sp_src_rank(): function get_vocab_tp_sp_world_size (line 238) | def get_vocab_tp_sp_world_size(): function get_vocab_tp_sp_rank (line 244) | def get_vocab_tp_sp_rank(): function get_vocab_dp_world_size (line 250) | def get_vocab_dp_world_size(): function get_vocab_dp_rank (line 256) | def get_vocab_dp_rank(): function get_vocab_cp_world_size (line 262) | def get_vocab_cp_world_size(): function get_vocab_cp_rank (line 268) | def get_vocab_cp_rank(): function _set_vocab_tp_sp_cp_group (line 274) | def _set_vocab_tp_sp_cp_group(): function get_vocab_tp_sp_cp_group (line 288) | def get_vocab_tp_sp_cp_group(): function get_vocab_tp_sp_cp_world_size (line 294) | def get_vocab_tp_sp_cp_world_size(): function get_vocab_tp_sp_cp_rank (line 301) | def get_vocab_tp_sp_cp_rank(): function set_tp_whole_comm_group (line 315) | def set_tp_whole_comm_group(whole_comm_group:List[CommGroup]): function set_sp_whole_comm_group (line 321) | def set_sp_whole_comm_group(whole_comm_group:List[CommGroup]): function set_dp_whole_comm_group (line 327) | def set_dp_whole_comm_group(whole_comm_group:List[CommGroup]): function set_cp_whole_comm_group (line 333) | def set_cp_whole_comm_group(whole_comm_group:List[CommGroup]): function set_sdp_whole_comm_group (line 339) | def set_sdp_whole_comm_group(whole_comm_group:List[CommGroup]): function get_tp_whole_comm_group (line 345) | def get_tp_whole_comm_group(): function get_sp_whole_comm_group (line 351) | def get_sp_whole_comm_group(): function get_dp_whole_comm_group (line 357) | def get_dp_whole_comm_group(): function get_cp_whole_comm_group (line 363) | def get_cp_whole_comm_group(): function get_sdp_whole_comm_group (line 369) | def get_sdp_whole_comm_group(): function get_moe_layer_wise_logging_tracker (line 378) | def get_moe_layer_wise_logging_tracker(): FILE: galvatron/core/runtime/pipeline/grad_reduce.py function _send_backward_hook (line 36) | def _send_backward_hook( function fsdp_reduce_gradients (line 48) | def fsdp_reduce_gradients(model): function _allreduce_word_embedding_no_pipeline (line 69) | def _allreduce_word_embedding_no_pipeline(wte_model, wte_attr_name, lmhe... function _allreduce_word_embedding (line 87) | def _allreduce_word_embedding(module, tied_wte_attr_name, group): function _allreduce_word_embedding_grads_no_pipeline (line 99) | def _allreduce_word_embedding_grads_no_pipeline(wte_model, wte_attr_name... function _allreduce_word_embedding_grads (line 117) | def _allreduce_word_embedding_grads(module, tied_wte_attr_name, group): function enter_no_sync_context (line 128) | def enter_no_sync_context(model): function exit_no_sync_context (line 141) | def exit_no_sync_context(model): function _register_post_backward_hook_bf16 (line 152) | def _register_post_backward_hook_bf16( function _finalize_params_bf16 (line 199) | def _finalize_params_bf16( FILE: galvatron/core/runtime/pipeline/pipeline.py function forward_step_function (line 32) | def forward_step_function(loss_func, **kwargs): class PipelineParallel (line 43) | class PipelineParallel(nn.Module): method __init__ (line 44) | def __init__( method check_tensor_dtype (line 155) | def check_tensor_dtype(self, layer_output_tensor_shapes, layer_output_... method get_default_tensor_dtype (line 161) | def get_default_tensor_dtype(self, layer_output_tensor_shapes): method wrap_pipeline_modules_data_parallel (line 170) | def wrap_pipeline_modules_data_parallel( method wrap_pipeline_modules_checkpoint (line 227) | def wrap_pipeline_modules_checkpoint(self, checkpoint_flags, wrap_bloc... method sync_embedding (line 237) | def sync_embedding(self): method gen_sp_layernorm_info (line 255) | def gen_sp_layernorm_info(self, layer_module_types, layer_tp_groups, l... method set_last_batch (line 269) | def set_last_batch(self, state): method update_tensor_shape (line 276) | def update_tensor_shape(self, microbatches, dp_size_input, dp_size, tp... method no_pipeline_forward_backward (line 307) | def no_pipeline_forward_backward( method pipedream_flush_forward_backward (line 387) | def pipedream_flush_forward_backward( method gpipe_forward_backward (line 715) | def gpipe_forward_backward( method gpipe_forward (line 730) | def gpipe_forward( method gpipe_backward (line 837) | def gpipe_backward(self): method to_list (line 897) | def to_list(self, tensor): method forward_step (line 907) | def forward_step(self, forward_step_func, batch, model, input_tensor, ... method check_finish_backward (line 939) | def check_finish_backward(self, require_grad_param_num): method backward_step (line 943) | def backward_step(self, input_tensor, output_tensor, output_tensor_grad): method finalize_wte_grads_func (line 1043) | def finalize_wte_grads_func(self): method get_pipeline_model_parallel_first_rank (line 1063) | def get_pipeline_model_parallel_first_rank(self): method get_pipeline_model_parallel_last_rank (line 1066) | def get_pipeline_model_parallel_last_rank(self): method get_pipeline_model_parallel_next_rank (line 1070) | def get_pipeline_model_parallel_next_rank(self): method get_pipeline_model_parallel_prev_rank (line 1075) | def get_pipeline_model_parallel_prev_rank(self): method is_pipeline_first_stage (line 1080) | def is_pipeline_first_stage(self): method is_pipeline_last_stage (line 1084) | def is_pipeline_last_stage(self): method _run_p2pops (line 1092) | def _run_p2pops( method _communicate (line 1141) | def _communicate( method recv_forward (line 1271) | def recv_forward( method recv_backward (line 1292) | def recv_backward( method send_forward (line 1311) | def send_forward( method send_backward (line 1332) | def send_backward( method send_forward_recv_backward (line 1351) | def send_forward_recv_backward( method send_backward_recv_forward (line 1371) | def send_backward_recv_forward( method send_forward_recv_forward (line 1391) | def send_forward_recv_forward( method send_backward_recv_backward (line 1410) | def send_backward_recv_backward( method send_forward_backward_recv_forward_backward (line 1429) | def send_forward_backward_recv_forward_backward( method recv_forward_multi (line 1454) | def recv_forward_multi( method recv_backward_multi (line 1473) | def recv_backward_multi( method send_forward_multi (line 1491) | def send_forward_multi( method send_backward_multi (line 1512) | def send_backward_multi( method send_forward_recv_backward_multi (line 1534) | def send_forward_recv_backward_multi( method send_backward_recv_forward_multi (line 1563) | def send_backward_recv_forward_multi( class PipeSequential (line 1593) | class PipeSequential(nn.Sequential): method forward (line 1598) | def forward(self, *inputs, **kwargs): FILE: galvatron/core/runtime/pipeline/sp_grad_reduce.py function _post_backward_hook_sp (line 48) | def _post_backward_hook_sp( FILE: galvatron/core/runtime/pipeline/utils.py function listify_model (line 6) | def listify_model(model: Union[torch.nn.Module, List[torch.nn.Module]]) ... function chunk_batch (line 12) | def chunk_batch(inputs, chunks): function chunk_dict (line 45) | def chunk_dict(kwargs, chunks): FILE: galvatron/core/runtime/redistribute.py function _zigzag_transformation (line 5) | def _zigzag_transformation(input_, cp_world_size): function _reverse_zigzag_transformation (line 26) | def _reverse_zigzag_transformation(input_, cp_world_size): function _split_along_first_dim_with_sequence_parallel (line 43) | def _split_along_first_dim_with_sequence_parallel(input_, split_cp_group... function _gather_along_first_dim_with_sequence_parallel (line 85) | def _gather_along_first_dim_with_sequence_parallel(input_, allgather_cp_... function _split_along_first_dim (line 129) | def _split_along_first_dim(input_, split_tp_sp_cp_group): function _gather_along_first_dim (line 150) | def _gather_along_first_dim(input_, allgather_tp_sp_cp_group): class _Split (line 166) | class _Split(torch.autograd.Function): method forward (line 174) | def forward(ctx, input_, split_cp_group, split_tp_sp_cp_group, is_input): method backward (line 184) | def backward(ctx, grad_output): class _Gather (line 191) | class _Gather(torch.autograd.Function): method forward (line 199) | def forward(ctx, input_, allgather_cp_group, allgather_tp_sp_cp_group,... method backward (line 209) | def backward(ctx, grad_output): function split_to_group (line 216) | def split_to_group(input_, split_cp_group, split_tp_sp_cp_group, is_input): function gather_from_group (line 220) | def gather_from_group(input_, allgather_cp_group, allgather_tp_sp_cp_gro... function _fused_split_allgather_along_first_dim (line 223) | def _fused_split_allgather_along_first_dim( function _fused_split_allgather_along_first_dim_with_sequence_parallel (line 261) | def _fused_split_allgather_along_first_dim_with_sequence_parallel( class _Fused_split_allgather (line 345) | class _Fused_split_allgather(torch.autograd.Function): method forward (line 348) | def forward(ctx, input_, is_input, allgather_cp_group, allgather_tp_sp... method backward (line 372) | def backward(ctx, grad_output): function fused_split_allgather (line 408) | def fused_split_allgather(input_, is_input, allgather_cp_group, allgathe... FILE: galvatron/core/runtime/tensor_parallel/layers.py function set_tensor_model_parallel_attributes (line 48) | def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): class VocabParallelEmbedding (line 59) | class VocabParallelEmbedding(torch.nn.Module): method __init__ (line 78) | def __init__( method forward (line 120) | def forward(self, input_): class LinearWithFrozenWeight (line 150) | class LinearWithFrozenWeight(torch.autograd.Function): method forward (line 161) | def forward(ctx, input, weight, bias, allreduce_dgrad, tp_group): method backward (line 173) | def backward(ctx, grad_output): function linear_with_frozen_weight (line 186) | def linear_with_frozen_weight( class LinearWithGradAccumulationAndAsyncCommunication (line 262) | class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Fun... method forward (line 267) | def forward( method backward (line 307) | def backward(ctx, grad_output): function linear_with_grad_accumulation_and_async_allreduce (line 430) | def linear_with_grad_accumulation_and_async_allreduce( class ColumnParallelLinear (line 547) | class ColumnParallelLinear(torch.nn.Module): method __init__ (line 596) | def __init__( method forward (line 708) | def forward( method __repr__ (line 810) | def __repr__(self): class RowParallelLinear (line 819) | class RowParallelLinear(torch.nn.Module): method __init__ (line 855) | def __init__( method forward (line 925) | def forward(self, input_): method __repr__ (line 982) | def __repr__(self): FILE: galvatron/core/runtime/tensor_parallel/mappings.py function _reduce (line 18) | def _reduce(input_, group): function split_tensor_along_last_dim (line 31) | def split_tensor_along_last_dim( function _split_along_last_dim (line 57) | def _split_along_last_dim(input_, group): function _split_along_first_dim (line 76) | def _split_along_first_dim(input_, group): function _gather_along_last_dim (line 99) | def _gather_along_last_dim(input_, group): function _reduce_scatter_along_last_dim (line 120) | def _reduce_scatter_along_last_dim(input_, group): function _gather_along_first_dim (line 134) | def _gather_along_first_dim(input_, group, output_split_sizes=None, use_... function _reduce_scatter_along_first_dim (line 174) | def _reduce_scatter_along_first_dim( class _CopyToModelParallelRegion (line 217) | class _CopyToModelParallelRegion(torch.autograd.Function): method symbolic (line 221) | def symbolic(graph, input_, group): method forward (line 226) | def forward(ctx, input_, group): method backward (line 232) | def backward(ctx, grad_output): class _ReduceFromModelParallelRegion (line 237) | class _ReduceFromModelParallelRegion(torch.autograd.Function): method symbolic (line 241) | def symbolic(graph, input_, group): method forward (line 246) | def forward(ctx, input_, group): method backward (line 251) | def backward(ctx, grad_output): class _ScatterToModelParallelRegion (line 256) | class _ScatterToModelParallelRegion(torch.autograd.Function): method symbolic (line 260) | def symbolic(graph, input_, group): method forward (line 265) | def forward(ctx, input_, group): method backward (line 271) | def backward(ctx, grad_output): class _GatherFromModelParallelRegion (line 276) | class _GatherFromModelParallelRegion(torch.autograd.Function): method symbolic (line 280) | def symbolic(graph, input_, group=None): method forward (line 285) | def forward(ctx, input_, group=None): method backward (line 291) | def backward(ctx, grad_output): class _ScatterToSequenceParallelRegion (line 296) | class _ScatterToSequenceParallelRegion(torch.autograd.Function): method symbolic (line 300) | def symbolic(graph, input_, group): method forward (line 305) | def forward(ctx, input_, group): method backward (line 311) | def backward(ctx, grad_output): class _GatherFromSequenceParallelRegion (line 316) | class _GatherFromSequenceParallelRegion(torch.autograd.Function): method symbolic (line 320) | def symbolic( method forward (line 332) | def forward( method backward (line 348) | def backward(ctx, grad_output): class _ReduceScatterToSequenceParallelRegion (line 371) | class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): method symbolic (line 375) | def symbolic(graph, input_, group, input_split_sizes=None, use_global_... method forward (line 380) | def forward(ctx, input_, group, input_split_sizes=None, use_global_buf... method backward (line 388) | def backward(ctx, grad_output): class _AllGatherFromTensorParallelRegion (line 400) | class _AllGatherFromTensorParallelRegion(torch.autograd.Function): method symbolic (line 404) | def symbolic(graph, input_, group): method forward (line 409) | def forward(ctx, input_, group): method backward (line 415) | def backward(ctx, grad_output): class _ReduceScatterToTensorParallelRegion (line 420) | class _ReduceScatterToTensorParallelRegion(torch.autograd.Function): method symbolic (line 424) | def symbolic(graph, input_, group): method forward (line 429) | def forward(ctx, input_, group): method backward (line 435) | def backward(ctx, grad_output): class _AllToAll (line 440) | class _AllToAll(torch.autograd.Function): method forward (line 442) | def forward(ctx, group, input, output_split_sizes, input_split_sizes): method backward (line 474) | def backward(ctx, *grad_output): function copy_to_tensor_model_parallel_region (line 489) | def copy_to_tensor_model_parallel_region(input_, group): function reduce_from_tensor_model_parallel_region (line 494) | def reduce_from_tensor_model_parallel_region(input_, group): function scatter_to_tensor_model_parallel_region (line 499) | def scatter_to_tensor_model_parallel_region(input_, group): function gather_from_tensor_model_parallel_region (line 504) | def gather_from_tensor_model_parallel_region(input_, group): function scatter_to_sequence_parallel_region (line 509) | def scatter_to_sequence_parallel_region(input_, group): function gather_from_sequence_parallel_region (line 514) | def gather_from_sequence_parallel_region( function reduce_scatter_to_sequence_parallel_region (line 527) | def reduce_scatter_to_sequence_parallel_region( function all_gather_last_dim_from_tensor_parallel_region (line 536) | def all_gather_last_dim_from_tensor_parallel_region(input_, group): function reduce_scatter_last_dim_to_tensor_parallel_region (line 541) | def reduce_scatter_last_dim_to_tensor_parallel_region(input_, group): function all_to_all (line 546) | def all_to_all(group, input_, output_split_sizes_=None, input_split_size... FILE: galvatron/core/runtime/tensor_parallel/random.py function _get_cuda_rng_state (line 23) | def _get_cuda_rng_state( function _set_cuda_rng_state (line 54) | def _set_cuda_rng_state(new_state: torch.Tensor, device: int = -1, graph... function get_expert_parallel_rng_tracker_name (line 96) | def get_expert_parallel_rng_tracker_name(group=None): function get_tensor_parallel_rng_tracker_name (line 104) | def get_tensor_parallel_rng_tracker_name(group=None): function get_data_parallel_rng_tracker_name (line 114) | def get_data_parallel_rng_tracker_name(): class CudaRNGStatesTracker (line 120) | class CudaRNGStatesTracker: method __init__ (line 129) | def __init__(self, use_cudagraphable_rng=False, is_inference_rng_track... method is_initialized (line 142) | def is_initialized(self): method reset (line 146) | def reset(self): method get_states (line 158) | def get_states(self): method set_states (line 166) | def set_states(self, states): method check (line 172) | def check(self, name): method add (line 177) | def add(self, name, seed): method fork (line 203) | def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): function initialize_rng_tracker (line 233) | def initialize_rng_tracker( function set_seed_with_group (line 279) | def set_seed_with_group( function get_cuda_rng_tracker (line 319) | def get_cuda_rng_tracker( FILE: galvatron/core/runtime/tensor_parallel/reset.py function colummn_row_reset_parameters (line 11) | def colummn_row_reset_parameters(self): function router_reset_parameters (line 25) | def router_reset_parameters(self): function init_reset_parameter (line 31) | def init_reset_parameter(): FILE: galvatron/core/runtime/tensor_parallel/triton_cross_entropy.py function _tiled_max_kernel (line 22) | def _tiled_max_kernel( function _tiled_cross_entropy_forward_kernel (line 58) | def _tiled_cross_entropy_forward_kernel( function _tiled_cross_entropy_backward_kernel (line 103) | def _tiled_cross_entropy_backward_kernel( function tiled_max_reduction (line 150) | def tiled_max_reduction( function tiled_cross_entropy_forward (line 167) | def tiled_cross_entropy_forward( function tiled_cross_entropy_backward (line 191) | def tiled_cross_entropy_backward( class _VocabParallelCrossEntropyTritonFused (line 219) | class _VocabParallelCrossEntropyTritonFused(torch.autograd.Function): method forward (line 221) | def forward(ctx, vocab_parallel_logits, target, tp_group): method backward (line 245) | def backward(ctx, grad_output): function triton_fused_vocab_parallel_cross_entropy (line 256) | def triton_fused_vocab_parallel_cross_entropy( FILE: galvatron/core/runtime/tensor_parallel/utils.py function init_method_normal (line 9) | def init_method_normal(sigma): function scaled_init_method_normal (line 18) | def scaled_init_method_normal(sigma, num_layers): function ensure_divisibility (line 27) | def ensure_divisibility(numerator, denominator): function divide (line 32) | def divide(numerator, denominator): class VocabUtility (line 39) | class VocabUtility: method vocab_range_from_per_partition_vocab_size (line 47) | def vocab_range_from_per_partition_vocab_size( method vocab_range_from_global_vocab_size (line 56) | def vocab_range_from_global_vocab_size( function prepare_input_tensors_for_wgrad_compute (line 66) | def prepare_input_tensors_for_wgrad_compute(grad_output, all_gathered_in... FILE: galvatron/core/runtime/transformer/attention.py class SelfAttentionSubmodules (line 56) | class SelfAttentionSubmodules: class CrossAttentionSubmodules (line 72) | class CrossAttentionSubmodules: class PackedSeqParams (line 86) | class PackedSeqParams: class AttnMaskType (line 101) | class AttnMaskType(enum.Enum): class Attention (line 111) | class Attention(torch.nn.Module, ABC): method __init__ (line 118) | def __init__( method _allocate_memory (line 241) | def _allocate_memory(self, inference_max_sequence_length, batch_size, ... method _adjust_key_value_for_inference (line 253) | def _adjust_key_value_for_inference( method get_query_key_value_tensors (line 392) | def get_query_key_value_tensors(self, hidden_states, key_value_states): method flash_decode (line 398) | def flash_decode( method flash_decode_and_prefill (line 443) | def flash_decode_and_prefill( method forward (line 515) | def forward( class SelfAttention (line 736) | class SelfAttention(Attention): method __init__ (line 743) | def __init__( method run_realtime_tests (line 805) | def run_realtime_tests(self): method get_query_key_value_tensors (line 876) | def get_query_key_value_tensors(self, hidden_states, key_value_states=... class CrossAttention (line 929) | class CrossAttention(Attention): method __init__ (line 936) | def __init__( method get_query_key_value_tensors (line 989) | def get_query_key_value_tensors(self, hidden_states, key_value_states): FILE: galvatron/core/runtime/transformer/attention_impl.py class FlashSelfOrCrossAttention (line 29) | class FlashSelfOrCrossAttention(torch.nn.Module): method __init__ (line 40) | def __init__(self, causal=False, softmax_scale=None, attention_dropout... method forward (line 55) | def forward(self, q, k, v): function post_all2all (line 115) | def post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, seq_len... function single_all_to_all (line 139) | def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, gro... class _SeqAllToAll (line 201) | class _SeqAllToAll(torch.autograd.Function): method forward (line 204) | def forward( method backward (line 253) | def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, No... class DistributedAttention (line 278) | class DistributedAttention(torch.nn.Module): method __init__ (line 288) | def __init__( method layer_sync (line 312) | def layer_sync(self, layer): method forward (line 316) | def forward(self, query: Tensor, key: Tensor, value: Tensor, batch_dim... function _get_default_args (line 420) | def _get_default_args(func): function get_default_args (line 429) | def get_default_args(func): function _update_out_and_lse (line 438) | def _update_out_and_lse( function update_out_and_lse (line 458) | def update_out_and_lse( class RingComm (line 481) | class RingComm: method __init__ (line 482) | def __init__(self, process_group: dist.ProcessGroup, batch_comm = True): method send_recv (line 500) | def send_recv( method commit (line 525) | def commit(self): method wait (line 533) | def wait(self): method send_recv_kv (line 547) | def send_recv_kv( function zigzag_ring_flash_attn_forward (line 564) | def zigzag_ring_flash_attn_forward( function zigzag_ring_flash_attn_backward (line 652) | def zigzag_ring_flash_attn_backward( class ZigZagRingFlashAttnFunc (line 783) | class ZigZagRingFlashAttnFunc(torch.autograd.Function): method forward (line 785) | def forward( method backward (line 832) | def backward(ctx, dout, *args): function zigzag_ring_flash_attn_func (line 855) | def zigzag_ring_flash_attn_func( class ZigzagRingFlashAttention (line 885) | class ZigzagRingFlashAttention(torch.nn.Module): method __init__ (line 886) | def __init__(self, attention_dropout, cp_group, cp_ranks, softmax_scal... method forward (line 894) | def forward(self, q, k, v): FILE: galvatron/core/runtime/transformer/fused_kernels.py function geglu (line 20) | def geglu(y): function bias_geglu (line 26) | def bias_geglu(bias, y): function geglu_back (line 35) | def geglu_back(g, y): function bias_geglu_back (line 46) | def bias_geglu_back(g, y, bias): class BiasGeGLUFunction (line 51) | class BiasGeGLUFunction(torch.autograd.Function): method forward (line 54) | def forward(ctx, input, bias): method backward (line 59) | def backward(ctx, grad_output): class GeGLUFunction (line 65) | class GeGLUFunction(torch.autograd.Function): method forward (line 68) | def forward(ctx, input): method backward (line 73) | def backward(ctx, grad_output): function bias_geglu_impl (line 79) | def bias_geglu_impl(input, bias): function bias_gelu (line 101) | def bias_gelu(bias, y): function bias_gelu_back (line 110) | def bias_gelu_back(g, bias, y): class GeLUFunction (line 120) | class GeLUFunction(torch.autograd.Function): method forward (line 123) | def forward(ctx, input, bias): method backward (line 128) | def backward(ctx, grad_output): method apply (line 135) | def apply(cls, *args, **kwargs): function swiglu (line 143) | def swiglu(y): function bias_swiglu (line 149) | def bias_swiglu(y, bias): function swiglu_back (line 158) | def swiglu_back(g, y): function bias_swiglu_back (line 166) | def bias_swiglu_back(g, y, bias): class BiasSwiGLUFunction (line 171) | class BiasSwiGLUFunction(torch.autograd.Function): method forward (line 174) | def forward(ctx, input, bias, fp8_input_store): method backward (line 182) | def backward(ctx, grad_output): class SwiGLUFunction (line 189) | class SwiGLUFunction(torch.autograd.Function): method forward (line 192) | def forward(ctx, input, fp8_input_store): method backward (line 200) | def backward(ctx, grad_output): function bias_swiglu_impl (line 207) | def bias_swiglu_impl(input, bias, fp8_input_store=False): function fused_apply_rotary_pos_emb (line 227) | def fused_apply_rotary_pos_emb( function fused_apply_rotary_pos_emb_thd (line 237) | def fused_apply_rotary_pos_emb_thd( class VocabParallelCrossEntropy (line 259) | class VocabParallelCrossEntropy: method calculate_logits_max (line 266) | def calculate_logits_max( method calculate_predicted_logits (line 280) | def calculate_predicted_logits( method calculate_cross_entropy_loss (line 316) | def calculate_cross_entropy_loss( method prepare_gradient_calculation_operands (line 330) | def prepare_gradient_calculation_operands( method calculate_gradients (line 349) | def calculate_gradients( function calculate_logits_max (line 368) | def calculate_logits_max(vocab_parallel_logits: torch.Tensor, half_entro... function calculate_predicted_logits (line 381) | def calculate_predicted_logits( function calculate_cross_entropy_loss (line 403) | def calculate_cross_entropy_loss( function calculate_gradients (line 420) | def calculate_gradients( class _VocabParallelCrossEntropy (line 442) | class _VocabParallelCrossEntropy(torch.autograd.Function): method forward (line 444) | def forward(ctx, vocab_parallel_logits, target, half_entropy, tp_group): method backward (line 479) | def backward(ctx, grad_output): function fused_vocab_parallel_cross_entropy (line 491) | def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target, ha... class _VocabParallelCrossEntropyNonFused (line 508) | class _VocabParallelCrossEntropyNonFused(torch.autograd.Function): method forward (line 516) | def forward(ctx, vocab_parallel_logits, target, tp_group): method backward (line 543) | def backward(ctx, grad_output): function vocab_parallel_cross_entropy (line 554) | def vocab_parallel_cross_entropy(vocab_parallel_logits, target, tp_group): FILE: galvatron/core/runtime/transformer/inference.py class BaseInferenceContext (line 6) | class BaseInferenceContext(abc.ABC): method is_static_batching (line 14) | def is_static_batching(self) -> bool: method is_dynamic_batching (line 18) | def is_dynamic_batching(self) -> bool: FILE: galvatron/core/runtime/transformer/mlp.py class MLPSubmodules (line 18) | class MLPSubmodules: class MLP (line 23) | class MLP(torch.nn.Module): method __init__ (line 40) | def __init__( method forward (line 98) | def forward(self, hidden_states): FILE: galvatron/core/runtime/transformer/norm.py class GalvatronNorm (line 6) | class GalvatronNorm: method __new__ (line 12) | def __new__(cls, config: GalvatronModelArgs, hidden_size: int, eps: fl... FILE: galvatron/core/runtime/transformer/rope_utils.py function get_pos_emb_on_this_cp_rank (line 47) | def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor: function _rotate_half (line 67) | def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: function _apply_rotary_pos_emb_bshd (line 86) | def _apply_rotary_pos_emb_bshd( function _get_thd_freqs_on_this_cp_rank (line 123) | def _get_thd_freqs_on_this_cp_rank(cp_rank: int, cp_size: int, x: Tensor... function _apply_rotary_pos_emb_thd (line 137) | def _apply_rotary_pos_emb_thd( function apply_rotary_pos_emb (line 176) | def apply_rotary_pos_emb( function apply_rotary_pos_emb_with_cos_sin (line 237) | def apply_rotary_pos_emb_with_cos_sin( FILE: galvatron/core/runtime/transformer/rotary_pos_embedding.py function get_pos_emb_on_this_cp_sp_rank_galvatron (line 34) | def get_pos_emb_on_this_cp_sp_rank_galvatron(cp_group, sp_group, pos_emb... function get_pos_emb_on_this_cp_rank (line 59) | def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim): class RotaryEmbedding (line 73) | class RotaryEmbedding(nn.Module): method __init__ (line 93) | def __init__( method _apply_scaling (line 124) | def _apply_scaling( method get_freqs_non_repeated (line 159) | def get_freqs_non_repeated(self, max_seq_len: int, offset: int = 0) ->... method get_cos_sin (line 174) | def get_cos_sin(self, max_seq_len: int, offset: int = 0) -> (Tensor, T... method forward (line 183) | def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool ... method _load_from_state_dict (line 217) | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): method get_rotary_seq_len (line 221) | def get_rotary_seq_len( class MultimodalRotaryEmbedding (line 267) | class MultimodalRotaryEmbedding(nn.Module): method __init__ (line 286) | def __init__( method forward (line 310) | def forward(self, position_ids: torch.Tensor, mrope_section: List[int]... FILE: galvatron/core/runtime/transformer/spec_utils.py class ModuleSpec (line 9) | class ModuleSpec: function import_module (line 30) | def import_module(module_path: Tuple[str]): function get_module (line 45) | def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwa... function build_module (line 58) | def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs): FILE: galvatron/core/runtime/transformer/utils.py function deprecate_inference_params (line 4) | def deprecate_inference_params(inference_context, inference_params): FILE: galvatron/core/runtime/utils/rerun_state_machine.py class Caller (line 43) | class Caller(NamedTuple): class Call (line 51) | class Call(NamedTuple): class RerunDiagnostic (line 58) | class RerunDiagnostic(str, Enum): class RerunMode (line 72) | class RerunMode(str, Enum): class RerunState (line 80) | class RerunState(Enum): class RerunValidationStatus (line 112) | class RerunValidationStatus(str, Enum): class RerunStateMachine (line 127) | class RerunStateMachine: method __init__ (line 183) | def __init__( method set_mode (line 239) | def set_mode(self, mode: RerunMode) -> None: method get_mode (line 246) | def get_mode(self) -> RerunMode: method should_run_forward_backward (line 251) | def should_run_forward_backward(self, data_iterator: DataIteratorArgTy... method should_checkpoint_and_exit (line 374) | def should_checkpoint_and_exit(self) -> Tuple[bool, bool, int]: method validate_result (line 434) | def validate_result( method is_unexpectedly_large (line 651) | def is_unexpectedly_large( method _sanitize_data_iterators (line 841) | def _sanitize_data_iterators( method _get_validation_call_info (line 858) | def _get_validation_call_info(self) -> Call: method _save_state (line 871) | def _save_state(self) -> None: method _restore_state (line 892) | def _restore_state(self) -> None: method _maybe_report_stats (line 903) | def _maybe_report_stats(self) -> None: method _log_validation_error_to_file (line 930) | def _log_validation_error_to_file( method get_skipped_iterations_from_tracker_file (line 951) | def get_skipped_iterations_from_tracker_file(cls, tracker_file_name: s... class RerunDataIterator (line 989) | class RerunDataIterator: method __init__ (line 1008) | def __init__(self, iterable: Iterable[Any]) -> None: method __next__ (line 1014) | def __next__(self) -> Any: method rewind (line 1029) | def rewind(self) -> None: method advance (line 1035) | def advance(self) -> None: method state_dict (line 1041) | def state_dict(self) -> SerializableStateType: method load_state_dict (line 1050) | def load_state_dict(self, state_dict: SerializableStateType) -> None: class QuickStats (line 1058) | class QuickStats: method __init__ (line 1065) | def __init__(self, max_size: int = 100000) -> None: method record (line 1072) | def record(self, data: float) -> None: method combine (line 1086) | def combine(self, others: list["QuickStats"]) -> None: method reset (line 1099) | def reset(self) -> None: method print_stats (line 1107) | def print_stats(self) -> str: method __getstate_ (line 1129) | def __getstate_(self) -> Any: method __setstate (line 1134) | def __setstate(self, state: Any) -> Any: class RerunErrorInjector (line 1143) | class RerunErrorInjector: method __init__ (line 1152) | def __init__( method maybe_inject (line 1167) | def maybe_inject(self) -> bool: method maybe_miscompare (line 1185) | def maybe_miscompare( method state_dict (line 1222) | def state_dict(self) -> SerializableStateType: method load_state_dict (line 1232) | def load_state_dict(self, state_dict: SerializableStateType) -> None: function initialize_rerun_state_machine (line 1241) | def initialize_rerun_state_machine(**kwargs) -> None: function destroy_rerun_state_machine (line 1251) | def destroy_rerun_state_machine() -> None: function get_rerun_state_machine (line 1258) | def get_rerun_state_machine() -> RerunStateMachine: function _set_rerun_state_machine (line 1267) | def _set_rerun_state_machine(rerun_state_machine) -> None: function _safe_get_rank (line 1275) | def _safe_get_rank() -> int: function _compare_floats (line 1288) | def _compare_floats(a: torch.Tensor, b: torch.Tensor) -> float: FILE: galvatron/core/runtime/utils/utils.py function rgetattr (line 28) | def rgetattr(obj, attr): function rsetattr (line 41) | def rsetattr(obj, attr, val): function rhasattr (line 46) | def rhasattr(obj, attr): function log_single_rank (line 54) | def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, *... class GlobalMemoryBuffer (line 73) | class GlobalMemoryBuffer: method __init__ (line 78) | def __init__(self): method get_tensor (line 81) | def get_tensor(self, tensor_shape, dtype, name): function get_torch_version (line 97) | def get_torch_version(): function is_torch_min_version (line 114) | def is_torch_min_version(version, check_equality=True): function get_te_version (line 121) | def get_te_version(): function is_te_min_version (line 138) | def is_te_min_version(version, check_equality=True): function print_rank_0 (line 145) | def print_rank_0(message): function set_megatron_args_for_dataset (line 154) | def set_megatron_args_for_dataset(args:GalvatronRuntimeArgs): function get_layernorm_offset (line 170) | def get_layernorm_offset(model, layernorm_name=[]): function get_batch_on_this_tp_rank (line 194) | def get_batch_on_this_tp_rank(data_iterator): function get_batch_on_this_cp_rank (line 295) | def get_batch_on_this_cp_rank(batch: Dict[str, Any]): function average_losses_across_data_parallel_group (line 328) | def average_losses_across_data_parallel_group(losses): FILE: galvatron/core/search_engine/args_schema.py class SearchEngineBatchSizeArgs (line 12) | class SearchEngineBatchSizeArgs(BaseModel): class SearchEngineHardwareInfoArgs (line 21) | class SearchEngineHardwareInfoArgs(BaseModel): class SearchEngineSearchSpaceArgs (line 26) | class SearchEngineSearchSpaceArgs(BaseModel): class SearchEngineProfilingArgs (line 42) | class SearchEngineProfilingArgs(BaseModel): class SearchEngineOptionsArgs (line 53) | class SearchEngineOptionsArgs(BaseModel): class SearchEngineDebugArgs (line 61) | class SearchEngineDebugArgs(BaseModel): class GalvatronSearchArgs (line 65) | class GalvatronSearchArgs(BaseModel): FILE: galvatron/core/search_engine/dynamic_programming.py class DPAlg (line 12) | class DPAlg(): method __init__ (line 13) | def __init__(self, max_mem=8200, other_mem_cost=None, other_time_cost ... method set_v_and_cost (line 32) | def set_v_and_cost(self, v: np.ndarray, intra_layer_cost: np.ndarray, ... method fit (line 50) | def fit(self): class DpOnModel (line 117) | class DpOnModel: method __init__ (line 118) | def __init__( method match_strategy (line 161) | def match_strategy(self, former:LayerStrategy, latter:LayerStrategy, d... method _build_dp_and_run_multi_layer_type (line 212) | def _build_dp_and_run_multi_layer_type( method log (line 612) | def log(self, msg) -> None: method fit (line 618) | def fit( FILE: galvatron/core/search_engine/search_engine.py class GalvatronSearchEngine (line 21) | class GalvatronSearchEngine(): method __init__ (line 22) | def __init__(self, args: GalvatronSearchArgs): method set_search_engine_info (line 39) | def set_search_engine_info(self, path, model_layer_configs, model_name): method set_path (line 46) | def set_path(self, path): method set_model_type (line 49) | def set_model_type(self, model_type): method set_model_name (line 52) | def set_model_name(self, name): method memory_profiling_path (line 55) | def memory_profiling_path(self): # TODO: add split mode profile path method time_profiling_path (line 68) | def time_profiling_path(self): # TODO: add split mode profile path method set_model_layer_configs (line 82) | def set_model_layer_configs(self, model_layer_configs): method initialize_search_engine (line 93) | def initialize_search_engine(self, show_all_strategy_list=False): method generate_strategy_list (line 106) | def generate_strategy_list(self) -> None: method filter_strategy_list (line 183) | def filter_strategy_list(self, disable_pp=None, disable_tp=None, disab... method show_all_strategy_list (line 257) | def show_all_strategy_list(self): method convert_keys_to_int (line 275) | def convert_keys_to_int(self, d): method get_profiled_model_configs (line 286) | def get_profiled_model_configs(self): # TODO: add split mode profile c... method get_profiled_hardware_configs (line 419) | def get_profiled_hardware_configs(self): method set_cost_models (line 464) | def set_cost_models(self): # TODO: add split mode cost models method get_pp_size_range (line 512) | def get_pp_size_range(self) -> None: method parallelism_optimization (line 520) | def parallelism_optimization(self): method search_for_single_task (line 646) | def search_for_single_task(self, gbsz, chunks, pp_size, global_buffer_... method set_searching_bsz (line 729) | def set_searching_bsz(self): method save_results (line 749) | def save_results(self, optimal, optimal_bsz, chunk): method check_cost_model (line 788) | def check_cost_model(self, gbsz, chunks, specific_strategy_list:List[L... method show_search_info (line 902) | def show_search_info(self): function pp_division_memory_balanced (line 954) | def pp_division_memory_balanced(model_args_list, train_args_list, parall... function get_pp_stage_for_bsz (line 1060) | def get_pp_stage_for_bsz(strategies:List[LayerStrategy], model_args_list... function get_cost_all_stages (line 1072) | def get_cost_all_stages(layer_memcosts, pp_stage_division): function get_layer_costs (line 1088) | def get_layer_costs(layernum_list, layer_costs): function pp_division_even (line 1094) | def pp_division_even(layernum_list, pp_deg): FILE: galvatron/core/search_engine/utils.py function ensure_log_dir (line 4) | def ensure_log_dir(log_dir='logs'): function get_thread_logger_single_task (line 8) | def get_thread_logger_single_task(gbsz, chunks, pp_size, global_buffer_t... function remove_all_galvatron_loggers (line 32) | def remove_all_galvatron_loggers(prefix='galvatron'): FILE: galvatron/models/gpt/train_dist.py function train (line 21) | def train(args): FILE: galvatron/models/moe/train_dist.py function train (line 22) | def train(args): FILE: galvatron/profile_hardware/profile_all2all.py function single_all_to_all (line 20) | def single_all_to_all(input_tensor, group): function set_seed (line 28) | def set_seed(rank): function _profile_all2all_one (line 34) | def _profile_all2all_one( function train (line 93) | def train(args): FILE: galvatron/profile_hardware/profile_allreduce.py function single_all_reduce (line 20) | def single_all_reduce(input_tensor, group): function set_seed (line 26) | def set_seed(rank): function bandwidth_jobs_from_tp_degrees (line 32) | def bandwidth_jobs_from_tp_degrees(world_size, tp_degrees: list[int]): function allreduce_work_items (line 45) | def allreduce_work_items( function _profile_allreduce_one (line 84) | def _profile_allreduce_one( function train (line 162) | def train(args): FILE: galvatron/profile_hardware/profile_overlap.py function profile (line 10) | def profile(args): FILE: galvatron/profile_hardware/profile_p2p.py function single_p2p_send_recv (line 19) | def single_p2p_send_recv(input_tensor, prev_rank, next_rank, rank, pp_ra... function set_seed (line 53) | def set_seed(rank): function _profile_p2p_one (line 59) | def _profile_p2p_one( function train (line 149) | def train(args): FILE: galvatron/tools/args_schema.py class CheckpointConvertH2GArgs (line 5) | class CheckpointConvertH2GArgs(BaseModel): class CheckpointConvertG2HArgs (line 13) | class CheckpointConvertG2HArgs(BaseModel): FILE: galvatron/tools/checkpoint_convert_g2h.py function convert_checkpoints_llama (line 11) | def convert_checkpoints_llama(input_checkpoint_path, output_dir, load_it... function convert_checkpoints_bert_mlm (line 111) | def convert_checkpoints_bert_mlm(input_checkpoint_path, output_dir, load... function main (line 253) | def main(): FILE: galvatron/tools/checkpoint_convert_h2g.py function convert_checkpoints_gpt (line 9) | def convert_checkpoints_gpt(input_checkpoint_path, output_dir): function convert_checkpoints_llama (line 47) | def convert_checkpoints_llama(input_checkpoint_path, output_dir): function convert_checkpoints_mixtral (line 89) | def convert_checkpoints_mixtral(input_checkpoint_path, output_dir): function convert_checkpoints_bert_mlm (line 93) | def convert_checkpoints_bert_mlm(input_checkpoint_path, output_dir): function main (line 140) | def main(): FILE: galvatron/utils/config_utils.py function str2array (line 8) | def str2array(s): function array2str (line 11) | def array2str(a): function read_json_config (line 14) | def read_json_config(path): function write_json_config (line 18) | def write_json_config(config, path): function config2strategy (line 24) | def config2strategy(config): function read_allreduce_bandwidth_config (line 48) | def read_allreduce_bandwidth_config(config_path, gpu_num): function read_p2p_bandwidth_config (line 77) | def read_p2p_bandwidth_config(config_path): function num2str (line 90) | def num2str(num, name): function dict_join_dirname (line 103) | def dict_join_dirname(dic, dirname): function remap_config (line 108) | def remap_config(config, op): function print_single_rank (line 140) | def print_single_rank(message, rank=0): function remap_config_for_latency (line 147) | def remap_config_for_latency(config, op): FILE: galvatron/utils/hf_config_adapter.py function _get_model_args (line 39) | def _get_model_args(args: Union[GalvatronRuntimeArgs, GalvatronSearchArg... function _get_train_args (line 47) | def _get_train_args(args: Union[GalvatronRuntimeArgs, GalvatronSearchArg... function get_hf_attr (line 73) | def get_hf_attr(config, canonical_name: str, default=None): function set_hf_attr (line 82) | def set_hf_attr(config, canonical_name: str, value): function _detect_normalization (line 104) | def _detect_normalization(hf_config) -> str: function _detect_activation (line 110) | def _detect_activation(hf_config) -> tuple: function _detect_position_embedding_type (line 117) | def _detect_position_embedding_type(hf_config) -> str: function _load_yaml_model_config (line 154) | def _load_yaml_model_config(yaml_path: str) -> dict: function _apply_yaml_to_model_args (line 165) | def _apply_yaml_to_model_args(args: Union[GalvatronRuntimeArgs, Galvatro... function populate_model_args_from_hf (line 196) | def populate_model_args_from_hf(args: Union[GalvatronRuntimeArgs, Galvat... function _fill_model_args_from_hf (line 212) | def _fill_model_args_from_hf(args: Union[GalvatronRuntimeArgs, Galvatron... function resolve_model_config (line 285) | def resolve_model_config(args: Union[GalvatronRuntimeArgs, GalvatronSear... function create_hf_config (line 333) | def create_hf_config(args: Union[GalvatronRuntimeArgs, GalvatronSearchAr... function model_name (line 372) | def model_name(args: Union[GalvatronRuntimeArgs, GalvatronSearchArgs]) -... function model_layer_configs (line 384) | def model_layer_configs(args: Union[GalvatronRuntimeArgs, GalvatronSearc... FILE: galvatron/utils/memory_utils.py function print_peak_memory (line 3) | def print_peak_memory(prefix, device, type='allocated'): function print_param_num (line 16) | def print_param_num(model): FILE: galvatron/utils/print_utils.py class ColorSet (line 7) | class ColorSet: function print_args_rank0 (line 15) | def print_args_rank0(args: pydantic.BaseModel, title: str = "arguments"): function print_single_rank (line 25) | def print_single_rank(message, rank=0): FILE: galvatron/utils/strategy_utils.py function is_power_of_two (line 11) | def is_power_of_two(n: int) -> bool: class DPType (line 14) | class DPType(Enum): method values (line 20) | def values(cls): method contains (line 24) | def contains(cls, value) -> bool: method __lt__ (line 27) | def __lt__(self, other): class StrategyBase (line 33) | class StrategyBase: class EmbeddingLMHeadStrategy (line 37) | class EmbeddingLMHeadStrategy(StrategyBase): method __post_init__ (line 45) | def __post_init__(self): method _check_and_fix_sdp (line 49) | def _check_and_fix_sdp(self): method _check_tp_sp (line 54) | def _check_tp_sp(self): method world_size (line 58) | def world_size(self): method sdp_size (line 62) | def sdp_size(self): method tp_sp_size (line 66) | def tp_sp_size(self): method to_string (line 69) | def to_string(self): method to_simple_string (line 72) | def to_simple_string(self): method __eq__ (line 93) | def __eq__(self, other): method __lt__ (line 101) | def __lt__(self, other): method __hash__ (line 111) | def __hash__(self): method __str__ (line 115) | def __str__(self): class AttentionStrategy (line 119) | class AttentionStrategy(EmbeddingLMHeadStrategy): method __hash__ (line 122) | def __hash__(self): method to_embedding_lmhead_strategy (line 126) | def to_embedding_lmhead_strategy(self): method to_ffn_strategy (line 136) | def to_ffn_strategy(self): method to_layer_strategy (line 147) | def to_layer_strategy(self): class FFNStrategy (line 160) | class FFNStrategy(EmbeddingLMHeadStrategy): method __hash__ (line 163) | def __hash__(self): method to_embedding_lmhead_strategy (line 167) | def to_embedding_lmhead_strategy(self): class LayerStrategy (line 178) | class LayerStrategy(EmbeddingLMHeadStrategy): method __hash__ (line 181) | def __hash__(self): method to_embedding_lmhead_strategy (line 185) | def to_embedding_lmhead_strategy(self): class MoEFFNStrategy (line 196) | class MoEFFNStrategy(StrategyBase): method __post_init__ (line 204) | def __post_init__(self): method _check_and_fix_dp (line 207) | def _check_and_fix_dp(self): method world_size (line 215) | def world_size(self): method sdp_size (line 219) | def sdp_size(self): method __eq__ (line 222) | def __eq__(self, other): method __lt__ (line 230) | def __lt__(self, other): method __hash__ (line 240) | def __hash__(self): method __str__ (line 244) | def __str__(self): function old_version_strategy_to_new_version_strategy (line 248) | def old_version_strategy_to_new_version_strategy(strategy:list, default_... function new_version_strategy_to_old_version_strategy (line 277) | def new_version_strategy_to_old_version_strategy(strategy:StrategyBase): function print_strategy_list (line 300) | def print_strategy_list(strategy_list:Union[List[LayerStrategy], List[Em... function strategy_list2config (line 308) | def strategy_list2config(strategy_list:List[LayerStrategy]): function config2strategy (line 332) | def config2strategy(config:dict, default_dp_type:str='zero2') -> List[La... FILE: galvatron/utils/training_utils.py function set_seed (line 7) | def set_seed(seed = 1234): function distributed_dataloader (line 13) | def distributed_dataloader(dataset, global_bsz, shuffle = True, args = N... function print_loss (line 25) | def print_loss(args, loss, ep, iter): function gen_profiling_groups (line 43) | def gen_profiling_groups(group_size, consecutive): FILE: setup.py class CustomInstall (line 18) | class CustomInstall(install): method run (line 19) | def run(self): class CustomDevelop (line 29) | class CustomDevelop(develop): method run (line 30) | def run(self): class CustomBuildExt (line 41) | class CustomBuildExt(build_ext): method run (line 42) | def run(self): FILE: tests/conftest.py function _pick_free_port (line 19) | def _pick_free_port() -> int: function small_model_config (line 25) | def small_model_config(): function device (line 36) | def device(): function seed (line 41) | def seed(): function _terminate_process (line 45) | def _terminate_process(p: subprocess.Popen, grace: float = 5.0) -> None: function run_distributed (line 81) | def run_distributed(): function checkpoint_dir (line 194) | def checkpoint_dir(): function base_config_dirs (line 203) | def base_config_dirs(tmp_path: Path) -> Tuple[Path, Path, Path]: function profiler_model_configs_dir (line 211) | def profiler_model_configs_dir(tmp_path: Path) -> Path: function profiler_hardware_configs_dir (line 218) | def profiler_hardware_configs_dir(tmp_path: Path) -> Path: function base_log_dirs (line 227) | def base_log_dirs(tmp_path: Path) -> str: FILE: tests/core/test_ep.py class _PytestMarkStub (line 10) | class _PytestMarkStub: method skipif (line 11) | def skipif(self, *args, **kwargs): method parametrize (line 14) | def parametrize(self, *args, **kwargs): method __getattr__ (line 19) | def __getattr__(self, _name): class _PytestStub (line 24) | class _PytestStub: function _ep_parallel_config (line 58) | def _ep_parallel_config( function _run_test (line 95) | def _run_test(test_args: Dict[str, Any]): function test_ep_correctness (line 245) | def test_ep_correctness(run_distributed, ep_size, dispatcher, checkpoint... FILE: tests/core/test_fsdp.py function _run_test (line 25) | def _run_test(test_args: Dict[str, Any]): function test_dp_correctness (line 185) | def test_dp_correctness( FILE: tests/core/test_hybrid.py function _run_test (line 20) | def _run_test(test_args: Dict[str, Any]): function test_hybrid_correctness (line 180) | def test_hybrid_correctness( FILE: tests/core/test_mixed_precision.py function _dp_parallel_config (line 25) | def _dp_parallel_config(batch: int, chunks: int) -> Dict[str, Any]: function _run_test (line 45) | def _run_test(test_args: Dict[str, Any]): function test_dp_correctness (line 162) | def test_dp_correctness(run_distributed, mixed_precision, use_flash_attn... FILE: tests/core/test_pp.py function _pp_parallel_config (line 25) | def _pp_parallel_config(pp_size: int, batch: int, chunks: int, pipeline_... function _run_test (line 52) | def _run_test(test_args: Dict[str, Any]): function test_pp (line 171) | def test_pp(run_distributed, world_size, pp_size, pipeline_type, chunks,... FILE: tests/core/test_redistributed.py function _run_test (line 22) | def _run_test(test_args: Dict[str, Any]): function test_redistributed (line 183) | def test_redistributed(run_distributed, model_type, world_size, tp_size,... FILE: tests/core/test_tp.py function _tp_parallel_config (line 25) | def _tp_parallel_config( function _run_test (line 71) | def _run_test(test_args: Dict[str, Any]): function test_tp (line 193) | def test_tp(run_distributed, world_size, tp_size, sp, chunks, checkpoint... FILE: tests/core/test_utils.py class DummyModule (line 7) | class DummyModule(nn.Module): method __init__ (line 8) | def __init__(self): function dummy_module (line 14) | def dummy_module(): function test_rgetattr (line 17) | def test_rgetattr(dummy_module): function test_rsetattr (line 26) | def test_rsetattr(dummy_module): function test_rhasattr (line 32) | def test_rhasattr(dummy_module): FILE: tests/kernels/test_triton_cross_entropy.py function non_fused_ce (line 39) | def non_fused_ce(logits, target, tp_group): function jit_fused_ce (line 44) | def jit_fused_ce(logits, target, tp_group): function triton_fused_ce (line 49) | def triton_fused_ce(logits, target, tp_group): function print_rank0 (line 54) | def print_rank0(rank, msg): function run_test_forward_backward (line 63) | def run_test_forward_backward(ce_func, logits_cpu, target_cpu, tp_group,... function benchmark_performance (line 95) | def benchmark_performance(ce_func, logits_cpu, target_cpu, tp_group, dev... function compare_results (line 125) | def compare_results(name1, name2, loss1, grad1, loss2, grad2, rank): function _run_test (line 163) | def _run_test(args): function test_triton_cross_entropy (line 270) | def test_triton_cross_entropy(run_distributed, tp_size, seq_len, batch_s... FILE: tests/kernels/test_triton_cross_entropy_debug.py function non_fused_ce (line 24) | def non_fused_ce(logits, target, tp_group): function jit_fused_ce (line 28) | def jit_fused_ce(logits, target, tp_group): function triton_fused_ce (line 32) | def triton_fused_ce(logits, target, tp_group): function print_rank0 (line 36) | def print_rank0(rank, msg): function run_test_forward_backward (line 41) | def run_test_forward_backward(ce_func, logits_cpu, target_cpu, tp_group,... function benchmark_performance (line 73) | def benchmark_performance(ce_func, logits_cpu, target_cpu, tp_group, dev... function compare_results (line 103) | def compare_results(name1, name2, loss1, grad1, loss2, grad2, rank): function test_triton_cross_entropy (line 141) | def test_triton_cross_entropy(): FILE: tests/kernels/test_triton_cross_entropy_kernels.py function device (line 77) | def device(): function reset_seed (line 85) | def reset_seed(): function check_precision (line 90) | def check_precision(triton_val, torch_val, name, rtol=1e-2, atol=1e-3): function test_max_reduction (line 121) | def test_max_reduction(device, seq_len, batch_size, vocab_size, model_co... function test_forward (line 143) | def test_forward(device, seq_len, batch_size, vocab_size, model_config): function test_backward (line 176) | def test_backward(device, seq_len, batch_size, vocab_size, model_config): function test_edge_cases_max (line 222) | def test_edge_cases_max(device, case_name, seq_len, batch_size, vocab_si... function test_boundary_targets (line 246) | def test_boundary_targets(device): FILE: tests/kernels/test_triton_cross_entropy_kernels_debug.py function check_precision (line 23) | def check_precision(triton_val, torch_val, name, rtol=1e-2, atol=1e-3): function test_max_reduction (line 41) | def test_max_reduction(): function test_forward (line 69) | def test_forward(): function test_backward (line 105) | def test_backward(): function test_edge_cases (line 153) | def test_edge_cases(): function main (line 204) | def main(): FILE: tests/models/test_checkpoint_convert.py function test_convert_checkpoints_bert_mlm (line 8) | def test_convert_checkpoints_bert_mlm(checkpoint_dir): FILE: tests/models/test_dataloader.py function _run_test (line 17) | def _run_test(args: dict): function test_distributed_dataloader_with_groups (line 106) | def test_distributed_dataloader_with_groups(run_distributed, small_model... FILE: tests/models/test_model_correctness.py function _dp_parallel_config (line 28) | def _dp_parallel_config(num_layers: int, batch: int, chunks: int) -> Dic... function _run_test (line 49) | def _run_test(test_args: Dict[str, Any]): function test_dp_correctness (line 229) | def test_dp_correctness(run_distributed, hf_arch, dp_size, checkpoint_dir): FILE: tests/models/test_moe_correctness.py class _PytestMarkStub (line 10) | class _PytestMarkStub: method skipif (line 11) | def skipif(self, *args, **kwargs): method parametrize (line 14) | def parametrize(self, *args, **kwargs): method __getattr__ (line 19) | def __getattr__(self, _name): class _PytestStub (line 24) | class _PytestStub: function _dp_parallel_config (line 58) | def _dp_parallel_config(num_layers: int, batch: int, chunks: int) -> Dic... function _run_test (line 81) | def _run_test(test_args: Dict[str, Any]): function test_dp_correctness (line 226) | def test_dp_correctness(run_distributed, dp_size, checkpoint_dir): FILE: tests/profiler/test_hardware_profile.py function base_profiler (line 9) | def base_profiler(profiler_hardware_configs_dir): function _count_torchrun_blocks (line 15) | def _count_torchrun_blocks(scripts_dir: str, filename: str) -> int: function test_torch_hardware_profile (line 32) | def test_torch_hardware_profile( FILE: tests/profiler/test_model_profile.py function _reset_profiler_caches (line 19) | def _reset_profiler_caches(profiler): function base_profiler (line 27) | def base_profiler(profiler_model_configs_dir): function test_get_seq_list (line 42) | def test_get_seq_list(base_profiler, mode, expected_seq_list, config): function test_get_bsz_list (line 60) | def test_get_bsz_list(base_profiler, mode, expected_bsz_list, config): function test_launch_profiling_scripts (line 89) | def test_launch_profiling_scripts(base_profiler, profile_type, profile_m... function test_process_computation_profiled_data (line 132) | def test_process_computation_profiled_data(base_profiler, profiler_model... function test_process_memory_profiled_data (line 171) | def test_process_memory_profiled_data(base_profiler, profiler_model_conf... FILE: tests/profiler/test_runtime_profile.py function mock_distributed (line 8) | def mock_distributed(): function base_profiler (line 16) | def base_profiler(profiler_model_configs_dir): function test_profile_memory_stages (line 28) | def test_profile_memory_stages(base_profiler, stage, expected_keys): function test_post_profile_memory (line 56) | def test_post_profile_memory(base_profiler, pipeline_type, expected_keys): function test_post_profile_memory_with_save (line 83) | def test_post_profile_memory_with_save(base_profiler): class MockCUDAEvent (line 114) | class MockCUDAEvent: method __init__ (line 119) | def __init__(self): method record (line 122) | def record(self): method elapsed_time (line 126) | def elapsed_time(self, end): function test_profile_time_start_normal (line 130) | def test_profile_time_start_normal(base_profiler): function test_profile_time_start_with_save (line 149) | def test_profile_time_start_with_save(base_profiler): function test_profile_time_end_with_loss (line 169) | def test_profile_time_end_with_loss(base_profiler): function test_profile_time_python (line 205) | def test_profile_time_python(base_profiler): FILE: tests/search_engine/test_bsz_utils.py function base_engine (line 8) | def base_engine(): function test_settle_bsz (line 20) | def test_settle_bsz(base_engine): function test_normal_bsz_range (line 31) | def test_normal_bsz_range(base_engine): function test_bsz_range_with_different_scales (line 46) | def test_bsz_range_with_different_scales(base_engine, min_bsz, max_bsz, ... function test_max_bsz_adjustment (line 70) | def test_max_bsz_adjustment(base_engine): function test_min_bsz_smaller_than_scale (line 80) | def test_min_bsz_smaller_than_scale(base_engine): FILE: tests/search_engine/test_generate_strategies.py function test_generate_strategies (line 10) | def test_generate_strategies(model_type, tmp_path, disables, capsys): FILE: tests/search_engine/test_get_configs.py function _build_hf_test_args (line 15) | def _build_hf_test_args(config_json, time_mode): function _promote_profile_filenames_to_all (line 30) | def _promote_profile_filenames_to_all(configs_dir: Path, precision: str,... function test_config_loading (line 52) | def test_config_loading(base_config_dirs, model_type, time_mode, memory_... function test_hardware_config_loading (line 120) | def test_hardware_config_loading(base_config_dirs, num_nodes, gpus_per_n... FILE: tests/search_engine/test_initialize.py function test_set_cost_models (line 15) | def test_set_cost_models(base_config_dirs, base_log_dirs, model_type, ti... FILE: tests/search_engine/test_parallelsim_optimization.py function test_basic_search_flow (line 15) | def test_basic_search_flow(base_config_dirs, base_log_dirs, idx, model_t... FILE: tests/search_engine/test_strategy_utils.py class TestDPType (line 38) | class TestDPType: method test_enum_values (line 39) | def test_enum_values(self): method test_values_returns_all_members (line 44) | def test_values_returns_all_members(self): method test_contains_true (line 48) | def test_contains_true(self): method test_contains_false (line 52) | def test_contains_false(self): method test_lt_ordering (line 55) | def test_lt_ordering(self): method test_lt_type_error (line 61) | def test_lt_type_error(self): class TestColorSet (line 69) | class TestColorSet: method test_ansi_codes_exist (line 70) | def test_ansi_codes_exist(self): class TestEmbeddingLMHeadStrategy (line 81) | class TestEmbeddingLMHeadStrategy: method test_default_values (line 82) | def test_default_values(self): method test_auto_reset_dp_type_when_sdp_is_1 (line 92) | def test_auto_reset_dp_type_when_sdp_is_1(self): method test_dp_type_preserved_when_sdp_gt_1 (line 97) | def test_dp_type_preserved_when_sdp_gt_1(self): method test_tp_and_sp_mutual_exclusion (line 101) | def test_tp_and_sp_mutual_exclusion(self): method test_world_size (line 105) | def test_world_size(self): method test_sdp_size (line 109) | def test_sdp_size(self): method test_tp_sp_size_with_tp (line 113) | def test_tp_sp_size_with_tp(self): method test_tp_sp_size_with_sp (line 117) | def test_tp_sp_size_with_sp(self): method test_equality_same (line 121) | def test_equality_same(self): method test_equality_different (line 126) | def test_equality_different(self): method test_equality_different_type (line 131) | def test_equality_different_type(self): method test_hash_consistency (line 135) | def test_hash_consistency(self): method test_hash_usable_in_set (line 140) | def test_hash_usable_in_set(self): method test_lt (line 145) | def test_lt(self): method test_lt_not_implemented_for_different_types (line 151) | def test_lt_not_implemented_for_different_types(self): method test_to_string (line 155) | def test_to_string(self): method test_str (line 161) | def test_str(self): method test_to_simple_string_basic (line 166) | def test_to_simple_string_basic(self): method test_to_simple_string_with_tp (line 171) | def test_to_simple_string_with_tp(self): method test_to_simple_string_zero3 (line 176) | def test_to_simple_string_zero3(self): method test_to_simple_string_with_sp (line 181) | def test_to_simple_string_with_sp(self): class TestAttentionStrategy (line 191) | class TestAttentionStrategy: method test_default_checkpoint_false (line 192) | def test_default_checkpoint_false(self): method test_inherits_embedding_fields (line 196) | def test_inherits_embedding_fields(self): method test_to_embedding_lmhead_strategy (line 201) | def test_to_embedding_lmhead_strategy(self): method test_to_ffn_strategy (line 209) | def test_to_ffn_strategy(self): method test_to_layer_strategy (line 216) | def test_to_layer_strategy(self): method test_hash (line 222) | def test_hash(self): method test_to_simple_string_with_checkpoint (line 227) | def test_to_simple_string_with_checkpoint(self): class TestFFNStrategy (line 236) | class TestFFNStrategy: method test_default_checkpoint (line 237) | def test_default_checkpoint(self): method test_to_embedding_lmhead_strategy (line 241) | def test_to_embedding_lmhead_strategy(self): method test_hash (line 247) | def test_hash(self): class TestLayerStrategy (line 256) | class TestLayerStrategy: method test_default_checkpoint (line 257) | def test_default_checkpoint(self): method test_to_embedding_lmhead_strategy (line 261) | def test_to_embedding_lmhead_strategy(self): method test_hash (line 267) | def test_hash(self): class TestMoEFFNStrategy (line 277) | class TestMoEFFNStrategy: method test_default_values (line 278) | def test_default_values(self): method test_auto_reset_dp_type_when_dp_is_1 (line 288) | def test_auto_reset_dp_type_when_dp_is_1(self): method test_dp_type_preserved_when_dp_gt_1 (line 292) | def test_dp_type_preserved_when_dp_gt_1(self): method test_world_size (line 296) | def test_world_size(self): method test_sdp_size (line 300) | def test_sdp_size(self): method test_equality (line 304) | def test_equality(self): method test_inequality (line 309) | def test_inequality(self): method test_equality_different_type (line 314) | def test_equality_different_type(self): method test_lt (line 318) | def test_lt(self): method test_lt_not_implemented (line 323) | def test_lt_not_implemented(self): method test_hash (line 327) | def test_hash(self): method test_str (line 332) | def test_str(self): class TestIsPowerOfTwo (line 341) | class TestIsPowerOfTwo: method test_powers_of_two (line 343) | def test_powers_of_two(self, n): method test_not_powers_of_two (line 347) | def test_not_powers_of_two(self, n): class TestConstants (line 351) | class TestConstants: method test_byte_to_MB (line 352) | def test_byte_to_MB(self): method test_model_states_ratio (line 355) | def test_model_states_ratio(self): class TestOldToNewVersionStrategy (line 362) | class TestOldToNewVersionStrategy: method test_basic_ddp (line 363) | def test_basic_ddp(self): method test_with_fsdp (line 376) | def test_with_fsdp(self): method test_with_checkpoint (line 382) | def test_with_checkpoint(self): method test_with_sp (line 387) | def test_with_sp(self): method test_default_zero2 (line 393) | def test_default_zero2(self): method test_dp_size_1_forces_ddp (line 398) | def test_dp_size_1_forces_ddp(self): class TestNewToOldVersionStrategy (line 404) | class TestNewToOldVersionStrategy: method test_basic_roundtrip_ddp (line 405) | def test_basic_roundtrip_ddp(self): method test_fsdp_flag (line 413) | def test_fsdp_flag(self): method test_tp_flag (line 418) | def test_tp_flag(self): method test_sp_flag (line 425) | def test_sp_flag(self): method test_checkpoint_flag (line 431) | def test_checkpoint_flag(self): class TestPrintStrategyList (line 440) | class TestPrintStrategyList: method test_none_input (line 441) | def test_none_input(self, capsys): method test_prints_strategies (line 447) | def test_prints_strategies(self, capsys): method test_with_logger (line 457) | def test_with_logger(self): class TestStrategyList2Config (line 476) | class TestStrategyList2Config: method test_empty_list (line 477) | def test_empty_list(self): method test_single_layer (line 480) | def test_single_layer(self): method test_multiple_layers (line 492) | def test_multiple_layers(self): method test_all_zero3 (line 506) | def test_all_zero3(self): FILE: tests/test_arguments.py function test_load_with_hydra_train_dist_runtime_matches_yaml (line 26) | def test_load_with_hydra_train_dist_runtime_matches_yaml(): function test_load_with_hydra_train_dist_overrides (line 62) | def test_load_with_hydra_train_dist_overrides(): function test_profiler_args_defaults (line 74) | def test_profiler_args_defaults(): function test_profiler_hardware_args_defaults (line 89) | def test_profiler_hardware_args_defaults(): function test_search_engine_args_defaults (line 105) | def test_search_engine_args_defaults(): FILE: tests/utils.py function init_dist_env (line 3) | def init_dist_env(): FILE: tests/utils/init_dist.py function init_dist_env (line 5) | def init_dist_env(): FILE: tests/utils/model_utils.py class ModelFactory (line 7) | class ModelFactory: method _get_yaml_dir (line 32) | def _get_yaml_dir() -> str: method _resolve_yaml_path (line 36) | def _resolve_yaml_path(model_type: str) -> str: method resolve_model_config (line 45) | def resolve_model_config(args: Union[GalvatronRuntimeArgs, GalvatronSe... method get_test_config (line 60) | def get_test_config(model_type: str) -> Dict[str, Any]: method get_model_layer_configs (line 84) | def get_model_layer_configs(args: Union[GalvatronRuntimeArgs, Galvatro... method get_model_name (line 90) | def get_model_name(args: Union[GalvatronRuntimeArgs, GalvatronSearchAr... method get_model_layer_configs_func (line 96) | def get_model_layer_configs_func() -> Callable: method get_model_name_func (line 102) | def get_model_name_func() -> Callable: FILE: tests/utils/parallel_config.py class ParallelConfig (line 6) | class ParallelConfig: method to_dict (line 21) | def to_dict(self): FILE: tests/utils/profiler_configs.py function create_computation_static_config (line 5) | def create_computation_static_config() -> Dict[str, float]: function create_computation_batch_config (line 12) | def create_computation_batch_config() -> Dict[str, float]: function create_computation_sequence_config (line 37) | def create_computation_sequence_config() -> Dict[str, float]: function create_memory_static_config (line 58) | def create_memory_static_config() -> Dict: function create_memory_static_config_sp (line 239) | def create_memory_static_config_sp() -> Dict: function create_memory_sequence_config_sp (line 420) | def create_memory_sequence_config_sp() -> Dict: function save_profiler_configs (line 613) | def save_profiler_configs( FILE: tests/utils/profiler_utils.py function initialize_model_profile_profiler (line 7) | def initialize_model_profile_profiler(profiler_model_configs_dir, model_... function initialize_hardware_profile_profiler (line 33) | def initialize_hardware_profile_profiler(profiler_hardware_configs_dir): function initialize_runtime_profile_profiler (line 41) | def initialize_runtime_profile_profiler(profiler_model_configs_dir, mode... FILE: tests/utils/runtime_args.py class TestRuntimeArgs (line 15) | class TestRuntimeArgs(GalvatronRuntimeArgs): method padded_vocab_size (line 22) | def padded_vocab_size(self): method hidden_size (line 26) | def hidden_size(self): method num_attention_heads (line 30) | def num_attention_heads(self): method seq_length (line 34) | def seq_length(self): method kv_channels (line 38) | def kv_channels(self): method group_query_attention (line 42) | def group_query_attention(self): method num_query_groups (line 47) | def num_query_groups(self): function _ensure_config_path (line 56) | def _ensure_config_path(config): function make_test_args (line 68) | def make_test_args( FILE: tests/utils/search_args.py class SearchArgs (line 4) | class SearchArgs: method __init__ (line 6) | def __init__(self): FILE: tests/utils/search_configs.py function create_static_time_config (line 10) | def create_static_time_config() -> Dict[str, float]: function create_batch_time_config (line 17) | def create_batch_time_config() -> Dict[str, float]: function create_sequence_time_config (line 42) | def create_sequence_time_config() -> Dict[str, float]: function create_static_memory_config (line 63) | def create_static_memory_config(): function create_static_memory_config_sp (line 124) | def create_static_memory_config_sp(): function create_sequence_memory_config_sp (line 189) | def create_sequence_memory_config_sp(): function create_hardware_configs (line 462) | def create_hardware_configs(): function write_time_config (line 550) | def write_time_config( function write_memory_config (line 569) | def write_memory_config( function write_hardware_config (line 587) | def write_hardware_config( function _auto_update_nested_args (line 612) | def _auto_update_nested_args(model: BaseModel, flat_updates: Dict) -> Ba... function initialize_search_engine (line 647) | def initialize_search_engine(base_config_dirs, base_log_dirs, model_type...