SYMBOL INDEX (5871 symbols across 514 files) FILE: .github/scripts/oncall_manager.py function get_headers (line 37) | def get_headers(): function get_repo_info (line 52) | def get_repo_info(): function get_team_members (line 61) | def get_team_members(org, team_slug): function get_user_email (line 85) | def get_user_email(username): function get_slack_client (line 150) | def get_slack_client(): function get_slack_user_id (line 158) | def get_slack_user_id(slack_client, email): function get_slack_usergroup_id (line 176) | def get_slack_usergroup_id(slack_client, handle): function update_slack_usergroup (line 192) | def update_slack_usergroup(new_oncall_username, old_members_usernames): function load_schedule (line 238) | def load_schedule(): function save_schedule (line 255) | def save_schedule(schedule): function update_active_oncall_team (line 260) | def update_active_oncall_team(org, new_oncall): function rotate_schedule (line 289) | def rotate_schedule(repo_owner, dry_run=False): function get_last_wednesday (line 339) | def get_last_wednesday(): function ensure_schedule_filled (line 345) | def ensure_schedule_filled(schedule, repo_owner): function assign_reviewer (line 393) | def assign_reviewer(pr_number): function main (line 408) | def main(): FILE: .github/scripts/sync_team_usergroups.py function get_headers (line 45) | def get_headers(): function get_org (line 61) | def get_org(): function github_team_to_slack_usergroup (line 67) | def github_team_to_slack_usergroup(team_slug): function get_child_teams (line 100) | def get_child_teams(org, parent_team_slug): function get_team_members (line 139) | def get_team_members(org, team_slug): function get_user_email (line 167) | def get_user_email(username): function get_slack_client (line 233) | def get_slack_client(): function get_slack_user_id (line 242) | def get_slack_user_id(slack_client, email): function fetch_all_usergroups (line 261) | def fetch_all_usergroups(slack_client): function get_slack_usergroup_id (line 291) | def get_slack_usergroup_id(slack_client, handle): function github_team_to_usergroup_name (line 301) | def github_team_to_usergroup_name(team_slug): function create_slack_usergroup (line 312) | def create_slack_usergroup(slack_client, handle, team_slug): function sync_team_to_usergroup (line 356) | def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False): function get_team_to_usergroup_mapping (line 447) | def get_team_to_usergroup_mapping(parent_team_slug): function sync_all_teams (line 464) | def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None): function main (line 520) | def main(): FILE: .gitlab/scripts/check_imports.py class ImportChecker (line 31) | class ImportChecker: method __init__ (line 34) | def __init__(self, package_name: str = "megatron.core", verbose: bool ... method should_skip_module (line 59) | def should_skip_module(self, module_name: str) -> bool: method discover_modules (line 66) | def discover_modules(self, package_path: str) -> List[str]: method import_module (line 102) | def import_module(self, module_name: str) -> Tuple[str, str]: method check_all_imports (line 123) | def check_all_imports(self): function main (line 200) | def main(package_name: str): FILE: docs/add_copyright_header.py function main (line 15) | def main(): FILE: docs/autodoc2_docstrings_parser.py class NapoleonParser (line 20) | class NapoleonParser(MystParser): method parse (line 23) | def parse(self, input_string: str, document: nodes.document) -> None: FILE: examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py function get_corpus_scores (line 24) | def get_corpus_scores(lines): function main (line 37) | def main(): FILE: examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py class PerspectiveApiScorer (line 24) | class PerspectiveApiScorer: method __init__ (line 31) | def __init__(self): method get_scores (line 44) | def get_scores(self, input_text: str, requested_attributes: Optional[L... function test (line 73) | def test(): function split_lines (line 79) | def split_lines(lines, split): function get_score (line 88) | def get_score(line): function get_scores (line 118) | def get_scores(lines): function get_annotated_datasets (line 150) | def get_annotated_datasets(lines, threads=10): function main (line 160) | def main(): FILE: examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py function model_provider (line 28) | def model_provider(pre_process=True, post_process=True): function get_batch (line 41) | def get_batch(data_iterator): function loss_func (line 72) | def loss_func(loss_mask, output_tensor): function forward_step (line 83) | def forward_step(data_iterator, model): function train_valid_test_datasets_provider (line 100) | def train_valid_test_datasets_provider(train_val_test_num_samples): function add_validation_args (line 141) | def add_validation_args(parser): FILE: examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py function model_provider (line 29) | def model_provider(pre_process=True, post_process=True) -> Union[GPTMode... function add_text_generate_args (line 93) | def add_text_generate_args(parser): function generate_samples_unconditional (line 119) | def generate_samples_unconditional(model): function generate_samples_conditional (line 156) | def generate_samples_conditional(model): function generate_and_write_samples_unconditional (line 209) | def generate_and_write_samples_unconditional(model): function generate_and_write_samples_conditional (line 218) | def generate_and_write_samples_conditional(model): function main (line 232) | def main(): FILE: examples/academic_paper_scripts/detxoify_lm/perspective_api.py class PerspectiveApiScorer (line 24) | class PerspectiveApiScorer: method __init__ (line 31) | def __init__(self): method get_scores (line 44) | def get_scores(self, input_text: str, requested_attributes: Optional[L... function test (line 73) | def test(): function get_score (line 80) | def get_score(x): function main (line 92) | def main(): FILE: examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py function initialize_distributed (line 17) | def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_... function model_provider (line 29) | def model_provider(): function load_distributed_checkpoint (line 49) | def load_distributed_checkpoint(checkpoint_path, gpt_model): FILE: examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py function initialize_distributed (line 18) | def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_... function model_provider (line 30) | def model_provider(): function load_distributed_checkpoint (line 50) | def load_distributed_checkpoint(checkpoint_path, gpt_model): FILE: examples/gptoss/01_convert_from_hf.py function _parse_args (line 10) | def _parse_args(): FILE: examples/gptoss/03_convert_to_hf.py function _parse_args (line 10) | def _parse_args(): FILE: examples/inference/gpt/gpt_dynamic_inference.py function run_inference (line 58) | def run_inference( function main (line 279) | def main(): FILE: examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py function suspend_resume_cycle (line 33) | async def suspend_resume_cycle(client, engine, args, futures): function main (line 49) | async def main( FILE: examples/inference/gpt/gpt_static_inference.py function add_static_inference_args (line 37) | def add_static_inference_args(parser): function get_inference_engine (line 55) | def get_inference_engine(args: Namespace, model: MegatronModule) -> Stat... function generate (line 84) | async def generate( function main (line 119) | def main(): FILE: examples/inference/gpt/utils.py function get_default_sampling_params (line 23) | def get_default_sampling_params(termination_id: int = None): function get_curr_time (line 34) | def get_curr_time() -> float: class Request (line 42) | class Request: method __init__ (line 57) | def __init__( method __str__ (line 81) | def __str__(self) -> str: function get_time_offsets (line 91) | def get_time_offsets( function get_cli_requests (line 136) | def get_cli_requests( function get_synthetic_requests (line 153) | def get_synthetic_requests( function get_requests_from_file (line 188) | def get_requests_from_file( function build_requests (line 230) | def build_requests( function get_model_size_str (line 244) | def get_model_size_str(model): function build_dynamic_engine_setup_prefix (line 253) | def build_dynamic_engine_setup_prefix( function get_global_peak_memory_stats_bytes (line 315) | def get_global_peak_memory_stats_bytes() -> dict: FILE: examples/inference/t5/simple_t5_batch_inference.py function add_text_generate_args (line 38) | def add_text_generate_args(parser): function get_inference_engine (line 70) | def get_inference_engine(args: Namespace, model: MegatronModule) -> Abst... function main (line 102) | def main(): FILE: examples/mimo/avlm_inference.py function init_distributed (line 23) | def init_distributed(tp_size: int = 1, pp_size: int = 1): function get_input_data (line 32) | def get_input_data( function main (line 129) | def main(): function load_distributed_checkpoint (line 213) | def load_distributed_checkpoint(model: torch.nn.Module, ckpt_dir: str): FILE: examples/mimo/configs/llava_avlm.py function get_llava_projection_config (line 25) | def get_llava_projection_config( function get_vicuna_language_layer_spec (line 46) | def get_vicuna_language_layer_spec() -> ModuleSpec: function get_llava_projection_layer_spec (line 50) | def get_llava_projection_layer_spec() -> ModuleSpec: FILE: examples/mimo/configs/llava_vlm.py function get_vicuna_language_model_config (line 24) | def get_vicuna_language_model_config( function get_llava_projection_config (line 82) | def get_llava_projection_config( function get_vicuna_language_layer_spec (line 103) | def get_vicuna_language_layer_spec() -> ModuleSpec: function get_llava_projection_layer_spec (line 107) | def get_llava_projection_layer_spec() -> ModuleSpec: FILE: examples/mimo/configs/mock.py function get_mock_language_model_config (line 28) | def get_mock_language_model_config(config: Optional[TransformerConfig] =... function get_mock_vision_model_config (line 47) | def get_mock_vision_model_config(config: Optional[TransformerConfig] = N... function get_mock_projection_config (line 76) | def get_mock_projection_config(hidden_size: int = 128) -> TransformerCon... function get_mock_language_layer_spec (line 97) | def get_mock_language_layer_spec(): function get_mock_vision_layer_spec (line 107) | def get_mock_vision_layer_spec(): function get_mock_projection_layer_spec (line 120) | def get_mock_projection_layer_spec(): FILE: examples/mimo/data/avlm_sample_loader.py function sample_loader (line 3) | def sample_loader(raw: dict) -> dict: function part_filter (line 85) | def part_filter(part: str) -> bool: FILE: examples/mimo/data/energon_avlm_task_encoder.py class ConversationTemplateConfig (line 50) | class ConversationTemplateConfig: class LlavaConversationTemplateConfig (line 56) | class LlavaConversationTemplateConfig(ConversationTemplateConfig): class VisionAudioQASample (line 64) | class VisionAudioQASample(VQASample): class AVLMModelType (line 74) | class AVLMModelType(Enum): class AVLMTaskEncoder (line 78) | class AVLMTaskEncoder( method __init__ (line 86) | def __init__( method apply_prompt_template (line 100) | def apply_prompt_template(self, input_text: VisionAudioQASample): method _find_pattern_indices (line 157) | def _find_pattern_indices( method encode_sample (line 169) | def encode_sample(self, sample: VisionAudioQASample): method batch (line 289) | def batch(self, samples: List[Dict]) -> Dict: method encode_batch_avlm_clip_whisper_llava (line 314) | def encode_batch_avlm_clip_whisper_llava(self, batch_data: Dict) -> Dict: method encode_batch (line 351) | def encode_batch(self, batch_data: Dict) -> dict: function llava_avlm_dataloader_provider (line 358) | def llava_avlm_dataloader_provider(train_val_test_num_samples): class KeyProcessor (line 445) | class KeyProcessor(Protocol): method __call__ (line 448) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor: # pra... class StackProcessor (line 452) | class StackProcessor: method __init__ (line 455) | def __init__(self, dim: int = 0): method __call__ (line 458) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor: class PaddingProcessor (line 462) | class PaddingProcessor: method __init__ (line 465) | def __init__(self, pad_value: int, batch_first: bool = True): method __call__ (line 469) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor: FILE: examples/mimo/data/energon_vlm_task_encoder.py class ConversationTemplateConfig (line 44) | class ConversationTemplateConfig: class LlavaConversationTemplateConfig (line 51) | class LlavaConversationTemplateConfig(ConversationTemplateConfig): class ModelType (line 57) | class ModelType(Enum): function predict_seq_len_with_padding (line 61) | def predict_seq_len_with_padding(instance_tokens: torch.Tensor, pad_to_m... function group_samples (line 73) | def group_samples(samples: List[Dict[str, torch.Tensor]], class VLMTaskEncoder (line 110) | class VLMTaskEncoder( method __init__ (line 118) | def __init__( method apply_prompt_template (line 145) | def apply_prompt_template(self, input_text: VQASample): method _find_pattern_indices (line 202) | def _find_pattern_indices( method select_samples_to_pack (line 213) | def select_samples_to_pack(self, samples: List[Dict[str, torch.Tensor]... method pack_selected_samples (line 237) | def pack_selected_samples(self, samples: List[Dict[str, torch.Tensor]]... method encode_sample (line 335) | def encode_sample(self, sample: VQASample): method batch (line 385) | def batch(self, samples: List[Dict]) -> Dict: method encode_batch_vlm_clip_llava (line 437) | def encode_batch_vlm_clip_llava(self, batch_data: Dict) -> Dict: method encode_batch_vlm_clip_llava_video (line 469) | def encode_batch_vlm_clip_llava_video(self, batch_data: Dict) -> Dict: method encode_batch (line 494) | def encode_batch(self, batch_data: Dict) -> dict: function llava_vlm_dataloader_provider (line 502) | def llava_vlm_dataloader_provider(train_val_test_num_samples, max_seq_le... class KeyProcessor (line 573) | class KeyProcessor(Protocol): method __call__ (line 576) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ... class StackProcessor (line 580) | class StackProcessor: method __init__ (line 583) | def __init__(self, dim: int = 0): method __call__ (line 586) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ... class PaddingProcessor (line 594) | class PaddingProcessor: method __init__ (line 597) | def __init__(self, pad_value: int, batch_first: bool = True): method _pad_and_stack (line 601) | def _pad_and_stack(self, tensors: List[torch.Tensor], max_len: int, pa... method __call__ (line 616) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ... class PackingKwargsProcessor (line 624) | class PackingKwargsProcessor: method __call__ (line 627) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ... class GenericStackProcessor (line 633) | class GenericStackProcessor: method __init__ (line 635) | def __init__(self, dim: int = 0): method __call__ (line 638) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ... FILE: examples/mimo/data/mock.py function create_mock_image (line 15) | def create_mock_image(image_size: int = 336) -> torch.Tensor: function create_mock_caption (line 28) | def create_mock_caption() -> str: class MockVLMDataset (line 38) | class MockVLMDataset(Dataset): method __init__ (line 41) | def __init__( method __len__ (line 81) | def __len__(self) -> int: method __getitem__ (line 85) | def __getitem__(self, idx: int) -> Dict: method _mock_tokenize (line 134) | def _mock_tokenize(self) -> torch.Tensor: function get_mock_vlm_dataloader (line 165) | def get_mock_vlm_dataloader( function _collate_fn (line 211) | def _collate_fn(batch: List[Dict]) -> Dict[str, torch.Tensor]: function train_valid_test_datasets_provider (line 240) | def train_valid_test_datasets_provider(train_val_test_num_samples): FILE: examples/mimo/data/prepare_video_llava_data.py function _extract_archives (line 11) | def _extract_archives(root: str): function convert_llava_video_to_wds (line 23) | def convert_llava_video_to_wds(dataset_root: str, shard_size: int = 8000): FILE: examples/mimo/data/utils/calculate_audio_tokens.py function calculate_num_mel_frames (line 18) | def calculate_num_mel_frames(audio_length, sample_rate, window_stride, w... function calculate_num_audio_tokens (line 44) | def calculate_num_audio_tokens(audio_tensor, model_name): FILE: examples/mimo/model_providers/hf_clip_encoder.py class HFCLIPEncoderWrapper (line 10) | class HFCLIPEncoderWrapper(torch.nn.Module): method __init__ (line 13) | def __init__(self, feature_layer_index=-2, is_video_input: bool = False): method forward (line 30) | def forward(self, pixel_values: torch.Tensor): FILE: examples/mimo/model_providers/hf_whisper_encoder.py class HFWhisperEncoderWrapper (line 6) | class HFWhisperEncoderWrapper(torch.nn.Module): method __init__ (line 9) | def __init__(self, model_name: str): method forward (line 13) | def forward(self, input_features, seq_lengths=None): FILE: examples/mimo/model_providers/llava_avlm.py function model_provider_llava_avlm (line 31) | def model_provider_llava_avlm( FILE: examples/mimo/model_providers/llava_vlm.py function model_provider_llava_vlm (line 29) | def model_provider_llava_vlm( FILE: examples/mimo/model_providers/mock.py function model_provider_mock_vlm_single_encoder (line 28) | def model_provider_mock_vlm_single_encoder( FILE: examples/mimo/train.py function add_mimo_args (line 52) | def add_mimo_args(parser): function get_batch (line 86) | def get_batch(data_iterator: Iterator[Dict[str, Any]]): function loss_func (line 139) | def loss_func(loss_mask, output_tensor): function forward_step (line 176) | def forward_step(data_iterator, model): function train_valid_test_datasets_provider (line 193) | def train_valid_test_datasets_provider(*provider_args, **provider_kwargs): function model_provider (line 219) | def model_provider( FILE: examples/mimo/utils/data_helpers.py function flatten (line 15) | def flatten( function regroup (line 32) | def regroup(flat: List[Tuple[Tuple[str, ...], torch.Tensor]]) -> Dict[st... function broadcast_nested_data_batch (line 43) | def broadcast_nested_data_batch(nested_dict: Dict[str, Any]) -> Dict[str... FILE: examples/mimo/utils/logging.py function print_mimo_structure (line 9) | def print_mimo_structure(model): FILE: examples/mimo/utils/model_helpers.py function load_submodule_ckpt (line 10) | def load_submodule_ckpt(module: torch.nn.Module, ckpt_dir: str): FILE: examples/multimodal/combine_state_dicts.py function combine (line 15) | def combine(input_files, module_prefixes, output_files): FILE: examples/multimodal/config.py function get_language_model_config (line 9) | def get_language_model_config(config): function get_vision_model_config (line 179) | def get_vision_model_config(config, apply_query_key_layer_scaling): function get_vision_projection_config (line 334) | def get_vision_projection_config(config, hidden_size): class EvaluationConfig (line 393) | class EvaluationConfig: FILE: examples/multimodal/dataloader_provider.py function datasets_provider (line 27) | def datasets_provider(task_encoder,worker_config=None): function is_first_or_last_stage (line 71) | def is_first_or_last_stage(pp_size): function is_dataloader_rank (line 84) | def is_dataloader_rank(): function train_valid_test_dataloaders_provider (line 95) | def train_valid_test_dataloaders_provider(train_val_test_num_samples, ta... class EnergonDataloader (line 152) | class EnergonDataloader: method __init__ (line 154) | def __init__(self, dataloader): method __next__ (line 158) | def __next__(self): method __iter__ (line 161) | def __iter__(self): method save_state (line 164) | def save_state(self): function cyclic_iter (line 168) | def cyclic_iter(iter): FILE: examples/multimodal/dataset_helpers.py class ImageTaskSample (line 36) | class ImageTaskSample(Sample): class ImageTaskSamplePacked (line 50) | class ImageTaskSamplePacked(Sample): class ImageTaskBatchPacked (line 72) | class ImageTaskBatchPacked(Batch): function search_for_fit (line 95) | def search_for_fit(numbers: List[int], capacity: int) -> int: function greedy_knapsack (line 103) | def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: ... class TaskEncoder (line 145) | class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatc... method __init__ (line 148) | def __init__( method _get_total_seq_length (line 195) | def _get_total_seq_length(self, input_ids, num_tiles): method _truncate_for_packing (line 202) | def _truncate_for_packing(self, input_ids, target, num_tiles): method encode_sample (line 219) | def encode_sample(self, sample: Union[CaptioningSample, OCRSample, VQA... method encode_captioning (line 247) | def encode_captioning(self, sample: CaptioningSample): method encode_llava_pretrain (line 293) | def encode_llava_pretrain(self, sample: VQASample): method encode_sample_list (line 327) | def encode_sample_list(self, samples: SampleListSample): method encode_llava_sft (line 347) | def encode_llava_sft(self, sample: Union[SimilarityInterleavedSample, ... method target_has_trainable_tokens (line 533) | def target_has_trainable_tokens(self, input_ids, num_tiles, target): method replace_value_with_repetition (line 552) | def replace_value_with_repetition(self, arr, token_to_replace, num_rep... method encode_any_single_turn_vqa (line 581) | def encode_any_single_turn_vqa(self, sample): method combined_ocr_encoder (line 663) | def combined_ocr_encoder(self, sample, task_type): method encode_pdf_prompt (line 703) | def encode_pdf_prompt(self, sample: OCRSample) -> ImageTaskSample: method encode_ocr_ref_prompt (line 724) | def encode_ocr_ref_prompt(self, sample: OCRSample) -> ImageTaskSample: method bbox_coord_to_label (line 758) | def bbox_coord_to_label(self, text, bbox): method encode_ocr_prompt (line 772) | def encode_ocr_prompt(self, sample: OCRSample) -> ImageTaskSample: method batch (line 791) | def batch(self, samples: List[Union[ImageTaskSample, ImageTaskSamplePa... method encode_batch (line 864) | def encode_batch(self, batch: ImageTaskBatchPacked) -> dict: method select_samples_to_pack (line 869) | def select_samples_to_pack(self, samples: List[ImageTaskSample]) -> Li... method pack_selected_samples (line 882) | def pack_selected_samples(self, samples: List[ImageTaskSample]) -> Lis... function print_error_handler (line 945) | def print_error_handler(exc: Exception, key: Optional[str]): function format_multichoice_question (line 953) | def format_multichoice_question(question, multichoice_options): function format_multichoice_answer (line 964) | def format_multichoice_answer(idx): FILE: examples/multimodal/energon_util.py class SampleListSample (line 10) | class SampleListSample(Sample): class OfflineTargetAspectRatioSample (line 21) | class OfflineTargetAspectRatioSample(Sample): FILE: examples/multimodal/evaluation/evaluate_ai2d.py function merge_input_files (line 8) | def merge_input_files(input_path): function ai2d_eval (line 38) | def ai2d_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_chartqa.py function merge_input_files (line 8) | def merge_input_files(input_path): function chartqa_eval (line 35) | def chartqa_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_coco.py function convert_to_coco_format (line 10) | def convert_to_coco_format(input_path): function coco_captioning_eval (line 40) | def coco_captioning_eval(input_path, groundtruth_file): FILE: examples/multimodal/evaluation/evaluate_infovqa.py function merge_input_files (line 8) | def merge_input_files(input_path): function infovqa_eval (line 35) | def infovqa_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_mathvista.py function merge_input_files (line 10) | def merge_input_files(input_path): function extra_processing (line 36) | def extra_processing(text): function extract_answer (line 60) | def extract_answer(text): function compute_mathvista_accuracy (line 74) | def compute_mathvista_accuracy(result_file): function mathvista_eval (line 108) | def mathvista_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_mmmu.py function get_input_output_paths (line 21) | def get_input_output_paths(input_path, task): function extract_answer (line 38) | def extract_answer(text): function convert_to_mmmu_format (line 55) | def convert_to_mmmu_format(input_path): function mmmu_eval (line 87) | def mmmu_eval(input_path, groundtruth_path): function main (line 113) | def main(): FILE: examples/multimodal/evaluation/evaluate_ocrbench.py function merge_input_files (line 7) | def merge_input_files(input_path): function compute_ocrbench_score (line 33) | def compute_ocrbench_score(result_file): function ocrbench_eval (line 123) | def ocrbench_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_ocrbench_v2.py function convert_to_ocrbench_v2_format (line 10) | def convert_to_ocrbench_v2_format(input_path, groundtruth_path): function ocrbench_v2_eval (line 37) | def ocrbench_v2_eval(input_path, groundtruth_path, output_path): function main (line 71) | def main(): FILE: examples/multimodal/evaluation/evaluate_rd_tablebench.py function convert_to_rdtablebench_format (line 22) | def convert_to_rdtablebench_format(input_path): function rdtablebench_eval (line 42) | def rdtablebench_eval(input_path): function main (line 67) | def main(): FILE: examples/multimodal/evaluation/evaluate_realworldqa.py function merge_input_files (line 8) | def merge_input_files(input_path): function realworldqa_eval (line 32) | def realworldqa_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_spdocvqa.py function merge_input_files (line 8) | def merge_input_files(input_path): function spdocvqa_eval (line 35) | def spdocvqa_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_textvqa.py function merge_input_files (line 8) | def merge_input_files(input_path): function textvqa_eval (line 38) | def textvqa_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_video_motionbench.py function merge_input_files (line 9) | def merge_input_files(input_path): function motionbench_eval (line 33) | def motionbench_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_video_mvbench.py function merge_input_files (line 7) | def merge_input_files(input_path): function check_ans (line 36) | def check_ans(pred, gt): function create_result_dict (line 53) | def create_result_dict(result_list): function combine_all_res (line 83) | def combine_all_res(acc_dict): function mvbench_eval (line 98) | def mvbench_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_video_phys_game_bench.py function merge_input_files (line 7) | def merge_input_files(input_path): function check_ans (line 35) | def check_ans(pred, gt): function compute_all_acc (line 52) | def compute_all_acc(result_list): function phys_game_bench_eval (line 83) | def phys_game_bench_eval(input_path): FILE: examples/multimodal/evaluation/evaluate_vqav2.py function levenshtein_distance (line 11) | def levenshtein_distance(s1: str, s2: str) -> int: function normalized_levenshtein_distance (line 29) | def normalized_levenshtein_distance(s1: str, s2: str) -> float: function similarity_function (line 34) | def similarity_function(prediction: str, gold_label: str, threshold: flo... function anls_score (line 38) | def anls_score( function merge_input_files (line 58) | def merge_input_files(input_path): function is_number (line 85) | def is_number(n: str): function compute_vqa_accuracy (line 94) | def compute_vqa_accuracy(result_file, task): function vqav2_eval (line 148) | def vqav2_eval(input_path): FILE: examples/multimodal/evaluation/evaluation_datasets.py function _get_partition_bounds (line 17) | def _get_partition_bounds( class VQADataset (line 28) | class VQADataset(torch.utils.data.Dataset): method __init__ (line 31) | def __init__( method __len__ (line 69) | def __len__(self): method __getitem__ (line 72) | def __getitem__(self, idx): class CaptioningDataset (line 110) | class CaptioningDataset(torch.utils.data.Dataset): method __init__ (line 113) | def __init__( method __len__ (line 150) | def __len__(self): method __getitem__ (line 153) | def __getitem__(self, idx): class MMMUDataset (line 179) | class MMMUDataset(torch.utils.data.Dataset): method __init__ (line 182) | def __init__( method __len__ (line 255) | def __len__(self): method process_image_tag (line 258) | def process_image_tag(self, q): method __getitem__ (line 307) | def __getitem__(self, idx): class VideoMMEDataset (line 452) | class VideoMMEDataset(torch.utils.data.Dataset): method __init__ (line 455) | def __init__( method __len__ (line 500) | def __len__(self): method __getitem__ (line 503) | def __getitem__(self, idx): class OCRBenchDataset (line 553) | class OCRBenchDataset(torch.utils.data.Dataset): method __init__ (line 556) | def __init__( method __len__ (line 587) | def __len__(self): method __getitem__ (line 590) | def __getitem__(self, idx): class MathVistaDataset (line 621) | class MathVistaDataset(torch.utils.data.Dataset): method __init__ (line 624) | def __init__( method __len__ (line 665) | def __len__(self): method __getitem__ (line 668) | def __getitem__(self, idx): class AI2DDataset (line 719) | class AI2DDataset(torch.utils.data.Dataset): method __init__ (line 722) | def __init__( method __len__ (line 756) | def __len__(self): method __getitem__ (line 759) | def __getitem__(self, idx): class RDTableBenchDataset (line 787) | class RDTableBenchDataset(torch.utils.data.Dataset): method __init__ (line 788) | def __init__( method __len__ (line 828) | def __len__(self): method __getitem__ (line 831) | def __getitem__(self, idx): class RealworldQADataset (line 865) | class RealworldQADataset(torch.utils.data.Dataset): method __init__ (line 866) | def __init__( method __len__ (line 899) | def __len__(self): method __getitem__ (line 902) | def __getitem__(self, idx): class MotionBenchDataset (line 952) | class MotionBenchDataset(torch.utils.data.Dataset): method __init__ (line 953) | def __init__( method __len__ (line 1007) | def __len__(self): method __getitem__ (line 1010) | def __getitem__(self, idx): class PhysGameBenchDataset (line 1057) | class PhysGameBenchDataset(torch.utils.data.Dataset): method __init__ (line 1058) | def __init__( method __len__ (line 1104) | def __len__(self): method _qa_template (line 1107) | def _qa_template(self, data): method __getitem__ (line 1116) | def __getitem__(self, idx): class MVBenchDataset (line 1167) | class MVBenchDataset(torch.utils.data.Dataset): method __init__ (line 1168) | def __init__( method __len__ (line 1245) | def __len__(self): method get_index (line 1248) | def get_index(self, bound, fps, max_frame, first_idx=0): method qa_template (line 1262) | def qa_template(self, data): method read_frame (line 1276) | def read_frame(self, video_path, bound=None, fps=2): method read_video_ours (line 1285) | def read_video_ours(self, video_path, bound=None): method __getitem__ (line 1299) | def __getitem__(self, idx): class ExampleInferenceDataset (line 1342) | class ExampleInferenceDataset(torch.utils.data.Dataset): method __init__ (line 1343) | def __init__( method __len__ (line 1372) | def __len__(self): method __getitem__ (line 1375) | def __getitem__(self, idx): function get_evaluation_dataset (line 1408) | def get_evaluation_dataset( FILE: examples/multimodal/evaluation/mmmu_utils.py function load_yaml (line 58) | def load_yaml(file_path): function parse_img_path (line 68) | def parse_img_path(text): function process_single_sample (line 73) | def process_single_sample(data): function construct_prompt (line 98) | def construct_prompt(sample, config): function parse_multi_choice_response (line 151) | def parse_multi_choice_response(response, all_choices, index2ans): function check_is_number (line 206) | def check_is_number(string): function normalize_str (line 218) | def normalize_str(string): function extract_numbers (line 243) | def extract_numbers(string): function parse_open_response (line 266) | def parse_open_response(response): function eval_multi_choice (line 321) | def eval_multi_choice(gold_i, pred_i): function eval_open (line 338) | def eval_open(gold_i, pred_i): function evaluate (line 367) | def evaluate(samples): function calculate_ins_level_acc (line 393) | def calculate_ins_level_acc(results: Dict): function mmmu_main_eval (line 405) | def mmmu_main_eval(output_dict, task_cfg): FILE: examples/multimodal/image_processing.py function find_closest_aspect_ratio (line 31) | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height... function find_closest_area_weighted_aspect_ratio (line 47) | def find_closest_area_weighted_aspect_ratio(aspect_ratio, target_ratios,... class ImageTransform (line 65) | class ImageTransform: method __init__ (line 68) | def __init__(self, input_size, vision_model_type): method __call__ (line 72) | def __call__(self, img, img_h, img_w, use_tiling=False, max_num_tiles=... function dynamic_preprocess (line 88) | def dynamic_preprocess( function _build_transform (line 131) | def _build_transform(input_size, vision_model_type): FILE: examples/multimodal/layer_scaling.py function _bias_dropout_add_func_layer_scaling (line 10) | def _bias_dropout_add_func_layer_scaling(ls, x_with_bias, residual, prob... function bias_dropout_add_unfused_layer_scaling (line 24) | def bias_dropout_add_unfused_layer_scaling(ls, training): function get_bias_dropout_add_layer_scaling (line 33) | def get_bias_dropout_add_layer_scaling(ls, training, fused): class LayerScalingTransformerLayer (line 40) | class LayerScalingTransformerLayer(TransformerLayer): method __init__ (line 42) | def __init__(self, *args, **kwargs): FILE: examples/multimodal/layer_specs.py function get_layer_spec (line 54) | def get_layer_spec(is_vit, normalization) -> ModuleSpec: function get_layer_spec_te (line 98) | def get_layer_spec_te(is_vit=False, padding=False) -> ModuleSpec: function get_mamba_layer_spec_te (line 128) | def get_mamba_layer_spec_te(padding=False) -> ModuleSpec: function get_mlp_module_spec (line 187) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec: function get_norm_mlp_module_spec_te (line 198) | def get_norm_mlp_module_spec_te() -> ModuleSpec: FILE: examples/multimodal/model.py function model_provider (line 18) | def model_provider( function _get_tile_tags (line 238) | def _get_tile_tags(args, tokenizer): FILE: examples/multimodal/model_converter/clip_converter.py function convert (line 10) | def convert(download_root, output_path, tensor_parallel_size, use_te): FILE: examples/multimodal/model_converter/internvit_converter.py function convert (line 8) | def convert(model_name, output_path, tensor_parallel_size, use_te): FILE: examples/multimodal/model_converter/radio_converter.py function convert_radio_h (line 7) | def convert_radio_h(output_path, tensor_parallel_size, use_te, version): function convert_radio_g (line 127) | def convert_radio_g(output_path, tensor_parallel_size, use_te, version): function convert (line 279) | def convert(output_path, tensor_parallel_size, use_te, model_type, versi... FILE: examples/multimodal/model_converter/siglip_converter.py function convert (line 8) | def convert(output_path, tensor_parallel_size, use_te): FILE: examples/multimodal/model_converter/vision_model_tester.py function run_mcore_vision (line 24) | def run_mcore_vision(model_path): function run_hf_vision (line 74) | def run_hf_vision(model_name): function main (line 89) | def main(mcore_model, hf_model): FILE: examples/multimodal/multimodal_args.py function add_multimodal_extra_args (line 5) | def add_multimodal_extra_args(parser): FILE: examples/multimodal/nvlm/internvit.py class InternViTRMSNorm (line 61) | class InternViTRMSNorm(MegatronModule): method __init__ (line 63) | def __init__( method _norm (line 91) | def _norm(self, x, var): method forward (line 97) | def forward(self, x: torch.Tensor) -> torch.Tensor: method _gather_var (line 115) | def _gather_var(self, input_, max_dim): method sharded_state_dict (line 150) | def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}): function get_mlp_module_spec (line 163) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec: class InternViTSelfAttention (line 175) | class InternViTSelfAttention(SelfAttention): method __init__ (line 177) | def __init__( class InternViTTEDotProductAttention (line 214) | class InternViTTEDotProductAttention(TEDotProductAttention): method forward (line 218) | def forward(self, *args, **kwargs): function get_internvit_layer_spec (line 237) | def get_internvit_layer_spec(use_te) -> ModuleSpec: function get_internvit300M_layer_spec (line 263) | def get_internvit300M_layer_spec(use_te) -> ModuleSpec: FILE: examples/multimodal/nvlm/pp_checkpoint_converter.py function split (line 14) | def split(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_l... function combine (line 82) | def combine(input_dir, base_output_dir, input_pp, output_pp, num_tp, num... FILE: examples/multimodal/radio/radio_g.py function get_mlp_module_spec (line 54) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec: function get_norm_mlp_module_spec_te (line 65) | def get_norm_mlp_module_spec_te() -> ModuleSpec: function get_radio_g_layer_spec (line 75) | def get_radio_g_layer_spec(normalization) -> ModuleSpec: function get_radio_g_layer_spec_te (line 115) | def get_radio_g_layer_spec_te() -> ModuleSpec: FILE: examples/multimodal/run_text_generation.py function is_first_rank (line 46) | def is_first_rank(): function add_text_generation_args (line 54) | def add_text_generation_args(parser): function get_evaluation_dataloader (line 107) | def get_evaluation_dataloader( function generate_samples (line 156) | def generate_samples(model, config: EvaluationConfig, print_output): function get_evaluation_configs (line 365) | def get_evaluation_configs(config_path=None) -> Dict[str, EvaluationConf... function get_output_path (line 424) | def get_output_path(config, dp_rank): function generate_and_write_samples (line 439) | def generate_and_write_samples(model, config, print_output=True): class VLMForwardStep (line 457) | class VLMForwardStep(ForwardStep): method __init__ (line 460) | def __init__( method _forward (line 482) | def _forward(self, tokens, position_ids, attention_mask): method __call__ (line 493) | def __call__(self, tokens, position_ids, attention_mask): function get_conversation (line 536) | def get_conversation(task, question, metadata=None): function get_prompt_and_generated (line 648) | def get_prompt_and_generated(prompt_and_generation, prompt_format): function run_eval (line 690) | def run_eval(config, iteration=None): function run_evaluation_loop (line 804) | def run_evaluation_loop(model, configs, output_dir_override=None, iterat... function eval_tasks (line 843) | def eval_tasks(): FILE: examples/multimodal/train.py function get_batch (line 33) | def get_batch(data_iterator, image_token_index, img_seq_len): function get_ltor_masks_and_position_ids (line 152) | def get_ltor_masks_and_position_ids(input_ids, target, pad_token): function get_mask_start_and_end_idx (line 168) | def get_mask_start_and_end_idx(arr): function scaled_loss_func (line 193) | def scaled_loss_func(loss_mask, output_tensor): function loss_func (line 241) | def loss_func(loss_mask, output_tensor): function forward_step (line 254) | def forward_step(data_iterator, model: LLaVAModel): function llava_embedding_ranks (line 300) | def llava_embedding_ranks(pp_ranks): function llava_position_embedding_ranks (line 313) | def llava_position_embedding_ranks(pp_ranks): function run_online_eval (line 326) | def run_online_eval(model): function write_eval_to_tensorboard (line 353) | def write_eval_to_tensorboard(data, iteration, writer, walltime=None): function write_online_eval_to_tensorboard (line 363) | def write_online_eval_to_tensorboard(data, iteration, writer, walltime=N... FILE: examples/post_training/modelopt/convert_model.py function add_convert_args (line 39) | def add_convert_args(parser): function get_model (line 73) | def get_model(model_provider_func, model_type=ModelType.encoder_or_decod... function check_arguments (line 91) | def check_arguments(): FILE: examples/post_training/modelopt/export.py function add_modelopt_export_args (line 27) | def add_modelopt_export_args(parser): FILE: examples/post_training/modelopt/finetune.py function add_finetune_args (line 37) | def add_finetune_args(parser): function get_eos_id (line 46) | def get_eos_id(): class OfflineDataset (line 66) | class OfflineDataset(torch.utils.data.Dataset): method __init__ (line 67) | def __init__(self, data_dir: str, num_samples): method __len__ (line 77) | def __len__(self): method __getitem__ (line 80) | def __getitem__(self, idx): class SFTDataset (line 86) | class SFTDataset(torch.utils.data.Dataset): method _wildcard_get (line 112) | def _wildcard_get(cls, directory: Dict[str, Any], name: str, default_v... method __init__ (line 120) | def __init__( method __len__ (line 195) | def __len__(self): method __getitem__ (line 198) | def __getitem__(self, idx): method _process_and_pack_example (line 232) | def _process_and_pack_example(self): method _process_example (line 260) | def _process_example(self, example: Dict[str, Any]): method _to_conversation (line 305) | def _to_conversation(cls, question, response): method _sharegpt_to_openai_conversations (line 311) | def _sharegpt_to_openai_conversations(cls, data): method _special_to_openai_conversations (line 330) | def _special_to_openai_conversations(cls, data): function train_valid_test_sft_datasets_provider (line 335) | def train_valid_test_sft_datasets_provider(train_val_test_num_samples): function get_batch (line 377) | def get_batch(data_iterator): function non_loss_data_func (line 444) | def non_loss_data_func(model: GPTModel): function forward_step (line 455) | def forward_step(data_iterator, model: GPTModel): FILE: examples/post_training/modelopt/generate.py function add_generate_args (line 28) | def add_generate_args(parser): function check_arguments (line 41) | def check_arguments(): function mtbench_to_oai_chat (line 53) | def mtbench_to_oai_chat(example): function get_conversations (line 62) | def get_conversations(example): FILE: examples/post_training/modelopt/mmlu.py function add_mmlu_args (line 32) | def add_mmlu_args(parser): function get_all_subjects (line 45) | def get_all_subjects(): function format_example (line 108) | def format_example(example, include_answer: bool = True): function generate_prompt (line 120) | def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_... FILE: examples/post_training/modelopt/offline_feature_extract.py function add_extract_args (line 23) | def add_extract_args(parser): function extract_feature (line 32) | def extract_feature(dataset, model, output_dir, idx_start, idx_end): FILE: examples/post_training/modelopt/prune.py function add_prune_args (line 43) | def add_prune_args(parser): function check_arguments (line 125) | def check_arguments(args): function get_calib_dataloader (line 132) | def get_calib_dataloader(calib_size=1024, max_sequence_length=512): function get_params (line 142) | def get_params(model): function _custom_prompt_forward_loop_func (line 187) | def _custom_prompt_forward_loop_func(model): function _hf_dataset_forword_loop_func (line 202) | def _hf_dataset_forword_loop_func(model): FILE: examples/post_training/modelopt/quantize.py function add_text_generate_ptq_args (line 76) | def add_text_generate_ptq_args(parser): function check_arguments (line 145) | def check_arguments(): function _is_first_layers (line 157) | def _is_first_layers(name: str, num_layers: int = 1, num_layers_to_disab... function _is_last_layers (line 167) | def _is_last_layers(name: str, num_layers: int = 1, num_layers_to_disabl... function get_first_layers_disabled_config (line 177) | def get_first_layers_disabled_config(config, num_layers: int = 1, num_la... function get_last_layers_disabled_config (line 195) | def get_last_layers_disabled_config(config, num_layers: int = 1, num_lay... function get_modelopt_torch_quantization_config (line 213) | def get_modelopt_torch_quantization_config(): function get_calib_dataloader (line 270) | def get_calib_dataloader( function _custom_prompt_forward_loop_func (line 382) | def _custom_prompt_forward_loop_func(model): function _dataset_forward_loop_func (line 397) | def _dataset_forward_loop_func(model): FILE: examples/post_training/modelopt/validate.py function add_ar_validation_args (line 27) | def add_ar_validation_args(parser): function check_arguments (line 59) | def check_arguments(): function get_current_memory_info (line 71) | def get_current_memory_info(): function report_current_memory_info (line 82) | def report_current_memory_info(): FILE: examples/rl/benchmark_refit.py function add_benchmark_args (line 24) | def add_benchmark_args(parser): function model_provider (line 51) | def model_provider(pre_process=True, post_process=True, parallel_output=... function create_refit_service (line 67) | def create_refit_service(method): function print_config_summary (line 79) | def print_config_summary(args, src_config, dst_config, world_size, mode): function run_benchmark (line 94) | def run_benchmark(src_model, dst_model, refit_service, num_warmup, num_i... function print_results (line 129) | def print_results(timings): function benchmark_collocated (line 145) | def benchmark_collocated(): function benchmark_non_collocated (line 220) | def benchmark_non_collocated(): function main (line 316) | def main(): FILE: examples/rl/environments/countdown/countdown.py function extract_solution (line 6) | def extract_solution(solution_str: str, remove_prompt: bool = False): function validate_equation (line 28) | def validate_equation(equation_str, available_numbers): function evaluate_equation (line 44) | def evaluate_equation(equation_str): function compute_score (line 59) | def compute_score(solution_str, ground_truth, method='strict', format_sc... FILE: examples/rl/environments/countdown/countdown_agent.py class CountdownAgent (line 12) | class CountdownAgent(RewardOnlyAgent, HFDatasetAgent): method make_prefix (line 15) | def make_prefix(self, target, nums) -> str: method get_dataset (line 20) | def get_dataset(self, validation: bool = False): method evaluation_prompts (line 29) | async def evaluation_prompts( method get_prompt (line 38) | async def get_prompt(self, validation=False) -> tuple[str, dict]: method get_reward (line 43) | async def get_reward(self, response, golden: dict) -> float: FILE: examples/rl/environments/math/aime_agent.py class AIMEAgent (line 15) | class AIMEAgent(MathAgent): method get_dataset (line 18) | def get_dataset(self, validation: bool = False): method evaluation_prompts (line 22) | async def evaluation_prompts( method get_prompt (line 34) | async def get_prompt(self, validation=False) -> tuple[str, dict]: method get_reward (line 44) | async def get_reward(self, response, golden: dict) -> float: FILE: examples/rl/environments/math/bigmath_agent.py class BigMathAgent (line 16) | class BigMathAgent(MathAgent): method get_dataset (line 19) | def get_dataset(self, validation: bool = False): method evaluation_prompts (line 22) | async def evaluation_prompts( method get_prompt (line 31) | async def get_prompt(self, validation=False) -> tuple[str, dict]: method get_reward (line 37) | async def get_reward(self, response, golden: dict) -> float: FILE: examples/rl/environments/math/dapo_agent.py class DAPOAgent (line 15) | class DAPOAgent(MathAgent): method reformat_datum (line 18) | def reformat_datum(self, datum: dict) -> dict: method get_dataset (line 30) | def get_dataset(self, validation: bool = False): method evaluation_prompts (line 33) | async def evaluation_prompts( method get_prompt (line 43) | async def get_prompt(self, validation=False) -> tuple[str, dict]: method get_reward (line 50) | async def get_reward(self, response, golden: dict) -> float: FILE: examples/rl/environments/math/gsm8k_agent.py class GSM8KAgent (line 25) | class GSM8KAgent(MathAgent): method __init__ (line 26) | def __init__(self, method reformat_datum (line 41) | def reformat_datum(self, datum: dict) -> dict: method get_dataset (line 48) | def get_dataset(self, validation: bool = False): method evaluation_prompts (line 51) | async def evaluation_prompts( method get_prompt (line 60) | async def get_prompt(self, validation=False) -> tuple[str, dict]: method get_reward (line 67) | async def get_reward(self, response, golden: dict) -> float: FILE: examples/rl/environments/math/math_agent.py class MathAgent (line 23) | class MathAgent(RewardOnlyAgent): method __init__ (line 24) | def __init__(self, method compute_score (line 49) | def compute_score(self, response: str, golden: dict, golden_key: str =... method make_prefix (line 120) | def make_prefix(self, problem_key: str = "problem", **kwargs) -> str: FILE: examples/rl/environments/math/openmath_agent.py class OpenMathInstructAgent (line 16) | class OpenMathInstructAgent(MathAgent): method get_dataset (line 19) | def get_dataset(self, validation: bool = False): method evaluation_prompts (line 22) | async def evaluation_prompts( method get_prompt (line 31) | async def get_prompt(self, validation=False) -> tuple[str, dict]: method get_reward (line 37) | async def get_reward(self, response, golden: dict) -> float: FILE: examples/run_simple_mcore_train_loop.py function initialize_distributed (line 32) | def initialize_distributed( function model_provider (line 60) | def model_provider() -> GPTModel: function get_train_data_iterator (line 85) | def get_train_data_iterator() -> Iterator: function forward_step_func (line 123) | def forward_step_func( function save_distributed_checkpoint (line 163) | def save_distributed_checkpoint( function load_distributed_checkpoint (line 183) | def load_distributed_checkpoint( FILE: gpt_builders.py function gpt_builder (line 28) | def gpt_builder(args, pre_process, post_process, vp_stage=None, config=N... function _get_transformer_layer_spec (line 116) | def _get_transformer_layer_spec(use_te, config): FILE: mamba_builders.py function mamba_builder (line 12) | def mamba_builder(args, pre_process, post_process, vp_stage=None, config... FILE: megatron/core/_rank_utils.py function safe_get_rank (line 12) | def safe_get_rank() -> int: function log_single_rank (line 31) | def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, *... FILE: megatron/core/activations.py function squared_relu (line 9) | def squared_relu(x: torch.Tensor) -> torch.Tensor: function quick_gelu (line 15) | def quick_gelu(x: torch.Tensor) -> torch.Tensor: function fast_gelu (line 21) | def fast_gelu(x: torch.Tensor) -> torch.Tensor: FILE: megatron/core/config.py function set_experimental_flag (line 6) | def set_experimental_flag(flag: bool): function is_experimental_enabled (line 12) | def is_experimental_enabled(): FILE: megatron/core/config_logger.py function get_config_logger_path (line 25) | def get_config_logger_path(config): function has_config_logger_enabled (line 30) | def has_config_logger_enabled(config): function get_path_count (line 40) | def get_path_count(path): function get_path_with_count (line 52) | def get_path_with_count(path): class JSONEncoderWithMcoreTypes (line 59) | class JSONEncoderWithMcoreTypes(json.JSONEncoder): method default (line 64) | def default(self, o): function log_config_to_disk (line 97) | def log_config_to_disk(config, dict_data, prefix='', rank_str=''): FILE: megatron/core/datasets/bert_dataset.py class BERTMaskedWordPieceDatasetConfig (line 17) | class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig): method __post_init__ (line 23) | def __post_init__(self) -> None: class BERTMaskedWordPieceDataset (line 30) | class BERTMaskedWordPieceDataset(MaskedWordPieceDataset): method __init__ (line 44) | def __init__( method _key_config_attributes (line 64) | def _key_config_attributes() -> List[str]: method __getitem__ (line 74) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: method _get_token_mask (line 173) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState... FILE: megatron/core/datasets/blended_dataset.py class BlendedDataset (line 24) | class BlendedDataset(torch.utils.data.Dataset): method __init__ (line 41) | def __init__( method __len__ (line 88) | def __len__(self) -> int: method __getitem__ (line 97) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: method _build_indices (line 110) | def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: FILE: megatron/core/datasets/blended_megatron_dataset_builder.py class BlendedMegatronDatasetBuilder (line 29) | class BlendedMegatronDatasetBuilder(object): method __init__ (line 45) | def __init__( method build (line 77) | def build(self) -> List[Optional[TopLevelDataset]]: method _build_blended_dataset_splits (line 136) | def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDatas... method _build_megatron_datasets_parallel (line 331) | def _build_megatron_datasets_parallel( method _build_megatron_dataset_splits (line 416) | def _build_megatron_dataset_splits( method build_generic_dataset (line 491) | def build_generic_dataset( function _get_size_per_split_per_dataset (line 553) | def _get_size_per_split_per_dataset( FILE: megatron/core/datasets/blended_megatron_dataset_config.py class BlendedMegatronDatasetConfig (line 16) | class BlendedMegatronDatasetConfig: method __post_init__ (line 99) | def __post_init__(self) -> None: function parse_and_normalize_split (line 155) | def parse_and_normalize_split(split: str) -> List[float]: function convert_split_vector_to_split_matrix (line 175) | def convert_split_vector_to_split_matrix( FILE: megatron/core/datasets/data_schedule.py class HybridCPDataLoaderWrapper (line 12) | class HybridCPDataLoaderWrapper: method __init__ (line 28) | def __init__( method __iter__ (line 51) | def __iter__(self): method get_global_seqlens (line 55) | def get_global_seqlens(self, subsample_seqlens: torch.Tensor) -> List[... method get_global_id_seqlens (line 105) | def get_global_id_seqlens(self, num_local_subsamples, offsets, seqlens... method _gid_to_src_rank (line 126) | def _gid_to_src_rank(self, gid: int, offsets: List[int]) -> int: method reroute_samples_to_hdp_ranks (line 136) | def reroute_samples_to_hdp_ranks( method unpack_batch (line 245) | def unpack_batch(self, batch): method __next__ (line 267) | def __next__(self) -> Any: FILE: megatron/core/datasets/gpt_dataset.py class GPTDatasetConfig (line 25) | class GPTDatasetConfig(BlendedMegatronDatasetConfig): method __post_init__ (line 79) | def __post_init__(self) -> None: class GPTDataset (line 101) | class GPTDataset(MegatronDataset): method __init__ (line 119) | def __init__( method numel_low_level_dataset (line 148) | def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int: method build_low_level_dataset (line 163) | def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfi... method __len__ (line 196) | def __len__(self) -> int: method __getitem__ (line 225) | def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]: method _query_document_sample_shuffle_indices (line 298) | def _query_document_sample_shuffle_indices( method _build_document_sample_shuffle_indices (line 381) | def _build_document_sample_shuffle_indices( method _get_num_tokens_per_epoch (line 609) | def _get_num_tokens_per_epoch(self) -> int: method _get_num_epochs (line 617) | def _get_num_epochs(self, num_tokens_per_epoch: int) -> int: function _build_document_index (line 640) | def _build_document_index( function _build_shuffle_index (line 674) | def _build_shuffle_index( function _get_ltor_masks_and_position_ids (line 706) | def _get_ltor_masks_and_position_ids( class MockGPTLowLevelDataset (line 783) | class MockGPTLowLevelDataset: method __init__ (line 803) | def __init__(self, tokenizer: MegatronTokenizerBase) -> None: method __len__ (line 811) | def __len__(self) -> int: method __getitem__ (line 814) | def __getitem__(self, idx: int) -> numpy.number: method get (line 821) | def get(self, idx: int, offset: int = 0, length: Optional[int] = None)... class MockGPTDataset (line 839) | class MockGPTDataset(GPTDataset): method __init__ (line 857) | def __init__( method numel_low_level_dataset (line 878) | def numel_low_level_dataset(low_level_dataset: MockGPTLowLevelDataset)... method build_low_level_dataset (line 890) | def build_low_level_dataset( # type: ignore[override] FILE: megatron/core/datasets/helpers.cpp function build_exhaustive_blending_indices (line 22) | void build_exhaustive_blending_indices(py::array_t &dataset_ind... function build_blending_indices (line 77) | void build_blending_indices(py::array_t &dataset_index, function build_sample_idx (line 145) | py::array_t build_sample_idx( function get_target_sample_len (line 251) | inline int32_t get_target_sample_len(const int32_t short_seq_ratio, function build_mapping_impl (line 269) | py::array build_mapping_impl(const py::array_t &docs_, function build_mapping (line 529) | py::array build_mapping(const py::array_t &docs_, function build_blocks_mapping_impl (line 567) | py::array build_blocks_mapping_impl(const py::array_t &docs_, function build_blocks_mapping (line 808) | py::array build_blocks_mapping(const py::array_t &docs_, function PYBIND11_MODULE (line 841) | PYBIND11_MODULE(helpers_cpp, m) FILE: megatron/core/datasets/helpers.py function build_sample_idx (line 12) | def build_sample_idx( FILE: megatron/core/datasets/indexed_dataset.py class DType (line 50) | class DType(Enum): method code_from_dtype (line 63) | def code_from_dtype(cls, value: Type[numpy.number]) -> int: method dtype_from_code (line 75) | def dtype_from_code(cls, value: int) -> Type[numpy.number]: method size (line 87) | def size(key: Union[int, Type[numpy.number]]) -> int: method optimal_dtype (line 107) | def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]: class _IndexWriter (line 122) | class _IndexWriter(object): method __init__ (line 131) | def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None: method __enter__ (line 135) | def __enter__(self) -> "_IndexWriter": method __exit__ (line 154) | def __exit__( method write (line 175) | def write( method _sequence_pointers (line 213) | def _sequence_pointers( class _IndexReader (line 233) | class _IndexReader(object): method __init__ (line 246) | def __init__( method __del__ (line 336) | def __del__(self) -> None: method __len__ (line 342) | def __len__(self) -> int: method __getitem__ (line 351) | def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Opt... class _BinReader (line 368) | class _BinReader(ABC): method read (line 372) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... class _MMapBinReader (line 389) | class _MMapBinReader(_BinReader): method __init__ (line 396) | def __init__(self, bin_path: str) -> None: method read (line 405) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... method __del__ (line 421) | def __del__(self) -> None: class _FileBinReader (line 431) | class _FileBinReader(_BinReader): method __init__ (line 438) | def __init__( method read (line 447) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... class _S3BinReader (line 500) | class _S3BinReader(_BinReader): method __init__ (line 513) | def __init__(self, bin_path: str, object_storage_config: ObjectStorage... method _extract_from_cache (line 523) | def _extract_from_cache(self, offset: int, size: int) -> bytes: method read (line 532) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... method __del__ (line 581) | def __del__(self) -> None: class _MultiStorageClientBinReader (line 586) | class _MultiStorageClientBinReader(_BinReader): method __init__ (line 595) | def __init__(self, bin_path: str, object_storage_config: ObjectStorage... method read (line 599) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ... class IndexedDataset (line 611) | class IndexedDataset(torch.utils.data.Dataset): method __init__ (line 634) | def __init__( method initialize (line 678) | def initialize( method __getstate__ (line 736) | def __getstate__(self) -> Tuple[str, bool, bool, Optional[ObjectStorag... method __setstate__ (line 752) | def __setstate__(self, state: Tuple[str, bool, bool, Optional[ObjectSt... method __del__ (line 777) | def __del__(self) -> None: method __len__ (line 782) | def __len__(self) -> int: method __getitem__ (line 790) | def __getitem__( method get (line 843) | def get( method sequence_lengths (line 872) | def sequence_lengths(self) -> numpy.ndarray: method document_indices (line 881) | def document_indices(self) -> numpy.ndarray: method get_document_indices (line 889) | def get_document_indices(self) -> numpy.ndarray: method set_document_indices (line 899) | def set_document_indices(self, document_indices: numpy.ndarray) -> None: method sequence_modes (line 910) | def sequence_modes(self) -> numpy.ndarray: method exists (line 920) | def exists(path_prefix: str) -> bool: class IndexedDatasetBuilder (line 937) | class IndexedDatasetBuilder(object): method __init__ (line 948) | def __init__( method add_item (line 965) | def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None: method add_document (line 979) | def add_document( method end_document (line 999) | def end_document(self) -> None: method add_index (line 1003) | def add_index(self, path_prefix: str) -> None: method finalize (line 1029) | def finalize(self, idx_path: str) -> None: function get_idx_path (line 1040) | def get_idx_path(path_prefix: str) -> str: function get_bin_path (line 1052) | def get_bin_path(path_prefix: str) -> str: FILE: megatron/core/datasets/masked_dataset.py class MaskedWordPieceDatasetConfig (line 23) | class MaskedWordPieceDatasetConfig(BlendedMegatronDatasetConfig): method __post_init__ (line 49) | def __post_init__(self) -> None: class MaskedWordPieceDataset (line 76) | class MaskedWordPieceDataset(MegatronDataset): method __init__ (line 102) | def __init__( method numel_low_level_dataset (line 116) | def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int: method build_low_level_dataset (line 128) | def build_low_level_dataset( method _key_config_attributes (line 144) | def _key_config_attributes() -> List[str]: method __len__ (line 160) | def __len__(self) -> int: method _build_sample_index (line 163) | def _build_sample_index( method _create_masked_lm_predictions (line 247) | def _create_masked_lm_predictions( method _get_token_mask (line 440) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState... FILE: megatron/core/datasets/megatron_dataset.py class MegatronDataset (line 23) | class MegatronDataset(ABC, torch.utils.data.Dataset): method __init__ (line 41) | def __init__( method numel_low_level_dataset (line 117) | def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: method build_low_level_dataset (line 134) | def build_low_level_dataset( method _key_config_attributes (line 155) | def _key_config_attributes() -> List[str]: method __len__ (line 167) | def __len__(self) -> int: method __getitem__ (line 176) | def __getitem__(self, idx: int) -> Dict[str, Union[torch.Tensor, numpy... FILE: megatron/core/datasets/multimodal_dataset.py class MultimodalDatasetConfig (line 12) | class MultimodalDatasetConfig(GPTDatasetConfig): method __post_init__ (line 28) | def __post_init__(self) -> None: class MockMultimodalDataset (line 35) | class MockMultimodalDataset(MockGPTDataset): method __getitem__ (line 42) | def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: FILE: megatron/core/datasets/object_storage_utils.py class ObjectStorageConfig (line 21) | class ObjectStorageConfig: class S3Client (line 46) | class S3Client(Protocol): method download_file (line 49) | def download_file(self, Bucket: str, Key: str, Filename: str) -> None: method upload_file (line 53) | def upload_file(self, Filename: str, Bucket: str, Key: str) -> None: method head_object (line 57) | def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]: method get_object (line 61) | def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, A... method close (line 65) | def close(self) -> None: function _remove_s3_prefix (line 70) | def _remove_s3_prefix(path: str) -> str: function _is_s3_path (line 82) | def _is_s3_path(path: str) -> bool: function _remove_msc_prefix (line 94) | def _remove_msc_prefix(path: str) -> str: function _is_msc_path (line 107) | def _is_msc_path(path: str) -> bool: function _s3_download_file (line 119) | def _s3_download_file(client: S3Client, s3_path: str, local_path: str) -... function _s3_object_exists (line 135) | def _s3_object_exists(client: S3Client, path: str) -> bool: function is_object_storage_path (line 158) | def is_object_storage_path(path: str) -> bool: function get_index_cache_path (line 170) | def get_index_cache_path(idx_path: str, object_storage_config: ObjectSto... function parse_s3_path (line 195) | def parse_s3_path(path: str) -> Tuple[str, str]: function get_object_storage_access (line 215) | def get_object_storage_access(path: str) -> str: function dataset_exists (line 220) | def dataset_exists(path_prefix: str, idx_path: str, bin_path: str) -> bool: function cache_index_file (line 243) | def cache_index_file(remote_path: str, local_path: str) -> None: FILE: megatron/core/datasets/t5_dataset.py class T5MaskedWordPieceDatasetConfig (line 22) | class T5MaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig): method __post_init__ (line 36) | def __post_init__(self) -> None: class T5MaskedWordPieceDataset (line 48) | class T5MaskedWordPieceDataset(MaskedWordPieceDataset): method __init__ (line 67) | def __init__( method _key_config_attributes (line 85) | def _key_config_attributes() -> List[str]: method _build_b1ss_attention_mask (line 96) | def _build_b1ss_attention_mask( method config_attention_mask (line 128) | def config_attention_mask( method __getitem__ (line 225) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: method _get_token_mask (line 329) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState... FILE: megatron/core/datasets/utils.py class Split (line 14) | class Split(Enum): function compile_helpers (line 20) | def compile_helpers(): function normalize (line 33) | def normalize(weights: List[float]) -> List[float]: function get_blend_from_list (line 49) | def get_blend_from_list( FILE: megatron/core/dist_checkpointing/core.py class CheckpointingException (line 15) | class CheckpointingException(Exception): class CheckpointingConfig (line 22) | class CheckpointingConfig: function check_is_distributed_checkpoint (line 38) | def check_is_distributed_checkpoint(checkpoint_dir): function maybe_load_config (line 50) | def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConf... function save_config (line 76) | def save_config(config: CheckpointingConfig, checkpoint_dir: str): FILE: megatron/core/dist_checkpointing/dict_utils.py function extract_matching_values (line 18) | def extract_matching_values( function diff (line 69) | def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]: function inspect_types (line 138) | def inspect_types(x: Any, prefix: Tuple = (), indent: int = 4): function nested_values (line 166) | def nested_values(x: Union[dict, list]): function nested_items_iter (line 176) | def nested_items_iter(x: Union[dict, list]): function dict_map (line 186) | def dict_map(f: Callable, d: dict): function dict_map_with_key (line 192) | def dict_map_with_key(f: Callable, d: dict): function dict_list_map_inplace (line 198) | def dict_list_map_inplace(f: Callable[[U], V], x: Union[Dict, List, U]): function dict_list_map_outplace (line 210) | def dict_list_map_outplace(f: Callable[[U], V], x: Union[Dict, List, U])... function merge (line 220) | def merge(x1: Union[dict, list], x2: Union[dict, list], key: Tuple[Union... function map_reduce (line 244) | def map_reduce( FILE: megatron/core/dist_checkpointing/exchange_utils.py function is_float8tensor (line 32) | def is_float8tensor(tensor: torch.Tensor) -> bool: class ShardDistribution (line 40) | class ShardDistribution(NamedTuple): function _shard_size (line 63) | def _shard_size(sh_ten: ShardedTensor): function _get_empty_tensor_for_exchange (line 69) | def _get_empty_tensor_for_exchange( function distribute_shards_to_ranks (line 118) | def distribute_shards_to_ranks( function determine_main_replica_uniform_distribution (line 174) | def determine_main_replica_uniform_distribution( function exchange_loaded_tensors_gather_rounds (line 257) | def exchange_loaded_tensors_gather_rounds( function exchange_loaded_tensors_gather_object (line 375) | def exchange_loaded_tensors_gather_object( function exchange_loaded_objects_gather_object (line 421) | def exchange_loaded_objects_gather_object( function exchange_loaded_tensors_broadcast (line 454) | def exchange_loaded_tensors_broadcast( function exchange_by_distribution (line 538) | def exchange_by_distribution( FILE: megatron/core/dist_checkpointing/mapping.py class ShardedBase (line 34) | class ShardedBase(ABC): method validate_metadata_integrity (line 42) | def validate_metadata_integrity(self): method without_data (line 46) | def without_data(self) -> "ShardedBase": class ShardedTensor (line 52) | class ShardedTensor(ShardedBase): method __post_init__ (line 93) | def __post_init__(self): method validate_metadata_integrity (line 96) | def validate_metadata_integrity(self) -> None: method has_regular_grid (line 137) | def has_regular_grid(self): method global_slice (line 141) | def global_slice(self) -> Tuple[Union[int, slice], ...]: method local_chunk_offset_in_global (line 159) | def local_chunk_offset_in_global(self) -> Tuple[int, ...]: method max_allowed_chunks (line 172) | def max_allowed_chunks(self) -> Tuple[int, ...]: method without_data (line 186) | def without_data(self): method from_rank_offsets (line 190) | def from_rank_offsets( method init_data (line 247) | def init_data(self, device: Union[str, torch.device], init_fn=torch.em... method narrow (line 262) | def narrow(self, dim: int, start: int, length: int) -> List["ShardedTe... function is_main_replica (line 322) | def is_main_replica(replica_id: ReplicaId): class LocalNonpersistentObject (line 342) | class LocalNonpersistentObject: method __init__ (line 351) | def __init__(self, obj): method unwrap (line 354) | def unwrap(self): class ShardedObject (line 360) | class ShardedObject(ShardedBase): method __post_init__ (line 384) | def __post_init__(self): method validate_metadata_integrity (line 387) | def validate_metadata_integrity(self): method without_data (line 393) | def without_data(self): method unique_key (line 397) | def unique_key(self): method __str__ (line 405) | def __str__(self): method empty_from_unique_key (line 409) | def empty_from_unique_key(cls, unique_key, replica_id: ReplicaId = 0) ... class ShardedTensorFactory (line 438) | class ShardedTensorFactory(ShardedBase): method build (line 471) | def build(self): method validate_metadata_integrity (line 475) | def validate_metadata_integrity(self): method without_data (line 479) | def without_data(self): function apply_factories (line 483) | def apply_factories(sharded_state_dict: ShardedStateDict): function apply_factory_merges (line 502) | def apply_factory_merges( FILE: megatron/core/dist_checkpointing/optimizer.py function get_optim_param_to_id_map (line 35) | def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Param... function get_param_id_to_sharded_param_map (line 45) | def get_param_id_to_sharded_param_map( function make_sharded_optimizer_tensor (line 83) | def make_sharded_optimizer_tensor( function optim_state_to_sharding_state (line 111) | def optim_state_to_sharding_state( FILE: megatron/core/dist_checkpointing/serialization.py function load (line 61) | def load( function load_common_state_dict (line 174) | def load_common_state_dict(checkpoint_dir: Union[str, Path]) -> StateDict: function load_tensors_metadata (line 196) | def load_tensors_metadata( function load_sharded_metadata (line 227) | def load_sharded_metadata( function load_plain_tensors (line 270) | def load_plain_tensors(checkpoint_dir: str) -> StateDict: function load_content_metadata (line 287) | def load_content_metadata( function remove_sharded_tensors (line 308) | def remove_sharded_tensors(checkpoint_dir: str, key_prefix: str): function save (line 314) | def save( function get_default_save_sharded_strategy (line 442) | def get_default_save_sharded_strategy( function get_default_save_common_strategy (line 449) | def get_default_save_common_strategy( function get_default_load_sharded_strategy (line 456) | def get_default_load_sharded_strategy( FILE: megatron/core/dist_checkpointing/state_dict_utils.py function save_preprocess (line 20) | def save_preprocess( function load_preprocess (line 62) | def load_preprocess(sharded_state_dict: ShardedStateDict): function filter_out_empty_flatten_tensor (line 96) | def filter_out_empty_flatten_tensor(sharded_state_dict: Union[dict, list]): FILE: megatron/core/dist_checkpointing/strategies/async_utils.py function _set_process_qos (line 28) | def _set_process_qos(cpu_priority: int, io_priority: Optional[int]) -> N... function _disable_gc (line 85) | def _disable_gc(): class AsyncRequest (line 97) | class AsyncRequest(NamedTuple): method add_finalize_fn (line 123) | def add_finalize_fn(self, fn: Callable) -> None: method execute_sync (line 137) | def execute_sync(self) -> None: method freeze (line 163) | def freeze(self) -> 'AsyncRequest': class AsyncCaller (line 173) | class AsyncCaller(ABC): method schedule_async_call (line 180) | def schedule_async_call(self, async_req: AsyncRequest) -> None: method is_current_async_call_done (line 193) | def is_current_async_call_done(self, blocking: bool, no_dist: bool) ->... method sync_all_async_calls (line 213) | def sync_all_async_calls(self, is_alive: int) -> bool: method close (line 228) | def close(self, abort=False): method __del__ (line 232) | def __del__(self): class TemporalAsyncCaller (line 236) | class TemporalAsyncCaller(AsyncCaller): method __init__ (line 242) | def __init__(self): method schedule_async_call (line 247) | def schedule_async_call(self, async_req: AsyncRequest) -> None: method is_current_async_call_done (line 283) | def is_current_async_call_done(self, blocking: bool = False, no_dist: ... method close (line 315) | def close(self, abort=False): method __del__ (line 343) | def __del__(self): class PersistentAsyncCaller (line 347) | class PersistentAsyncCaller(AsyncCaller): method __init__ (line 358) | def __init__(self): method _get_process (line 365) | def _get_process( method schedule_async_call (line 395) | def schedule_async_call(self, async_req: AsyncRequest) -> None: method is_current_async_call_done (line 435) | def is_current_async_call_done(self, blocking: bool = False, no_dist: ... method close (line 488) | def close(self, abort=False): method __del__ (line 517) | def __del__(self): method async_loop (line 522) | def async_loop( class _ActiveAsyncRequest (line 599) | class _ActiveAsyncRequest(NamedTuple): class AsyncCallsQueue (line 614) | class AsyncCallsQueue: method __init__ (line 623) | def __init__(self, persistent: bool = False): method _get_async_caller (line 628) | def _get_async_caller(self): method warmup_persistent_caller (line 636) | def warmup_persistent_caller( method schedule_async_request (line 646) | def schedule_async_request(self, async_request: AsyncRequest) -> int: method maybe_finalize_async_calls (line 670) | def maybe_finalize_async_calls(self, blocking=False, no_dist=False) ->... method get_num_unfinalized_calls (line 707) | def get_num_unfinalized_calls(self): method close (line 711) | def close(self, abort=False): FILE: megatron/core/dist_checkpointing/strategies/base.py class StrategyAction (line 15) | class StrategyAction(Enum): function get_default_strategy (line 29) | def get_default_strategy(action: StrategyAction, backend: str, version: ... function register_default_strategy (line 50) | def register_default_strategy( class LoadStrategyBase (line 67) | class LoadStrategyBase(ABC): method check_backend_compatibility (line 72) | def check_backend_compatibility(self, loaded_backend): method check_version_compatibility (line 77) | def check_version_compatibility(self, loaded_version): method can_handle_sharded_objects (line 82) | def can_handle_sharded_objects(self): class SaveStrategyBase (line 87) | class SaveStrategyBase(ABC): method __init__ (line 91) | def __init__(self, backend: str, version: int): method can_handle_sharded_objects (line 96) | def can_handle_sharded_objects(self): method __str__ (line 100) | def __str__(self): class LoadCommonStrategy (line 104) | class LoadCommonStrategy(LoadStrategyBase): method load_common (line 108) | def load_common(self, checkpoint_dir: Union[str, Path]): method load_sharded_objects (line 113) | def load_sharded_objects( method load_sharded_metadata (line 119) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]) -> S... class LoadShardedStrategy (line 126) | class LoadShardedStrategy(LoadStrategyBase): method load (line 130) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U... method load_tensors_metadata (line 135) | def load_tensors_metadata(self, checkpoint_dir: Union[str, Path]): method load_sharded_metadata (line 149) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]): method remove_sharded_tensors (line 164) | def remove_sharded_tensors(self, checkpoint_dir: Union[str, Path], key... class SaveCommonStrategy (line 169) | class SaveCommonStrategy(SaveStrategyBase): method save_common (line 173) | def save_common(self, common_state_dict: StateDict, checkpoint_dir: Un... method save_sharded_objects (line 177) | def save_sharded_objects( class SaveShardedStrategy (line 184) | class SaveShardedStrategy(SaveStrategyBase): method save (line 188) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U... class AsyncSaveShardedStrategy (line 193) | class AsyncSaveShardedStrategy(SaveShardedStrategy): method async_save (line 197) | def async_save( method save (line 212) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U... FILE: megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py class CachedMetadataFileSystemReader (line 11) | class CachedMetadataFileSystemReader(FileSystemReader): method __init__ (line 24) | def __init__(self, path: Union[str, os.PathLike], cache_metadata: bool... method read_metadata (line 34) | def read_metadata(self) -> Metadata: method clear_metadata_cache (line 49) | def clear_metadata_cache(cls): FILE: megatron/core/dist_checkpointing/strategies/checkpointable.py class CheckpointableShardedTensor (line 15) | class CheckpointableShardedTensor(torch.Tensor): method __new__ (line 21) | def __new__(cls, data: torch.Tensor, sh_ten: ShardedTensor): method __init__ (line 24) | def __init__(self, data: torch.Tensor, sh_ten: ShardedTensor): method __create_write_items__ (line 28) | def __create_write_items__( method __create_chunk_list__ (line 59) | def __create_chunk_list__(self) -> list[ChunkStorageMetadata]: method __get_tensor_shard__ (line 71) | def __get_tensor_shard__(self, index: MetadataIndex) -> torch.Tensor: method from_sh_ten (line 83) | def from_sh_ten(cls, sh_ten: ShardedTensor) -> 'CheckpointableShardedT... method __torch_dispatch__ (line 96) | def __torch_dispatch__(cls, func, types, args, kwargs=None): method __repr__ (line 103) | def __repr__(self): class LocalShardsContainer (line 107) | class LocalShardsContainer(torch.Tensor): method __new__ (line 117) | def __new__(cls, local_shards: list[torch.Tensor]) -> "LocalShardsCont... method __init__ (line 122) | def __init__(self, local_shards: list[torch.Tensor]): method __torch_dispatch__ (line 129) | def __torch_dispatch__(cls, func, types, args=(), kwargs=None): method __create_write_items__ (line 136) | def __create_write_items__( method __create_chunk_list__ (line 155) | def __create_chunk_list__(self) -> list[ChunkStorageMetadata]: method __get_tensor_shard__ (line 165) | def __get_tensor_shard__(self, index: MetadataIndex) -> torch.Tensor: method __repr__ (line 195) | def __repr__(self): FILE: megatron/core/dist_checkpointing/strategies/common.py function register_default_common_strategies (line 29) | def register_default_common_strategies(): class TorchCommonSaveStrategy (line 37) | class TorchCommonSaveStrategy(SaveCommonStrategy): method save_common (line 40) | def save_common(self, common_state_dict: StateDict, checkpoint_dir: Un... method save_sharded_objects (line 50) | def save_sharded_objects( method can_handle_sharded_objects (line 66) | def can_handle_sharded_objects(self): class TorchCommonLoadStrategy (line 71) | class TorchCommonLoadStrategy(LoadCommonStrategy): method load_common (line 74) | def load_common(self, checkpoint_dir: Union[str, Path]): method load_sharded_objects (line 100) | def load_sharded_objects( method load_sharded_metadata (line 153) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]) -> S... method can_handle_sharded_objects (line 185) | def can_handle_sharded_objects(self): method check_backend_compatibility (line 189) | def check_backend_compatibility(self, loaded_version): method check_version_compatibility (line 192) | def check_version_compatibility(self, loaded_version): FILE: megatron/core/dist_checkpointing/strategies/filesystem_async.py function get_write_results_queue (line 53) | def get_write_results_queue(mp_mode: str = 'spawn') -> mp.Queue: class FileSystemWriterAsync (line 69) | class FileSystemWriterAsync(FileSystemWriter): method __init__ (line 90) | def __init__( method prepare_write_data (line 114) | def prepare_write_data(self, plan: SavePlan, planner: SavePlanner) -> ... method get_save_function_and_args (line 201) | def get_save_function_and_args(self) -> Tuple[Optional[Callable], Opti... method preload_tensors (line 222) | def preload_tensors(write_buckets: List[WriteBucket], non_blocking=Tru... method write_preloaded_data_multithread (line 248) | def write_preloaded_data_multithread( method write_preloaded_data (line 359) | def write_preloaded_data( method write_data (line 438) | def write_data(self, plan: SavePlan, planner: SavePlanner) -> Future[L... method retrieve_write_results (line 442) | def retrieve_write_results(self) -> Union[List[WriteResult], WRAPPED_E... method prepare_decentralized_global_plan (line 478) | def prepare_decentralized_global_plan(self, local_plan: SavePlan) -> S... method finish (line 493) | def finish(self, metadata: Metadata, results: List[List[WriteResult]])... method prepare_local_plan (line 518) | def prepare_local_plan(self, plan: SavePlan) -> SavePlan: method checkpoint_id (line 532) | def checkpoint_id(self) -> Union[str, os.PathLike]: method validate_checkpoint_id (line 539) | def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]... function _split_by_size_and_type (line 554) | def _split_by_size_and_type(bins: int, items: List[WriteItem]) -> List[L... function _split_by_separation_hint (line 600) | def _split_by_separation_hint( function _item_size (line 631) | def _item_size(item: WriteItem) -> int: function _process_memory (line 653) | def _process_memory() -> int: FILE: megatron/core/dist_checkpointing/strategies/fully_parallel.py class FullyParallelSaveStrategyWrapper (line 48) | class FullyParallelSaveStrategyWrapper(AsyncSaveShardedStrategy): method __init__ (line 73) | def __init__( method async_save (line 88) | def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_... method save (line 96) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P... method apply_saving_parallelization (line 100) | def apply_saving_parallelization(self, sharded_state_dict: ShardedStat... method can_handle_sharded_objects (line 137) | def can_handle_sharded_objects(self): class FullyParallelLoadStrategyWrapper (line 141) | class FullyParallelLoadStrategyWrapper(LoadShardedStrategy): method __init__ (line 167) | def __init__( method load (line 188) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P... method _defer_loading_sharded_objects (line 293) | def _defer_loading_sharded_objects( method _defer_loading_sharded_tensors (line 304) | def _defer_loading_sharded_tensors( method fill_in_deferred_sharded_objects (line 317) | def fill_in_deferred_sharded_objects( method fill_in_deferred_sharded_tensors (line 336) | def fill_in_deferred_sharded_tensors( method apply_loading_parallelization (line 354) | def apply_loading_parallelization( method can_handle_sharded_objects (line 392) | def can_handle_sharded_objects(self): method load_tensors_metadata (line 395) | def load_tensors_metadata(self, checkpoint_dir: Path): method load_sharded_metadata (line 398) | def load_sharded_metadata(self, checkpoint_dir: Path): method check_backend_compatibility (line 401) | def check_backend_compatibility(self, loaded_version): method check_version_compatibility (line 404) | def check_version_compatibility(self, loaded_version): function distribute_main_replicas_with_precomputed_distribution (line 408) | def distribute_main_replicas_with_precomputed_distribution( function _defer_loading_sharded_items (line 465) | def _defer_loading_sharded_items( function _fill_in_deferred_sharded_items (line 502) | def _fill_in_deferred_sharded_items( FILE: megatron/core/dist_checkpointing/strategies/state_dict_saver.py function _compare_dataclasses (line 27) | def _compare_dataclasses(obj1, obj2): function save_state_dict_async_plan (line 41) | def save_state_dict_async_plan( function verify_global_md_reuse (line 171) | def verify_global_md_reuse( function save_state_dict_async_finalize (line 213) | def save_state_dict_async_finalize( FILE: megatron/core/dist_checkpointing/strategies/torch.py class MCoreMetadata (line 87) | class MCoreMetadata: class MCoreSavePlan (line 94) | class MCoreSavePlan: function register_default_torch_strategies (line 100) | def register_default_torch_strategies(): function flatten_state_dict (line 113) | def flatten_state_dict( function sharded_tensor_to_torch_sharded_tensor (line 141) | def sharded_tensor_to_torch_sharded_tensor( function mcore_to_pyt_state_dict (line 248) | def mcore_to_pyt_state_dict( function _unwrap_pyt_sharded_tensor (line 338) | def _unwrap_pyt_sharded_tensor( function _replace_state_dict_keys_with_sharded_keys (line 363) | def _replace_state_dict_keys_with_sharded_keys( function _replace_sharded_keys_with_state_dict_keys (line 380) | def _replace_sharded_keys_with_state_dict_keys( function _restore_dict_types (line 395) | def _restore_dict_types(x: Union[dict, list, Any], keys_template: Union[... class MCoreSavePlanner (line 410) | class MCoreSavePlanner(DefaultSavePlanner): method __init__ (line 421) | def __init__( method create_local_plan (line 442) | def create_local_plan(self) -> SavePlan: method create_decentralized_global_plan (line 462) | def create_decentralized_global_plan(self, local_plan: SavePlan) -> Sa... method transform_object (line 479) | def transform_object(self, write_item: WriteItem, object: Any): class MCoreLoadPlanner (line 484) | class MCoreLoadPlanner(DefaultLoadPlanner): method __init__ (line 491) | def __init__( method _validate_global_shapes (line 503) | def _validate_global_shapes(self, metadata, sharded_tensors): method _temporarily_bypass_shape_validation (line 521) | def _temporarily_bypass_shape_validation(self): method create_local_plan (line 545) | def create_local_plan(self) -> LoadPlan: method resolve_tensor (line 554) | def resolve_tensor(self, read_item: ReadItem): method commit_tensor (line 576) | def commit_tensor(self, read_item: ReadItem, tensor: torch.Tensor) -> ... class TorchDistSaveShardedStrategy (line 589) | class TorchDistSaveShardedStrategy(AsyncSaveShardedStrategy): method __init__ (line 597) | def __init__( method async_save (line 643) | def async_save( method _get_save_and_finalize_callbacks (line 745) | def _get_save_and_finalize_callbacks(self, writer, save_state_dict_ret... method can_handle_sharded_objects (line 754) | def can_handle_sharded_objects(self): function _get_filesystem_reader (line 758) | def _get_filesystem_reader( class TorchDistLoadShardedStrategy (line 771) | class TorchDistLoadShardedStrategy(LoadShardedStrategy): method __init__ (line 774) | def __init__(self, cache_metadata: bool = False): method load (line 779) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P... method load_tensors_metadata (line 835) | def load_tensors_metadata(self, checkpoint_dir: Path, metadata: Metada... method load_sharded_metadata (line 853) | def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateD... method remove_sharded_tensors (line 868) | def remove_sharded_tensors(self, checkpoint_dir: str, key_prefix: str): method can_handle_sharded_objects (line 952) | def can_handle_sharded_objects(self): method check_backend_compatibility (line 955) | def check_backend_compatibility(self, loaded_version): method check_version_compatibility (line 958) | def check_version_compatibility(self, loaded_version): FILE: megatron/core/dist_checkpointing/tensor_aware_state_dict.py class MCoreTensorAwareStateDict (line 48) | class MCoreTensorAwareStateDict(TensorAwareStateDict): method _validate_params (line 61) | def _validate_params(algo): method _get_distribution (line 68) | def _get_distribution( method _remove_redundant_data (line 86) | def _remove_redundant_data( method from_state_dict (line 101) | def from_state_dict( method is_hollow (line 155) | def is_hollow(self): method _sharded_tensors (line 162) | def _sharded_tensors(self): method tensors (line 179) | def tensors(self) -> Iterator[torch.Tensor]: method common_state_dict (line 187) | def common_state_dict(self) -> Dict: method pop_tensors (line 193) | def pop_tensors(self) -> List[torch.Tensor]: method insert_tensors (line 213) | def insert_tensors(self, tensor_data: Iterable[torch.Tensor]): method init_tensors (line 230) | def init_tensors(self): method copy_tensors_to_cpu (line 245) | def copy_tensors_to_cpu(self, non_blocking=False): method restore_tensor_device (line 264) | def restore_tensor_device(self, non_blocking=True): method _insert_sharded_data (line 276) | def _insert_sharded_data( method to_state_dict (line 325) | def to_state_dict( FILE: megatron/core/dist_checkpointing/utils.py function zip_strict (line 25) | def zip_strict(*args): function _sharded_tensor_shard_id (line 37) | def _sharded_tensor_shard_id(sharded_tensor: ShardedTensor) -> _ShardId: function _sharded_object_id (line 55) | def _sharded_object_id(sharded_object: ShardedObject) -> _ShardId: function extract_sharded_tensors (line 68) | def extract_sharded_tensors( function extract_sharded_tensors_and_factories (line 86) | def extract_sharded_tensors_and_factories( function extract_sharded_tensors_or_nonpersistent (line 107) | def extract_sharded_tensors_or_nonpersistent( function extract_sharded_base (line 129) | def extract_sharded_base( function extract_nonpersistent (line 145) | def extract_nonpersistent( function add_prefix_for_sharding (line 165) | def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix... function replace_prefix_for_sharding (line 184) | def replace_prefix_for_sharding( function apply_prefix_mapping (line 210) | def apply_prefix_mapping(sharded_state_dict: ShardedStateDict, prefix_ma... function force_all_tensors_to_non_fp8 (line 236) | def force_all_tensors_to_non_fp8(sharded_state_dict: ShardedStateDict): function logger_stack (line 255) | def logger_stack(name: Optional[str] = None, current_logger: Optional[lo... function debug_time (line 293) | def debug_time( function debug_msg (line 318) | def debug_msg(msg: str): function _clean_metadata_for_serialization (line 335) | def _clean_metadata_for_serialization(metadata: dict) -> dict: FILE: megatron/core/dist_checkpointing/validation.py class StrictHandling (line 44) | class StrictHandling(Enum): method requires_explicit_ckpt_mismatch_check (line 86) | def requires_explicit_ckpt_mismatch_check(val: "StrictHandling") -> bool: method requires_global_app_metadata (line 91) | def requires_global_app_metadata(val: "StrictHandling") -> bool: method requires_returning_mismatch_keys (line 101) | def requires_returning_mismatch_keys(val: "StrictHandling") -> bool: function parse_strict_flag (line 106) | def parse_strict_flag(strict: Union[str, StrictHandling]) -> StrictHandl... function validate_integrity_and_strict_load (line 124) | def validate_integrity_and_strict_load( function verify_checkpoint_and_load_strategy (line 202) | def verify_checkpoint_and_load_strategy( function adjust_non_strict_load (line 268) | def adjust_non_strict_load( function _determine_missing_and_unexpected_keys (line 289) | def _determine_missing_and_unexpected_keys( function maybe_report_missing_and_unexpected_keys (line 337) | def maybe_report_missing_and_unexpected_keys( function _validate_common_state_dict (line 381) | def _validate_common_state_dict(common_state_dict: CommonStateDict) -> N... function validate_sharding_integrity (line 415) | def validate_sharding_integrity( function _validate_sharding_for_key (line 458) | def _validate_sharding_for_key( function _compute_shards_access (line 500) | def _compute_shards_access(rank_sharding): function _validate_objects_for_key (line 510) | def _validate_objects_for_key(sharded_objects: List[ShardedObject]) -> L... function determine_global_metadata (line 530) | def determine_global_metadata( function validate_sharded_objects_handling (line 547) | def validate_sharded_objects_handling( FILE: megatron/core/distributed/data_parallel_base.py class _BaseDataParallel (line 11) | class _BaseDataParallel(MegatronModule): method __init__ (line 14) | def __init__(self, config: TransformerConfig, module: torch.nn.Module): method forward (line 18) | def forward(self, *inputs, **kwargs): method no_sync (line 25) | def no_sync(self): method start_grad_sync (line 34) | def start_grad_sync(self, *unused): method scale_gradients (line 45) | def scale_gradients(self, scaling_factor: float) -> None: method finish_grad_sync (line 49) | def finish_grad_sync(self): method zero_grad_buffer (line 60) | def zero_grad_buffer(self): method broadcast_params (line 67) | def broadcast_params(self): method state_dict (line 73) | def state_dict(self, prefix='', keep_vars=False, destination=None): method state_dict_for_save_checkpoint (line 84) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 90) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/core/distributed/distributed_data_parallel.py class DistributedDataParallel (line 22) | class DistributedDataParallel(_BaseDataParallel): method __init__ (line 41) | def __init__( method enable_forward_pre_hook (line 359) | def enable_forward_pre_hook(self): method disable_forward_pre_hook (line 371) | def disable_forward_pre_hook(self, param_sync: bool = True): method _make_forward_pre_hook (line 388) | def _make_forward_pre_hook(self): method _make_backward_post_hook (line 424) | def _make_backward_post_hook(self, param: torch.nn.Parameter): method no_sync (line 455) | def no_sync(self): method start_param_sync (line 467) | def start_param_sync(self, *unused, force_sync: bool = False, force_di... method start_grad_sync (line 525) | def start_grad_sync(self, *unused): method finish_grad_sync (line 537) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False): method free_overlap_buffers (line 549) | def free_overlap_buffers(self): method scale_gradients (line 554) | def scale_gradients(self, scaling_factor: float): method zero_grad_buffer (line 559) | def zero_grad_buffer(self): method broadcast_params (line 575) | def broadcast_params(self): method offload_grad_buffers (line 592) | def offload_grad_buffers(self, synchronize: bool = True, empty_cache: ... method restore_grad_buffers (line 613) | def restore_grad_buffers(self, synchronize: bool = True) -> None: FILE: megatron/core/distributed/distributed_data_parallel_config.py class DistributedDataParallelConfig (line 10) | class DistributedDataParallelConfig: method __post_init__ (line 194) | def __post_init__(self): FILE: megatron/core/distributed/finalize_model_grads.py function _get_main_grad_attr (line 34) | def _get_main_grad_attr(param: torch.nn.Parameter): function _unshard_if_dtensor (line 40) | def _unshard_if_dtensor(tensor: Union[torch.Tensor, "DTensor"]) -> torch... function _reshard_if_dtensor (line 60) | def _reshard_if_dtensor( function _allreduce_conditional_embedding_grads (line 89) | def _allreduce_conditional_embedding_grads( function _get_shared_word_embedding_weight (line 132) | def _get_shared_word_embedding_weight( function _get_position_embedding_weight (line 151) | def _get_position_embedding_weight(model_module: torch.nn.Module) -> tor... function _allreduce_word_embedding_grads (line 164) | def _allreduce_word_embedding_grads( function _allreduce_embedding_grad (line 204) | def _allreduce_embedding_grad( function _allreduce_position_embedding_grads (line 262) | def _allreduce_position_embedding_grads( function reset_model_temporary_tensors (line 278) | def reset_model_temporary_tensors(config: TransformerConfig, model: List... function _update_router_expert_bias (line 293) | def _update_router_expert_bias(model: List[torch.nn.Module], config: Tra... function _allreduce_non_tensor_model_parallel_grads (line 322) | def _allreduce_non_tensor_model_parallel_grads( function finalize_model_grads (line 400) | def finalize_model_grads( FILE: megatron/core/distributed/fsdp/mcore_fsdp_adapter.py class FullyShardedDataParallel (line 62) | class FullyShardedDataParallel(_BaseDataParallel): method __init__ (line 67) | def __init__( method load_state_dict (line 163) | def load_state_dict(self, state_dict, strict=True): method _fix_tensor_parallel_attributes (line 185) | def _fix_tensor_parallel_attributes(self, module): method _init_dist_index (line 223) | def _init_dist_index(self, pg_collection): method stop_communication (line 344) | def stop_communication(self): method sync_rng_states_across_tp_group (line 351) | def sync_rng_states_across_tp_group(self): function _get_hsdp_tp_mesh (line 366) | def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group, ep_siz... function _get_dp_tp_mesh (line 435) | def _get_dp_tp_mesh(dp_cp_group, tp_group, ep_size=1): function _check_mesh_ranks_and_group_ranks_are_consistent (line 477) | def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_r... function _get_rng_state_dict (line 491) | def _get_rng_state_dict(): function _load_rng_state_dict (line 502) | def _load_rng_state_dict(rng_state_dict): FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py class DistributedDataParallelConfig (line 10) | class DistributedDataParallelConfig: method __post_init__ (line 148) | def __post_init__(self): FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py class ShardingStrategy (line 43) | class ShardingStrategy(IntEnum): function experimental_api (line 63) | def experimental_api(func: Callable) -> Callable: function fully_shard_model (line 75) | def fully_shard_model( function fully_shard_optimizer (line 408) | def fully_shard_optimizer( function fully_shard (line 614) | def fully_shard( FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py class TrainingState (line 61) | class TrainingState(Enum): class MegatronFSDP (line 76) | class MegatronFSDP(torch.nn.Module): method __init__ (line 171) | def __init__( method _check_module_parameter_types (line 319) | def _check_module_parameter_types(self): method _init_fsdp_param_and_grad_buffer (line 333) | def _init_fsdp_param_and_grad_buffer(self): method _import_class_from_path (line 415) | def _import_class_from_path(self, class_path: str): method all_gather_and_wait_parameters_ready (line 422) | def all_gather_and_wait_parameters_ready( method _register_fsdp_hooks (line 477) | def _register_fsdp_hooks(self, root_module): method no_sync (line 1059) | def no_sync(self): method sync (line 1073) | def sync(self): method set_model_auto_sync (line 1084) | def set_model_auto_sync(self, sync_model: bool = True): method get_distributed_index (line 1127) | def get_distributed_index(self) -> FSDPDistributedIndex: method mixed_precision_context (line 1135) | def mixed_precision_context(self, mixed_precision_policy: MixedPrecisi... method reset_mixed_precision_policy (line 1147) | def reset_mixed_precision_policy(self, mixed_precision_policy: MixedPr... method start_param_sync (line 1167) | def start_param_sync(self, *unused, force_sync: bool = False, force_di... method start_grad_sync (line 1205) | def start_grad_sync(self, *unused): method synchronize_param_gather (line 1222) | def synchronize_param_gather(self): method synchronize_gradient_reduce (line 1229) | def synchronize_gradient_reduce(self): method attach_grad_to_optimizer_state (line 1242) | def attach_grad_to_optimizer_state(self): method finish_grad_sync (line 1249) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False): method _replace_param_with_distributed_if_needed (line 1281) | def _replace_param_with_distributed_if_needed(self): method _replace_param_with_raw_if_needed (line 1300) | def _replace_param_with_raw_if_needed(self): method _reestablish_shared_weights (line 1314) | def _reestablish_shared_weights(self, old_params, new_params): method scale_gradients (line 1348) | def scale_gradients(self, scaling_factor: float): method zero_grad_buffer (line 1352) | def zero_grad_buffer(self): method install_optimized_model_weights (line 1362) | def install_optimized_model_weights(self): method broadcast_params (line 1369) | def broadcast_params(self): method forward (line 1385) | def forward(self, *inputs, **kwargs): class RegisterFSDPBackwardFunction (line 1396) | class RegisterFSDPBackwardFunction(torch.autograd.Function): method forward (line 1404) | def forward(ctx, post_backward, *inputs: torch.Tensor): method backward (line 1412) | def backward(ctx, *grads: torch.Tensor): function _replace_module_parameter (line 1420) | def _replace_module_parameter(module, name, new_param): FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py function local_multi_tensor_applier (line 121) | def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): function local_multi_tensor_scale (line 125) | def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): function _multi_tensor_copy_this_to_that (line 133) | def _multi_tensor_copy_this_to_that( function is_te_min_version (line 162) | def is_te_min_version(vers, check_equality=True): function is_float8tensor (line 173) | def is_float8tensor(tensor: torch.Tensor) -> bool: function is_blockwise_float8tensor (line 178) | def is_blockwise_float8tensor(tensor: torch.Tensor) -> bool: function fp8_need_transpose_data (line 183) | def fp8_need_transpose_data(tensor: torch.Tensor) -> bool: function fp8_need_transpose_data_for_meta_device_init (line 188) | def fp8_need_transpose_data_for_meta_device_init(module: TransformerEngi... function fp8_discard_transpose_cache (line 193) | def fp8_discard_transpose_cache(tensor: torch.Tensor) -> None: function fp8_create_transpose_cache (line 204) | def fp8_create_transpose_cache(tensors: List[torch.Tensor]) -> None: function _fp8_create_transpose_cache_fallback (line 212) | def _fp8_create_transpose_cache_fallback(tensors: List[torch.Tensor]) ->... function fp8_set_raw_data (line 223) | def fp8_set_raw_data(tensor: torch.Tensor, data: torch.Tensor, set_trans... function fp8_get_raw_data (line 244) | def fp8_get_raw_data(tensor: torch.Tensor, get_transpose: bool = False) ... function fp8_dequantize (line 257) | def fp8_dequantize(tensor: torch.Tensor) -> torch.Tensor: function fp8_quantize (line 266) | def fp8_quantize( function _fp8_quantize_fallback (line 288) | def _fp8_quantize_fallback( function get_quantized_model_init_context_cls (line 353) | def get_quantized_model_init_context_cls(): class MixedPrecisionPolicy (line 366) | class MixedPrecisionPolicy: FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py function _p_assert (line 107) | def _p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> ... function _alloc_storage (line 118) | def _alloc_storage(tensor: torch.Tensor, size: torch.Size) -> None: function _free_storage (line 138) | def _free_storage(tensor: torch.Tensor): class MultiGroupUBRAllocator (line 170) | class MultiGroupUBRAllocator: method __init__ (line 200) | def __init__(self, pool, groups): # torch.cuda.MemPool # torch.distr... method __enter__ (line 206) | def __enter__(self): method __exit__ (line 218) | def __exit__(self, *args): class BucketingPolicy (line 232) | class BucketingPolicy: function _pad (line 252) | def _pad(number_to_be_padded: int, divisor: int) -> int: function build_data_parallel_buffer_index (line 256) | def build_data_parallel_buffer_index( function _get_dp_buffer_shard_bucket_index (line 384) | def _get_dp_buffer_shard_bucket_index( class Bucket (line 444) | class Bucket: class TemporaryBucketAllocator (line 461) | class TemporaryBucketAllocator: method __init__ (line 497) | def __init__(self): method allocate (line 500) | def allocate( method free (line 515) | def free(self, bucket_id: int): class StorageResizeBasedBucketAllocator (line 524) | class StorageResizeBasedBucketAllocator(TemporaryBucketAllocator): method __init__ (line 530) | def __init__(self): method allocate (line 533) | def allocate( method free (line 550) | def free(self, bucket_id: int): class RotaryBucketAllocator (line 558) | class RotaryBucketAllocator(TemporaryBucketAllocator): method __init__ (line 589) | def __init__(self, name: str): method allocate (line 596) | def allocate( method _get_gbuf_name (line 630) | def _get_gbuf_name(self, buffer_id: int): method free (line 633) | def free(self, bucket_id: int): class FixedPoolAllocator (line 642) | class FixedPoolAllocator(TemporaryBucketAllocator): method __init__ (line 652) | def __init__( method _is_two_bucket_group_equal (line 729) | def _is_two_bucket_group_equal(self, group_a, group_b): method allocate (line 743) | def allocate( method _get_gbuf_name (line 806) | def _get_gbuf_name(self, buf_group_id: int, bucket_index: int): method free (line 809) | def free(self, bucket_id: int): class DataParallelBuffer (line 828) | class DataParallelBuffer: method __init__ (line 846) | def __init__( method init_data (line 932) | def init_data(self, data: torch.Tensor): method fetch_bucket (line 942) | def fetch_bucket( method allocate_bucket_storage (line 987) | def allocate_bucket_storage( method free_bucket_storage (line 1043) | def free_bucket_storage(self): method reset_param_main_grad (line 1049) | def reset_param_main_grad(self): method _get_item_slice_in_shard (line 1058) | def _get_item_slice_in_shard(self, item_id: int) -> Tuple[int, int]: method locate_item_in_global_item (line 1106) | def locate_item_in_global_item(self, item_id: int) -> Tuple[int, int]: method _get_item_local_shard_index (line 1127) | def _get_item_local_shard_index(self, item_id: int) -> Tuple[int, int]: method _get_item_local_index (line 1171) | def _get_item_local_index(self, item_id: int) -> Tuple[int, int]: method set_item (line 1188) | def set_item(self, item_id: int, item_data: torch.Tensor) -> None: method get_item (line 1223) | def get_item(self, item_id: int, only_shard: bool = False) -> torch.Te... method get_item_from_bucket (line 1257) | def get_item_from_bucket(self, bucket: Bucket, item_id: int): method get_shard_from_bucket (line 1268) | def get_shard_from_bucket(self, bucket: Bucket): method get_shard_from_local_buffer (line 1278) | def get_shard_from_local_buffer(self) -> torch.Tensor: class ParameterGroup (line 1290) | class ParameterGroup: function _get_parameter_groups (line 1349) | def _get_parameter_groups( class ParamAndGradBuffer (line 1583) | class ParamAndGradBuffer: method __init__ (line 1625) | def __init__( method get_mem_alloc_context (line 1770) | def get_mem_alloc_context(self, groups=None, symmetric=True): method manual_buffer_registration (line 1834) | def manual_buffer_registration(self): method _log_parameter_groups (line 1869) | def _log_parameter_groups(self): method _init_each_parameter_group_buffers (line 1918) | def _init_each_parameter_group_buffers(self, meta_device_init_fp8_para... method _reset_parameters (line 2615) | def _reset_parameters(self, old_params, new_params): method scale_gradients (line 2659) | def scale_gradients(self, scaling_factor: float) -> None: method zero_grad (line 2667) | def zero_grad(self): method _init_distributed_params (line 2685) | def _init_distributed_params(self): method _init_optimizer_named_parameters (line 2756) | def _init_optimizer_named_parameters(self) -> List[Tuple[str, torch.nn... method update_main_grads (line 2813) | def update_main_grads(self): method num_buckets (line 2879) | def num_buckets(self): method copy_main_weights_to_model_weights (line 2884) | def copy_main_weights_to_model_weights(self): method copy_model_weights_to_main_weights (line 3055) | def copy_model_weights_to_main_weights(self): method all_gather_parameters (line 3073) | def all_gather_parameters(self, async_op: bool = True): method reduce_scatter_gradients (line 3104) | def reduce_scatter_gradients(self, async_op: bool = True): method all_reduce_gradients (line 3140) | def all_reduce_gradients(self, async_op: bool = False): class BucketStatus (line 3176) | class BucketStatus(Enum): class GradReducePipeline (line 3191) | class GradReducePipeline: method __init__ (line 3196) | def __init__( method num_buckets (line 3227) | def num_buckets(self): method reset (line 3231) | def reset(self): method reduce_gradients (line 3251) | def reduce_gradients( method wait_for_previous_grad_reduce (line 3295) | def wait_for_previous_grad_reduce( method _enforce_double_buffer_limit (line 3327) | def _enforce_double_buffer_limit(self, add_buckets): method get_ready_bucket_group_for_reduction (line 3349) | def get_ready_bucket_group_for_reduction(self, bucket_id: int) -> Opti... method get_fsdp_buffer (line 3375) | def get_fsdp_buffer(self, bucket_id: int) -> DataParallelBuffer: method _bucket_group_gradient_reduce (line 3382) | def _bucket_group_gradient_reduce( class PrefetchOrder (line 3634) | class PrefetchOrder(Enum): class AllGatherPipeline (line 3647) | class AllGatherPipeline: method __init__ (line 3652) | def __init__( method get_bucket_key (line 3696) | def get_bucket_key(self, bucket_id, bwd): method num_buckets (line 3704) | def num_buckets(self): method reset (line 3708) | def reset(self): method all_gather_params (line 3737) | def all_gather_params( method wait_bucket_ready (line 3922) | def wait_bucket_ready(self, bucket_id, bwd, empty_ok=False): method release_bucket (line 3942) | def release_bucket(self, bucket_id, bwd, lazy: bool = False): method recycle_unused_buckets (line 3995) | def recycle_unused_buckets(self): method get_fsdp_buffer (line 4003) | def get_fsdp_buffer(self, bucket_id: int, bwd=False) -> DataParallelBu... method async_bucket_gather (line 4020) | def async_bucket_gather(self, bucket_id, bwd) -> None: function gradient_reduce_preprocessing (line 4067) | def gradient_reduce_preprocessing(grad_data, scaling_factor, ddp_config): function _check_nan_in_grad (line 4092) | def _check_nan_in_grad(grad: torch.Tensor): function check_gpu_memory (line 4104) | def check_gpu_memory(threshold=0.9): class ResetParametersContext (line 4134) | class ResetParametersContext: method __init__ (line 4139) | def __init__(self, init_param_with_fp8=False, with_cuda_rng_tracker=Fa... method __enter__ (line 4143) | def __enter__(self): method __exit__ (line 4177) | def __exit__(self, *exc_details): function override_sharded_param_methods_with_safety_checks (line 4181) | def override_sharded_param_methods_with_safety_checks(params, all_gather... function _dtype_size (line 4221) | def _dtype_size(dtype: torch.dtype) -> int: function to_local_if_dtensor (line 4252) | def to_local_if_dtensor(tensor): function _get_fsdp_tensor_spec (line 4265) | def _get_fsdp_tensor_spec( function make_fsdp_dtensor (line 4341) | def make_fsdp_dtensor( FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py function gather_and_compute_chunk_metadata (line 31) | def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageM... function update_uneven_dtensor_chunk_metadata (line 98) | def update_uneven_dtensor_chunk_metadata(dtensor: DTensor) -> dict: function validate_uneven_dtensor (line 141) | def validate_uneven_dtensor(dtensor: DTensor) -> None: function filter_unflattened_state_dict (line 208) | def filter_unflattened_state_dict(state_dict, key_chain=[], visit_condit... function get_unflattened_state_dict (line 227) | def get_unflattened_state_dict(state_dict, key_chain=[]): function preprocess_state_dict_for_uneven_dtensor (line 240) | def preprocess_state_dict_for_uneven_dtensor(state_dict: dict) -> dict: function gather_uneven_dtensor_to_full_tensor (line 258) | def gather_uneven_dtensor_to_full_tensor( function _assemble_full_tensor_from_uneven_chunks (line 333) | def _assemble_full_tensor_from_uneven_chunks( function _intersection (line 402) | def _intersection(s1, s2): function _offset_slice (line 411) | def _offset_slice(s, offset): function split_dtensor (line 415) | def split_dtensor( FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py function get_te_version (line 59) | def get_te_version(): function is_te_min_version (line 76) | def is_te_min_version(vers, check_equality=True): function is_submodule (line 88) | def is_submodule(module, parent_module, strict=True): function get_mesh_names (line 101) | def get_mesh_names( function contains_submesh (line 147) | def contains_submesh( function _get_cuda_rng_state (line 162) | def _get_cuda_rng_state( function _set_cuda_rng_state (line 193) | def _set_cuda_rng_state(new_state: torch.Tensor, device: int = -1, graph... function initialize_rng_tracker (line 235) | def initialize_rng_tracker( function get_cuda_rng_tracker (line 427) | def get_cuda_rng_tracker( function safe_get_rank (line 437) | def safe_get_rank() -> int: function log_single_rank (line 457) | def log_single_rank(logger_: logging.Logger, level: int, msg: str, *args... class FSDPDistributedIndex (line 465) | class FSDPDistributedIndex: method __init__ (line 474) | def __init__( method get_submesh (line 627) | def get_submesh( method get_dp_group (line 671) | def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: method get_fsdp_group (line 681) | def get_fsdp_group( method get_outer_fsdp_group (line 691) | def get_outer_fsdp_group(self, is_expert_parallel: bool = False) -> Pr... method get_root_mesh (line 699) | def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh: method get_logical_hybrid_fsdp_rank (line 708) | def get_logical_hybrid_fsdp_rank(self, is_expert_parallel: bool = False): class GlobalMemoryBuffer (line 752) | class GlobalMemoryBuffer: method __init__ (line 757) | def __init__(self): method get_tensor (line 760) | def get_tensor(self, tensor_shape, dtype, name, mem_alloc_context: Opt... function get_global_memory_buffer (line 781) | def get_global_memory_buffer(): function create_updated_function_signature (line 789) | def create_updated_function_signature(original_function, **extended_kwar... function is_mcore_tensor_model_parallel (line 813) | def is_mcore_tensor_model_parallel(param: torch.Tensor) -> bool: function is_mcore_tensor_parallel_duplicated (line 820) | def is_mcore_tensor_parallel_duplicated(param: torch.Tensor) -> bool: function get_mcore_tensor_parallel_partition_dim (line 827) | def get_mcore_tensor_parallel_partition_dim(param: torch.Tensor) -> Opti... FILE: megatron/core/distributed/param_and_grad_buffer.py class BufferType (line 48) | class BufferType(Enum): function shard_buffer (line 57) | def shard_buffer(buffer: torch.Tensor, data_parallel_world_size: int): class _ParamAndGradBucket (line 69) | class _ParamAndGradBucket: method __init__ (line 87) | def __init__( method set_layerwise_params_list (line 122) | def set_layerwise_params_list(self, layerwise_params_list: List[List[t... class _LayerwiseAllGatherHandle (line 136) | class _LayerwiseAllGatherHandle: method __init__ (line 143) | def __init__(self, handles): method wait (line 146) | def wait(self): class _ParamAndGradBucketGroup (line 153) | class _ParamAndGradBucketGroup: method __init__ (line 168) | def __init__( method reset (line 242) | def reset(self): method check_grads (line 254) | def check_grads(self, check_for_nan_or_inf, check_for_large): method start_param_sync (line 292) | def start_param_sync(self, force_sync: bool = False): method finish_param_sync (line 427) | def finish_param_sync(self, skip_next_bucket_dispatch: bool = False): method start_grad_sync (line 515) | def start_grad_sync(self, force_all_reduce: Optional[bool] = False): method finish_grad_sync (line 658) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False): method free_overlap_buffers (line 690) | def free_overlap_buffers(self): method register_grad_ready (line 705) | def register_grad_ready( class _ParamAndGradBuffer (line 730) | class _ParamAndGradBuffer: method __init__ (line 752) | def __init__( method scale_gradients (line 1060) | def scale_gradients(self, scaling_factor: float) -> None: method _get (line 1064) | def _get(self, shape: torch.Size, start_index: int, buffer_type: Buffe... method _new_bucket (line 1081) | def _new_bucket( method reset (line 1125) | def reset(self): method offload_to_cpu (line 1131) | def offload_to_cpu(self, move_params: bool = True, move_grads: bool = ... method reload_from_cpu (line 1146) | def reload_from_cpu(self, move_params: bool = True, move_grads: bool =... function partition_buckets (line 1164) | def partition_buckets( FILE: megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py class _ReduceScatterWithFP32AccumulationWorkHandle (line 9) | class _ReduceScatterWithFP32AccumulationWorkHandle: method __init__ (line 13) | def __init__( method wait (line 26) | def wait(self): function reduce_scatter_with_fp32_accumulation (line 42) | def reduce_scatter_with_fp32_accumulation( FILE: megatron/core/distributed/torch_fully_sharded_data_parallel.py class TorchFullyShardedDataParallel (line 28) | class TorchFullyShardedDataParallel(_BaseDataParallel): method __init__ (line 55) | def __init__( method load_state_dict (line 150) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/core/distributed/torch_fully_sharded_data_parallel_config.py class TorchFullyShardedDataParallelConfig (line 10) | class TorchFullyShardedDataParallelConfig(DistributedDataParallelConfig): FILE: megatron/core/energy_monitor.py class EnergyMonitor (line 22) | class EnergyMonitor: method __init__ (line 30) | def __init__(self) -> None: method setup (line 37) | def setup(self) -> None: method shutdown (line 43) | def shutdown(self) -> None: method pause (line 48) | def pause(self) -> None: method resume (line 54) | def resume(self) -> None: method _get_energy (line 59) | def _get_energy(self) -> int: method lap (line 66) | def lap(self) -> float: method get_total (line 83) | def get_total(self) -> float: FILE: megatron/core/enums.py class ModelType (line 6) | class ModelType(enum.Enum): class Fp8Recipe (line 12) | class Fp8Recipe(str, enum.Enum): class Fp4Recipe (line 22) | class Fp4Recipe(str, enum.Enum): FILE: megatron/core/export/export_config.py class ExportConfig (line 9) | class ExportConfig: method __post_init__ (line 23) | def __post_init__(self): FILE: megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py class TRTLLMEngineBuilder (line 19) | class TRTLLMEngineBuilder: method build_and_save_engine (line 23) | def build_and_save_engine( FILE: megatron/core/export/trtllm/trtllm_helper.py class TRTLLMHelper (line 39) | class TRTLLMHelper: method __init__ (line 42) | def __init__( method _get_trtllm_config (line 110) | def _get_trtllm_config( method _load_scaling_factors (line 210) | def _load_scaling_factors(self, model_state_dict: dict) -> dict: method get_trtllm_pretrained_config_and_model_weights (line 264) | def get_trtllm_pretrained_config_and_model_weights( method _add_scales_to_converter (line 352) | def _add_scales_to_converter( method _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting (line 377) | def _get_trtllm_pretrained_config_and_model_weights_in_distributed_set... method _get_trtllm_pretrained_config_and_model_weights_list_on_single_device (line 451) | def _get_trtllm_pretrained_config_and_model_weights_list_on_single_dev... method build_and_save_engine (line 532) | def build_and_save_engine( FILE: megatron/core/export/trtllm/trtllm_layers.py class TRTLLMLayers (line 8) | class TRTLLMLayers(Enum): method return_layer_name_and_number (line 56) | def return_layer_name_and_number(layer_name: str) -> Tuple[str, int]: method rename_input_layer_names_to_trtllm_layer_names (line 80) | def rename_input_layer_names_to_trtllm_layer_names( function get_layer_name_without_prefix (line 157) | def get_layer_name_without_prefix(layer: TRTLLMLayers) -> str: FILE: megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py function str_dtype_to_torch (line 23) | def str_dtype_to_torch(dtype: DataType): class DistributedTRTLLMModelWeightsConverter (line 31) | class DistributedTRTLLMModelWeightsConverter: method __init__ (line 37) | def __init__( method _add_to_trtllm_model_weights (line 82) | def _add_to_trtllm_model_weights(self, val: torch.Tensor, layer_name: ... method _convert_transformer_layer (line 100) | def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): method _convert_non_transformer_layer (line 195) | def _convert_non_transformer_layer(self, model_state_dict: dict, layer... method _get_remove_vocab_padding (line 209) | def _get_remove_vocab_padding(self, layer_name, model_state_dict, toke... method convert (line 236) | def convert( FILE: megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py function pad_vocab_size (line 26) | def pad_vocab_size(vocab_size: int, tp_size: int): function str_dtype_to_torch (line 33) | def str_dtype_to_torch(dtype: DataType): class SingleDeviceTRTLLMModelWeightsConverter (line 40) | class SingleDeviceTRTLLMModelWeightsConverter: method __init__ (line 43) | def __init__( method _convert_non_transformer_layer (line 81) | def _convert_non_transformer_layer(self, model_state_dict: dict, layer... method _cast_value (line 95) | def _cast_value(self, val: torch.Tensor, layer_name: str) -> torch.Ten... method _convert_transformer_layer (line 114) | def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): method convert (line 332) | def convert( method get_padded_vocab_size (line 405) | def get_padded_vocab_size(self) -> int: method get_local_model_weights_per_gpu (line 422) | def get_local_model_weights_per_gpu(self, mapping, trtllm_model_config... FILE: megatron/core/export/trtllm/trtllm_weights_converter/utils.py function is_gated_activation (line 6) | def is_gated_activation(helper): FILE: megatron/core/extensions/transformer_engine.py class TransformerEngineConfigType (line 85) | class TransformerEngineConfigType(enum.Enum): class TEQuantizationRecipe (line 92) | class TEQuantizationRecipe: method parse_from_config (line 128) | def parse_from_config(cls, quant_config: Dict[Any, Any]) -> "TEQuantiz... method get_config_keys (line 157) | def get_config_keys(cls) -> Set[str]: class TEQuantizationParams (line 163) | class TEQuantizationParams: method parse_from_config (line 175) | def parse_from_config(quant_config: QuantizationConfig) -> "TEQuantiza... function _get_fp8_autocast_for_quant_recipe (line 208) | def _get_fp8_autocast_for_quant_recipe(qrecipe: TEQuantizationRecipe): function _get_fp8_autocast_for_quant_params (line 259) | def _get_fp8_autocast_for_quant_params(qparams: TEQuantizationParams | N... function _get_should_context_be_quantized_recipe (line 268) | def _get_should_context_be_quantized_recipe( function _get_should_context_be_quantized_params (line 284) | def _get_should_context_be_quantized_params( function _get_extra_te_kwargs (line 299) | def _get_extra_te_kwargs(config: TransformerConfig): function condition_init_method (line 312) | def condition_init_method(config, init_method): function split_te_layernorm_column_parallel_linear (line 317) | def split_te_layernorm_column_parallel_linear( class TEActivationOp (line 400) | class TEActivationOp: method __new__ (line 406) | def __new__(cls, config: TransformerConfig): class TEFusedResidualRMSNorm (line 438) | class TEFusedResidualRMSNorm(te.pytorch.RMSNorm): method __init__ (line 453) | def __init__(self, *args, **kwargs): method _make_fused_impl (line 458) | def _make_fused_impl(self) -> te.pytorch.ops.Sequential: method _register_hooks_on_fused_impl (line 493) | def _register_hooks_on_fused_impl(self, fused_impl: torch.nn.Module) -... method forward (line 574) | def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, ... class TENorm (line 602) | class TENorm: method __new__ (line 617) | def __new__( class TELinear (line 661) | class TELinear(te.pytorch.Linear): method __init__ (line 676) | def __init__( method finish_init (line 849) | def finish_init(self, quantization_config: QuantizationConfig): method will_execute_quantized (line 856) | def will_execute_quantized(self, is_context_quantized: bool) -> bool: method forward (line 862) | def forward(self, x): method sharded_state_dict (line 880) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... method backward_dw (line 898) | def backward_dw(self): class TELayerNormColumnParallelLinear (line 904) | class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear): method __init__ (line 908) | def __init__( method finish_init (line 1070) | def finish_init(self, quantization_config: QuantizationConfig): method will_execute_quantized (line 1077) | def will_execute_quantized(self, is_context_quantized: bool) -> bool: method forward (line 1083) | def forward(self, x): method sharded_state_dict (line 1102) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... method extra_repr (line 1116) | def extra_repr(self) -> str: method backward_dw (line 1125) | def backward_dw(self): class TEColumnParallelLinear (line 1131) | class TEColumnParallelLinear(TELinear): method __init__ (line 1135) | def __init__( method sharded_state_dict (line 1213) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... method extra_repr (line 1226) | def extra_repr(self) -> str: method backward_dw (line 1235) | def backward_dw(self): class TERowParallelLinear (line 1241) | class TERowParallelLinear(TELinear): method __init__ (line 1245) | def __init__( method sharded_state_dict (line 1317) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... method extra_repr (line 1330) | def extra_repr(self) -> str: method backward_dw (line 1339) | def backward_dw(self): class TEDotProductAttention (line 1345) | class TEDotProductAttention(te.pytorch.DotProductAttention): method __init__ (line 1356) | def __init__( method forward (line 1542) | def forward( method sharded_state_dict (line 1645) | def sharded_state_dict( class TEGroupedLinear (line 1668) | class TEGroupedLinear(te.pytorch.GroupedLinear): method __init__ (line 1677) | def __init__( method finish_init (line 1874) | def finish_init(self, quantization_config: QuantizationConfig): method will_execute_quantized (line 1881) | def will_execute_quantized(self, is_context_quantized: bool) -> bool: method forward (line 1887) | def forward(self, x, m_splits): method _encode_extra_state (line 1905) | def _encode_extra_state(self, state): method _decode_extra_state (line 1916) | def _decode_extra_state(self, state): method _split_extra_state (line 1928) | def _split_extra_state(self, state): method _sharded_state_dict_grouped (line 1969) | def _sharded_state_dict_grouped( method backward_dw (line 2032) | def backward_dw(self): class TEColumnParallelGroupedLinear (line 2040) | class TEColumnParallelGroupedLinear(TEGroupedLinear): method __init__ (line 2046) | def __init__( method sharded_state_dict (line 2074) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... class TERowParallelGroupedLinear (line 2086) | class TERowParallelGroupedLinear(TEGroupedLinear): method __init__ (line 2092) | def __init__( method sharded_state_dict (line 2120) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... class TEFusedMLP (line 2138) | class TEFusedMLP(MLP): method __init__ (line 2142) | def __init__(self, *args, **kwargs): method _make_fused_impl (line 2148) | def _make_fused_impl(self) -> te.pytorch.ops.Sequential: method _make_activation_op (line 2274) | def _make_activation_op( method _register_hooks_on_fused_impl (line 2310) | def _register_hooks_on_fused_impl(self, fused_impl: torch.nn.Module) -... method forward (line 2396) | def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tens... class TEDelayedScaling (line 2423) | class TEDelayedScaling(te.common.recipe.DelayedScaling): method __init__ (line 2428) | def __init__( class TECudaRNGStatesTracker (line 2459) | class TECudaRNGStatesTracker(te.pytorch.distributed.CudaRNGStatesTracker): method __init__ (line 2463) | def __init__(self, is_inference_rng_tracker=False): method is_initialized (line 2474) | def is_initialized(self): method reset (line 2478) | def reset(self): method set_states (line 2483) | def set_states(self, states): method add (line 2488) | def add(self, name, seed): function te_checkpoint (line 2494) | def te_checkpoint( function get_cpu_offload_context (line 2534) | def get_cpu_offload_context( function fused_apply_rotary_pos_emb (line 2575) | def fused_apply_rotary_pos_emb( function fused_apply_rotary_pos_emb_thd (line 2596) | def fused_apply_rotary_pos_emb_thd( function te_parallel_cross_entropy (line 2683) | def te_parallel_cross_entropy( function te_general_gemm (line 2711) | def te_general_gemm( function set_save_original_input (line 2764) | def set_save_original_input(module): FILE: megatron/core/extensions/transformer_engine_spec_provider.py class _TENormWithResidual (line 31) | class _TENormWithResidual: method __new__ (line 34) | def __new__(cls, *args, **kwargs): class TESpecProvider (line 38) | class TESpecProvider(BackendSpecProvider): method linear (line 41) | def linear(self) -> type: method column_parallel_linear (line 45) | def column_parallel_linear(self) -> type: method row_parallel_linear (line 49) | def row_parallel_linear(self) -> type: method fuse_layernorm_and_linear (line 53) | def fuse_layernorm_and_linear(self) -> bool: method column_parallel_layer_norm_linear (line 57) | def column_parallel_layer_norm_linear(self) -> Optional[type]: method layer_norm (line 61) | def layer_norm( method core_attention (line 73) | def core_attention(self) -> type: method grouped_mlp_modules (line 77) | def grouped_mlp_modules( method activation_func (line 102) | def activation_func(self) -> TEActivationFunctionBuilder | None: FILE: megatron/core/fp4_utils.py function is_nvfp4tensor (line 46) | def is_nvfp4tensor(tensor: torch.Tensor) -> bool: function get_fp4_align_size (line 51) | def get_fp4_align_size(fp4_recipe: Fp4Recipe) -> int: function dequantize_fp4_tensor (line 83) | def dequantize_fp4_tensor(fp4_tensor: torch.Tensor) -> torch.Tensor: function get_fp4_recipe (line 94) | def get_fp4_recipe(config: TransformerConfig): function get_fp4_context (line 122) | def get_fp4_context(config: TransformerConfig, layer_no: int = -1, is_in... function get_fp4_recipe (line 167) | def get_fp4_recipe(config: TransformerConfig): function get_fp4_context (line 171) | def get_fp4_context(config: TransformerConfig, layer_no: int = -1, is_in... FILE: megatron/core/fp8_utils.py function is_float8tensor (line 96) | def is_float8tensor(tensor: torch.Tensor) -> bool: function is_mxfp8tensor (line 108) | def is_mxfp8tensor(tensor: torch.Tensor) -> bool: function dequantize_fp8_tensor (line 113) | def dequantize_fp8_tensor(fp8_tensor: torch.Tensor) -> torch.Tensor: function _resolve_callable_from_python_import_path (line 121) | def _resolve_callable_from_python_import_path(dotted_path: str): function _get_custom_recipe (line 155) | def _get_custom_recipe(quantizer_factory_python_path: str) -> Union[Fp8R... function get_fp8_align_size (line 168) | def get_fp8_align_size(fp8_recipe: Fp8Recipe) -> int: function is_column_parallel_linear (line 176) | def is_column_parallel_linear(module): function is_row_parallel_linear (line 188) | def is_row_parallel_linear(module): function _modify_underlying_storage_impl (line 226) | def _modify_underlying_storage_impl( function _quantize_param_shard_impl (line 233) | def _quantize_param_shard_impl( function _correct_amax_history_if_needed_impl (line 267) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -... function _modify_underlying_storage_impl (line 275) | def _modify_underlying_storage_impl( function _quantize_param_shard_impl (line 284) | def _quantize_param_shard_impl( function _correct_amax_history_if_needed_impl (line 359) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -... function _modify_underlying_storage_impl (line 367) | def _modify_underlying_storage_impl(tensor: Float8Tensor, new_raw_data: ... function _quantize_param_shard_impl (line 374) | def _quantize_param_shard_impl( function _correct_amax_history_if_needed_impl (line 446) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -... function _modify_underlying_storage_impl (line 461) | def _modify_underlying_storage_impl(*args, **kwargs): function _quantize_param_shard_impl (line 464) | def _quantize_param_shard_impl(model_params, *args, **kwargs): function _correct_amax_history_if_needed_impl (line 471) | def _correct_amax_history_if_needed_impl(*args, **kwargs): function modify_underlying_storage (line 478) | def modify_underlying_storage(tensor: torch.Tensor, new_raw_data: torch.... function quantize_param_shard (line 484) | def quantize_param_shard( function correct_amax_history_if_needed (line 494) | def correct_amax_history_if_needed(model: List[torch.nn.Module]): function post_all_gather_processing (line 499) | def post_all_gather_processing(model_params): function is_first_last_bf16_layer (line 513) | def is_first_last_bf16_layer(config: TransformerConfig, layer_no: int): function get_fp8_recipe (line 536) | def get_fp8_recipe(config: TransformerConfig): function get_fp8_context (line 596) | def get_fp8_context(config: TransformerConfig, layer_no: int = -1, is_in... function get_fp8_recipe (line 658) | def get_fp8_recipe(config: TransformerConfig): function get_fp8_context (line 662) | def get_fp8_context(config: TransformerConfig, layer_no: int = -1, is_in... function _wrap_te_linear_for_padding (line 673) | def _wrap_te_linear_for_padding(module: torch.nn.Module): function prepare_model_for_fp8_inference (line 757) | def prepare_model_for_fp8_inference(model): function prepare_model_for_fp8_inference (line 780) | def prepare_model_for_fp8_inference(model): FILE: megatron/core/full_cuda_graph.py function copy_tensors_in_struct (line 19) | def copy_tensors_in_struct(src): function clone_tensors_in_struct (line 33) | def clone_tensors_in_struct(tgt, src): class StaticBufferLoader (line 57) | class StaticBufferLoader: method __init__ (line 62) | def __init__(self): method __call__ (line 65) | def __call__(self, inputs, stage, microbatch): class FullCudaGraphWrapper (line 94) | class FullCudaGraphWrapper: method __init__ (line 101) | def __init__(self, forward_backward_func, cuda_graph_warmup_steps=1): method data_read (line 106) | def data_read(self, data_iterator, model, training, num_microbatches): method __call__ (line 139) | def __call__(self, *args, **kwargs): method curr_iter (line 192) | def curr_iter(self, stage): method next_iter (line 196) | def next_iter(self, stage): FILE: megatron/core/fusions/fused_bias_dropout.py function _bias_dropout_add_func (line 11) | def _bias_dropout_add_func(x_with_bias, residual, prob, training): function bias_dropout_add_unfused (line 62) | def bias_dropout_add_unfused(training): function bias_dropout_add_fused_train (line 70) | def bias_dropout_add_fused_train( function bias_dropout_add_fused_inference (line 77) | def bias_dropout_add_fused_inference( function get_bias_dropout_add (line 83) | def get_bias_dropout_add(training, fused): FILE: megatron/core/fusions/fused_bias_geglu.py function geglu (line 17) | def geglu(y): function bias_geglu (line 31) | def bias_geglu(bias, y): function geglu_back (line 49) | def geglu_back(g, y): function bias_geglu_back (line 69) | def bias_geglu_back(g, y, bias): class BiasGeGLUFunction (line 84) | class BiasGeGLUFunction(torch.autograd.Function): method forward (line 89) | def forward(ctx, input, bias): method backward (line 104) | def backward(ctx, grad_output): class GeGLUFunction (line 119) | class GeGLUFunction(torch.autograd.Function): method forward (line 124) | def forward(ctx, input): method backward (line 138) | def backward(ctx, grad_output): function bias_geglu_impl (line 153) | def bias_geglu_impl(input, bias): function quick_gelu (line 185) | def quick_gelu(y: torch.Tensor) -> torch.Tensor: function quick_geglu (line 191) | def quick_geglu(y: torch.Tensor, linear_offset: float = 0.0) -> torch.Te... function weighted_quick_geglu (line 206) | def weighted_quick_geglu( function quick_geglu_back (line 221) | def quick_geglu_back(g, y, linear_offset: float = 0.0) -> torch.Tensor: function weighted_quick_geglu_back (line 240) | def weighted_quick_geglu_back(g, y, weights, linear_offset: float = 0.0): function weighted_bias_quick_geglu (line 259) | def weighted_bias_quick_geglu( function weighted_bias_quick_geglu_back (line 279) | def weighted_bias_quick_geglu_back(g, y, bias, weights, linear_offset: f... class WeightedQuickGeGLUFunction (line 303) | class WeightedQuickGeGLUFunction(torch.autograd.Function): method forward (line 307) | def forward( method backward (line 333) | def backward(ctx, grad_output): class WeightedBiasQuickGeGLUFunction (line 350) | class WeightedBiasQuickGeGLUFunction(torch.autograd.Function): method forward (line 354) | def forward( method backward (line 387) | def backward(ctx, grad_output): function weighted_bias_quick_geglu_impl (line 410) | def weighted_bias_quick_geglu_impl( FILE: megatron/core/fusions/fused_bias_gelu.py function bias_gelu (line 17) | def bias_gelu(bias, y): function bias_gelu_back (line 26) | def bias_gelu_back(g, bias, y): class GeLUFunction (line 36) | class GeLUFunction(torch.autograd.Function): method forward (line 39) | def forward(ctx, input, bias): method backward (line 44) | def backward(ctx, grad_output): method apply (line 51) | def apply(cls, *args, **kwargs): FILE: megatron/core/fusions/fused_bias_swiglu.py function swiglu (line 16) | def swiglu(y): function bias_swiglu (line 30) | def bias_swiglu(y, bias): function weighted_swiglu (line 45) | def weighted_swiglu(y, weights): function swiglu_back (line 55) | def swiglu_back(g, y): function bias_swiglu_back (line 73) | def bias_swiglu_back(g, y, bias): function weighted_swiglu_back (line 90) | def weighted_swiglu_back(g, y, weights): class BiasSwiGLUFunction (line 100) | class BiasSwiGLUFunction(torch.autograd.Function): method forward (line 105) | def forward(ctx, input, bias, fp8_input_store, cpu_offload_input): method backward (line 128) | def backward(ctx, grad_output): class SwiGLUFunction (line 147) | class SwiGLUFunction(torch.autograd.Function): method forward (line 152) | def forward(ctx, input, fp8_input_store, cpu_offload_input): method backward (line 173) | def backward(ctx, grad_output): class WeightedSwiGLUFunction (line 191) | class WeightedSwiGLUFunction(torch.autograd.Function): method forward (line 194) | def forward(ctx, input, weights, fp8_input_store): method backward (line 202) | def backward(ctx, grad_output): function bias_swiglu_impl (line 209) | def bias_swiglu_impl(input, bias, fp8_input_store=False, cpu_offload_inp... function weighted_bias_swiglu_impl (line 239) | def weighted_bias_swiglu_impl(input, bias, weights, fp8_input_store=False): FILE: megatron/core/fusions/fused_cross_entropy.py function calculate_logits_max (line 13) | def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[t... function calculate_predicted_logits (line 26) | def calculate_predicted_logits( function calculate_cross_entropy_loss (line 48) | def calculate_cross_entropy_loss( function calculate_gradients (line 65) | def calculate_gradients( class _VocabParallelCrossEntropy (line 87) | class _VocabParallelCrossEntropy(torch.autograd.Function): method forward (line 89) | def forward(ctx, vocab_parallel_logits, target, tp_group): method backward (line 124) | def backward(ctx, grad_output): function fused_vocab_parallel_cross_entropy (line 136) | def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target, tp... FILE: megatron/core/fusions/fused_indices_converter.py function _indices_to_multihot_kernel (line 32) | def _indices_to_multihot_kernel( function _multihot_to_indices_kernel (line 112) | def _multihot_to_indices_kernel( class IndicesToMultihot (line 176) | class IndicesToMultihot(torch.autograd.Function): method forward (line 186) | def forward(ctx, indices, probs_indices, num_of_local_experts): method backward (line 239) | def backward(ctx, grad_multihot_indices, grad_probs_in_multihot): function fused_indices_to_multihot (line 282) | def fused_indices_to_multihot(indices, probs_indices, num_of_local_exper... FILE: megatron/core/fusions/fused_layer_norm.py class FusedLayerNorm (line 30) | class FusedLayerNorm(torch.nn.Module): method __init__ (line 52) | def __init__( method reset_parameters (line 122) | def reset_parameters(self): method forward (line 131) | def forward(self, input: Tensor) -> Tensor: FILE: megatron/core/fusions/fused_mla_yarn_rope_apply.py function _get_thd_token_idx (line 31) | def _get_thd_token_idx(cu_seqlens, pid_m, seq_num, cp_rank, cp_size): function rotary_fwd_q_kernel (line 68) | def rotary_fwd_q_kernel( function rotary_bwd_q_kernel (line 148) | def rotary_bwd_q_kernel( class ApplyMLARotaryEmbQ (line 210) | class ApplyMLARotaryEmbQ(torch.autograd.Function): method forward (line 216) | def forward( method backward (line 285) | def backward(ctx, grad): function fused_apply_mla_rope_for_q (line 327) | def fused_apply_mla_rope_for_q( function rotary_fwd_kv_kernel (line 379) | def rotary_fwd_kv_kernel( function rotary_bwd_kv_kernel (line 487) | def rotary_bwd_kv_kernel( class ApplyMLARotaryEmbKV (line 581) | class ApplyMLARotaryEmbKV(torch.autograd.Function): method forward (line 587) | def forward( method backward (line 675) | def backward(ctx, dk, dv): function fused_apply_mla_rope_for_kv (line 735) | def fused_apply_mla_rope_for_kv( FILE: megatron/core/fusions/fused_pad_routing_map.py function _pad_routing_map_kernel (line 31) | def _pad_routing_map_kernel( function fused_pad_routing_map (line 74) | def fused_pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) ... FILE: megatron/core/fusions/fused_softmax.py class ScaledUpperTriangMaskedSoftmax (line 11) | class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): method forward (line 20) | def forward(ctx, inputs, scale): method backward (line 40) | def backward(ctx, output_grads): class ScaledMaskedSoftmax (line 60) | class ScaledMaskedSoftmax(torch.autograd.Function): method forward (line 69) | def forward(ctx, inputs, mask, scale): method backward (line 90) | def backward(ctx, output_grads): class ScaledSoftmax (line 108) | class ScaledSoftmax(torch.autograd.Function): method forward (line 116) | def forward(ctx, inputs, scale): method backward (line 136) | def backward(ctx, output_grads): class SoftmaxOne (line 154) | class SoftmaxOne(nn.Module): method __init__ (line 161) | def __init__( method forward (line 168) | def forward(self, x: torch.Tensor) -> torch.Tensor: class FusedScaleMaskSoftmax (line 179) | class FusedScaleMaskSoftmax(nn.Module): method __init__ (line 193) | def __init__( method forward (line 219) | def forward( method is_kernel_available (line 238) | def is_kernel_available(self, mask, b, np, sq, sk): method forward_fused_softmax (line 272) | def forward_fused_softmax(self, input, mask): method forward_torch_softmax (line 299) | def forward_torch_softmax(self, input, mask, softmax_offset=None): method get_batch_per_block (line 345) | def get_batch_per_block(sq, sk, b, np): FILE: megatron/core/fusions/fused_weighted_squared_relu.py function weighted_squared_relu (line 14) | def weighted_squared_relu(x: torch.Tensor, weights: torch.Tensor) -> tor... function _squared_relu_back (line 32) | def _squared_relu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor: function weighted_squared_relu_back (line 41) | def weighted_squared_relu_back(g: torch.Tensor, x: torch.Tensor, weights... class WeightedSquaredReLUFunction (line 60) | class WeightedSquaredReLUFunction(torch.autograd.Function): method forward (line 65) | def forward(ctx, input: torch.Tensor, weights: torch.Tensor): method backward (line 79) | def backward(ctx, grad_output: torch.Tensor): function weighted_squared_relu_impl (line 91) | def weighted_squared_relu_impl(input: torch.Tensor, weights: torch.Tenso... FILE: megatron/core/hyper_comm_grid.py class HyperCommGrid (line 33) | class HyperCommGrid: method __init__ (line 82) | def __init__( method create_pg (line 120) | def create_pg(self, dims: Union[str, list[str]], **kwargs: Any) -> dis... method destroy (line 168) | def destroy(self) -> None: method get_pg (line 175) | def get_pg(self, dims: Union[str, list[str]]) -> dist.ProcessGroup: method get_rank_enum (line 190) | def get_rank_enum(self, dims: Union[str, list[str]]) -> list[list[int]]: method _gen_rank_enum (line 206) | def _gen_rank_enum(self, dims: list[str]) -> list[list[int]]: method _order_dims (line 251) | def _order_dims(self, dims: Union[str, list[str]]) -> Tuple[list[str],... FILE: megatron/core/inference/async_stream.py class AsyncStream (line 17) | class AsyncStream: method __init__ (line 24) | def __init__( method put (line 36) | def put(self, item: Union[InferenceRequest, Exception]) -> None: method finish (line 41) | def finish(self, exception: Optional[Union[BaseException, Type[BaseExc... method finished (line 51) | def finished(self) -> bool: method generator (line 55) | async def generator(self) -> AsyncGenerator[InferenceRequest, None]: method _is_raisable (line 70) | def _is_raisable(value: Any): FILE: megatron/core/inference/batch_dimensions_utils.py class InferenceBatchDimensions (line 21) | class InferenceBatchDimensions: method __str__ (line 38) | def __str__(self): method is_applicable_for_batch_dim (line 44) | def is_applicable_for_batch_dim( method is_valid (line 76) | def is_valid( method __hash__ (line 112) | def __hash__(self): method __eq__ (line 119) | def __eq__(self, other: "InferenceBatchDimensions") -> bool: method req_count (line 132) | def req_count(self) -> int: method adjust_batch_dims_for_expert_parallelism (line 139) | def adjust_batch_dims_for_expert_parallelism( class CUDAGraphBatchDimensionBuilder (line 233) | class CUDAGraphBatchDimensionBuilder: method _calculate_cuda_graph_token_counts (line 244) | def _calculate_cuda_graph_token_counts( method generate_cuda_graph_batch_dimensions_list (line 318) | def generate_cuda_graph_batch_dimensions_list( method match_graph_config (line 508) | def match_graph_config( FILE: megatron/core/inference/communication/torch_symm_triton/barrier.py function _send_signal (line 21) | def _send_signal(addrs, sem: tl.constexpr): function _wait_signal (line 43) | def _wait_signal(addrs, sem: tl.constexpr): function symm_mem_sync (line 65) | def symm_mem_sync( FILE: megatron/core/inference/communication/torch_symm_triton/collectives.py function _ag_phase (line 30) | def _ag_phase( function _multimem_all_gather_kernel (line 68) | def _multimem_all_gather_kernel( function _multimem_all_gather_3_kernel (line 95) | def _multimem_all_gather_3_kernel( function _multimem_reduce_scatter_kernel (line 159) | def _multimem_reduce_scatter_kernel( function _kernel_launch_config (line 212) | def _kernel_launch_config(element_size: int, max_numel: int, world_size:... function multimem_all_gather (line 230) | def multimem_all_gather( function multimem_all_gather_fused (line 270) | def multimem_all_gather_fused( function multimem_reduce_scatter (line 323) | def multimem_reduce_scatter( FILE: megatron/core/inference/communication/torch_symm_triton/fused_collectives.py function unpack_bf16x2 (line 23) | def unpack_bf16x2(x, mask): function sum_sq (line 42) | def sum_sq(x, y, z, w, mask): function apply_norm (line 74) | def apply_norm(x, y, z, w, wx, wy, wz, ww, rrms, mask): function _multimem_reduce_scatter_residual_add_kernel (line 115) | def _multimem_reduce_scatter_residual_add_kernel( function fused_multimem_rs_add_norm_ag (line 210) | def fused_multimem_rs_add_norm_ag( FILE: megatron/core/inference/communication/torch_symm_triton/multimem_asm.py function ld_128 (line 21) | def ld_128(ptr, mask, multicast_op: tl.constexpr, reduce_f32: tl.constex... function st_128 (line 111) | def st_128(ptr, x, y, z, w, mask, multicast_op): function add_v8_bf16_from_u32 (line 181) | def add_v8_bf16_from_u32( function asm_rsqrt (line 215) | def asm_rsqrt(x, eps): FILE: megatron/core/inference/communication/torch_symm_triton/utils.py function is_device_nvls_capable (line 20) | def is_device_nvls_capable(device: torch.device) -> bool: function are_tensors_nvls_eligible (line 26) | def are_tensors_nvls_eligible(*tensors: torch.Tensor) -> bool: function get_tid (line 42) | def get_tid(): function get_ntid (line 61) | def get_ntid(): function get_flat_tid (line 80) | def get_flat_tid(): function get_flat_bid (line 90) | def get_flat_bid(): function sync_threads (line 101) | def sync_threads(): FILE: megatron/core/inference/communication_utils.py function is_pipeline_first_stage (line 10) | def is_pipeline_first_stage(pp_group: ProcessGroup): function is_pipeline_last_stage (line 19) | def is_pipeline_last_stage(pp_group: ProcessGroup): function _is_cuda (line 28) | def _is_cuda(tensor): function _is_cuda_contiguous (line 34) | def _is_cuda_contiguous(tensor): function broadcast_from_last_pipeline_stage (line 40) | def broadcast_from_last_pipeline_stage( function recv_from_prev_pipeline_rank_ (line 83) | def recv_from_prev_pipeline_rank_( function send_to_next_pipeline_rank (line 114) | def send_to_next_pipeline_rank( function broadcast_tensor (line 145) | def broadcast_tensor(size, dtype, tensor=None, rank=0, data_parallel=Fal... function broadcast_list (line 169) | def broadcast_list(size, dtype, list_values=None, rank=0, data_parallel=... function broadcast_int_list (line 190) | def broadcast_int_list(size, int_list=None, rank=0, data_parallel=False): function broadcast_float_list (line 202) | def broadcast_float_list(size, float_list=None, rank=0, data_parallel=Fa... FILE: megatron/core/inference/config.py class MambaInferenceStateConfig (line 15) | class MambaInferenceStateConfig: method from_model (line 46) | def from_model( class PrefixCachingEvictionPolicy (line 81) | class PrefixCachingEvictionPolicy(str, Enum): class PrefixCachingCoordinatorPolicy (line 94) | class PrefixCachingCoordinatorPolicy(str, Enum): class KVCacheManagementMode (line 107) | class KVCacheManagementMode(str, Enum): class InferenceConfig (line 121) | class InferenceConfig: FILE: megatron/core/inference/contexts/attention_context/mamba_metadata.py class MambaMetadata (line 10) | class MambaMetadata: method __init__ (line 13) | def __init__(self, max_requests: int, max_tokens: int, mamba_chunk_siz... method reset (line 87) | def reset(self) -> None: method reset_varlen_metadata (line 101) | def reset_varlen_metadata(self) -> None: method update (line 120) | def update( method allocate_slot (line 294) | def allocate_slot(self) -> Optional[int]: method batch_allocate_slots (line 311) | def batch_allocate_slots(self, num_slots: int) -> Optional[torch.Tensor]: method free_slots (line 330) | def free_slots(self, request_indices: torch.Tensor) -> None: FILE: megatron/core/inference/contexts/attention_context/metadata_base.py class MetadataBase (line 4) | class MetadataBase: method __init__ (line 14) | def __init__(self): method update (line 20) | def update(self, *args, **kwargs): method reset (line 26) | def reset(self): method tensor_copy_and_pad (line 32) | def tensor_copy_and_pad( method __str__ (line 68) | def __str__(self): FILE: megatron/core/inference/contexts/attention_context/mha_metadata.py class MHAMetadata (line 9) | class MHAMetadata(MetadataBase): method __init__ (line 14) | def __init__( method update (line 37) | def update( method reset (line 123) | def reset(self): class GraphedMHAMetadata (line 136) | class GraphedMHAMetadata(MHAMetadata): method __init__ (line 141) | def __init__( method update (line 148) | def update( method reset (line 175) | def reset(self): class NonGraphedMHAMetadata (line 179) | class NonGraphedMHAMetadata(MHAMetadata): method update (line 184) | def update( FILE: megatron/core/inference/contexts/attention_context/triton/tensor_ops.py function _tensor_get_slice_after_kernel (line 24) | def _tensor_get_slice_after_kernel( function _tensor_merge_kernel (line 54) | def _tensor_merge_kernel( function _tensor_masked_update_kernel_2d (line 101) | def _tensor_masked_update_kernel_2d( function _tensor_masked_update_kernel_3d (line 141) | def _tensor_masked_update_kernel_3d( function _tensor_masked_update_kernel_4d (line 197) | def _tensor_masked_update_kernel_4d( function _compute_row_size (line 262) | def _compute_row_size(tensor): function tensor_get_slice_after (line 272) | def tensor_get_slice_after(input_tensor, output_tensor, pos_on_device, c... function tensor_merge (line 327) | def tensor_merge( function tensor_masked_update (line 395) | def tensor_masked_update(states: torch.Tensor, idx: torch.Tensor, new_st... FILE: megatron/core/inference/contexts/base_context.py class BaseInferenceContext (line 8) | class BaseInferenceContext(abc.ABC): method __init__ (line 15) | def __init__(self, inference_config: InferenceConfig): method is_static_batching (line 22) | def is_static_batching(self) -> bool: method is_dynamic_batching (line 26) | def is_dynamic_batching(self) -> bool: method increment_sequence_len_offset (line 30) | def increment_sequence_len_offset(self, increment: int) -> None: method increment_batch_size_offset (line 35) | def increment_batch_size_offset(self, increment: int) -> None: method reset_batch_size_offset (line 40) | def reset_batch_size_offset(self) -> None: FILE: megatron/core/inference/contexts/dynamic_context.py class ContextOverflowError (line 97) | class ContextOverflowError(Exception): method __init__ (line 106) | def __init__( class RequestOverflowError (line 117) | class RequestOverflowError(ContextOverflowError): class TokenOverflowError (line 123) | class TokenOverflowError(ContextOverflowError): class MaxSequenceLengthOverflowError (line 129) | class MaxSequenceLengthOverflowError(ContextOverflowError): method __init__ (line 132) | def __init__(self, request_id, message: Optional[str] = None): class BlockOverflowError (line 136) | class BlockOverflowError(ContextOverflowError): class ActiveRequestCountOverflowError (line 142) | class ActiveRequestCountOverflowError(ContextOverflowError): method __init__ (line 146) | def __init__(self, max_request_count, active_request_count): class TensorStateDeallocatedError (line 155) | class TensorStateDeallocatedError(ContextOverflowError): class ContextErrorFactory (line 162) | class ContextErrorFactory: method serialize (line 166) | def serialize(cls, error: ContextOverflowError) -> dict: method deserialize (line 184) | def deserialize(cls, obj: dict) -> ContextOverflowError: function get_mem_size_str (line 206) | def get_mem_size_str(n_bytes: int) -> str: class DynamicInferenceContext (line 217) | class DynamicInferenceContext(BaseInferenceContext): method __init__ (line 248) | def __init__(self, model_config: TransformerConfig, inference_config: ... method _allocate_memory_buffer (line 600) | def _allocate_memory_buffer(self): method _allocate_mamba_states (line 636) | def _allocate_mamba_states(self): method initialize_all_tensors (line 702) | def initialize_all_tensors(self) -> None: method reinitialize_inference_state_buffers (line 801) | def reinitialize_inference_state_buffers(self): method deallocate_inference_state_buffers (line 838) | def deallocate_inference_state_buffers(self): method round_up_tokens (line 879) | def round_up_tokens(cls, value, tp_size=None): method round_up_requests (line 893) | def round_up_requests(cls, value, tp_size=None): method is_static_batching (line 906) | def is_static_batching(self) -> bool: method is_decode_only (line 910) | def is_decode_only(self) -> bool: method using_cuda_graph_this_step (line 916) | def using_cuda_graph_this_step(self) -> bool: method has_unfinished_requests (line 920) | def has_unfinished_requests(self) -> bool: method cu_query_lengths (line 924) | def cu_query_lengths(self) -> Tuple[Tensor, int]: method cu_kv_lengths (line 932) | def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]: method get_active_sequence_lengths (line 941) | def get_active_sequence_lengths(self) -> Tensor: method get_max_sequence_lengths (line 947) | def get_max_sequence_lengths(self) -> Tensor: method get_active_request_count (line 951) | def get_active_request_count(self): method append_key_value_cache (line 955) | def append_key_value_cache(self, layer_number: int, key: Tensor, value... method key_value_cache (line 1004) | def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Optional... method mamba_states_cache (line 1031) | def mamba_states_cache( method _allocate_mamba_cache (line 1051) | def _allocate_mamba_cache(self, mamba_gb: float) -> None: method apply_fused_qk_rotary_emb (line 1093) | def apply_fused_qk_rotary_emb( method apply_rotary_emb_query (line 1125) | def apply_rotary_emb_query( method apply_rotary_emb_key (line 1159) | def apply_rotary_emb_key( method reset_attention_state (line 1198) | def reset_attention_state(self) -> None: method reset_mamba_state (line 1210) | def reset_mamba_state(self) -> None: method add_dummy_requests_parallel (line 1215) | def add_dummy_requests_parallel( method add_dummy_requests_for_cudagraph_capture (line 1354) | def add_dummy_requests_for_cudagraph_capture( method num_decode_requests (line 1414) | def num_decode_requests(self) -> int: method add_dummy_requests_for_expert_parallel_step (line 1420) | def add_dummy_requests_for_expert_parallel_step(self) -> None: method initialize_attention_state (line 1476) | def initialize_attention_state( method reset_tensors (line 1648) | def reset_tensors(self) -> None: method reset_metadata (line 1674) | def reset_metadata(self) -> None: method reset (line 1717) | def reset(self) -> None: method current_input_and_position_ids (line 1736) | def current_input_and_position_ids( method last_token_logits (line 1758) | def last_token_logits(self, logits: Tensor) -> Tensor: method _compute_prefix_match (line 1781) | def _compute_prefix_match( method check_availability (line 1860) | def check_availability(self, req: DynamicInferenceRequest) -> Tuple[bo... method _find_kv_match_count (line 1880) | def _find_kv_match_count( method add_request (line 1927) | def add_request( method _move_book_keeping_tensors (line 2135) | def _move_book_keeping_tensors( method _swap_book_keeping_tensors (line 2164) | def _swap_book_keeping_tensors( method get_index_of_chunked_prefill_request (line 2194) | def get_index_of_chunked_prefill_request(self, safe: bool = True) -> int: method is_chunked_prefill_enabled (line 2216) | def is_chunked_prefill_enabled(self) -> bool: method release_memory_blocks_from_request_indexes (line 2222) | def release_memory_blocks_from_request_indexes(self, request_indexes) ... method resume_paused_requests (line 2254) | def resume_paused_requests( method evict_overflow_paused_requests (line 2338) | def evict_overflow_paused_requests( method update_requests (line 2453) | def update_requests( method calculate_log_probs (line 2902) | def calculate_log_probs( method get_kvcache_utilization_stats (line 2974) | def get_kvcache_utilization_stats(self) -> dict: FILE: megatron/core/inference/contexts/fused_kv_append_kernel.py function _append_kv_cache_kernel (line 22) | def _append_kv_cache_kernel( function triton_append_key_value_cache (line 93) | def triton_append_key_value_cache( FILE: megatron/core/inference/contexts/kv_block_allocator.py class KVBlockAllocator (line 12) | class KVBlockAllocator: method __init__ (line 27) | def __init__( method __str__ (line 76) | def __str__(self): method get_total_used (line 83) | def get_total_used(self): method get_active_used (line 87) | def get_active_used(self): method get_paused_used (line 107) | def get_paused_used(self): method get_active_avail (line 123) | def get_active_avail(self): method get_paused_avail (line 127) | def get_paused_avail(self): method is_memory_available (line 131) | def is_memory_available(self, num_blocks: int) -> bool: method allocate_memory_blocks (line 153) | def allocate_memory_blocks(self, num_blocks: int) -> Optional[Tensor]: method release_memory_blocks (line 188) | def release_memory_blocks(self, blocks: Tensor) -> None: method reset (line 227) | def reset(self) -> None: method register_kv_block_hashes (line 262) | def register_kv_block_hashes(self, block_ids: list[int], block_hashes:... method _deregister_blocks (line 276) | def _deregister_blocks(self, block_ids: Tensor) -> None: method update_timestamps (line 313) | def update_timestamps(self, block_ids: Tensor) -> None: method get_evictable_block_count (line 326) | def get_evictable_block_count(self) -> Tensor: method evict_lru_blocks (line 335) | def evict_lru_blocks(self, num_blocks_needed: int) -> bool: FILE: megatron/core/inference/contexts/mamba_slot_allocator.py class MambaSlotAllocator (line 14) | class MambaSlotAllocator: method __init__ (line 31) | def __init__( method allocate_slot (line 79) | def allocate_slot(self, block_id: int) -> int: method _evict_lru_slot (line 104) | def _evict_lru_slot(self) -> int: method get_slot (line 138) | def get_slot(self, block_id: int) -> int: method has_state (line 149) | def has_state(self, block_id: int) -> bool: method invalidate_block (line 153) | def invalidate_block(self, block_id: int) -> None: method store_from_tensors (line 174) | def store_from_tensors( method store_from_live (line 190) | def store_from_live(self, block_id: int, request_idx: int) -> None: method restore_to_live (line 206) | def restore_to_live(self, request_idx: int, block_id: int) -> bool: method register_block_hash (line 228) | def register_block_hash(self, block_id: int, block_hash: int) -> None: method on_kv_blocks_deregistered (line 241) | def on_kv_blocks_deregistered(self, block_ids_list: list, hashes_to_de... method compute_and_store_offsets (line 263) | def compute_and_store_offsets( method get_intermediate_offsets (line 332) | def get_intermediate_offsets(self) -> Optional[List[List[int]]]: method buffer_intermediate_states (line 361) | def buffer_intermediate_states( method commit_intermediate_states (line 373) | def commit_intermediate_states(self) -> None: method _clear_intermediate_state (line 426) | def _clear_intermediate_state(self) -> None: method reset (line 444) | def reset(self) -> None: FILE: megatron/core/inference/contexts/routing_metadata.py class RoutingMetadata (line 13) | class RoutingMetadata: method __init__ (line 25) | def __init__(self, context: 'DynamicInferenceContext', moe_router_topk... method _ensure_buffer_allocated (line 36) | def _ensure_buffer_allocated(self) -> None: method get_routing_indices (line 57) | def get_routing_indices(self) -> Optional[torch.Tensor]: method enable_static_buffer_recording (line 83) | def enable_static_buffer_recording(self) -> None: method disable_static_buffer_recording (line 94) | def disable_static_buffer_recording(self) -> None: FILE: megatron/core/inference/contexts/static_context.py class StaticInferenceContext (line 8) | class StaticInferenceContext(BaseInferenceContext): method __init__ (line 17) | def __init__( method swap_key_value_dict (line 29) | def swap_key_value_dict(self, batch_idx): method enable_prefill_mode (line 46) | def enable_prefill_mode(self): method enable_decode_mode (line 54) | def enable_decode_mode(self): method is_decode_only (line 62) | def is_decode_only(self): method reset (line 66) | def reset(self): method __str__ (line 72) | def __str__(self): method __eq__ (line 83) | def __eq__(self, other): method is_static_batching (line 121) | def is_static_batching(self): FILE: megatron/core/inference/data_parallel_inference_coordinator.py class DataParallelInferenceCoordinator (line 43) | class DataParallelInferenceCoordinator: class CoordinatorState (line 77) | class CoordinatorState(Enum): method __init__ (line 85) | def __init__( method get_next_data_parallel_rank (line 203) | def get_next_data_parallel_rank(self): method _remove_engine (line 217) | def _remove_engine(self, identity): method _send_to_engine (line 226) | def _send_to_engine(self, identity, payload): method compute_request_hashes (line 241) | def compute_request_hashes(self, prompt): method get_best_data_parallel_rank (line 259) | def get_best_data_parallel_rank(self, request_hashes): method _update_rank_hashes (line 292) | def _update_rank_hashes(self, rank_identity, request_hashes): method start (line 304) | def start(self): method detokenize (line 497) | def detokenize(self, finished_request): method entrypoint (line 522) | def entrypoint( method stop (line 574) | def stop(self): FILE: megatron/core/inference/engines/abstract_engine.py class AbstractEngine (line 6) | class AbstractEngine(ABC): method generate (line 9) | def generate(self) -> dict: FILE: megatron/core/inference/engines/async_zmq_communicator.py class AsyncZMQCommunicator (line 20) | class AsyncZMQCommunicator: method __init__ (line 29) | def __init__(self, zmq_context: zmq.Context, process_group: dist.Proce... method all_reduce_max (line 68) | async def all_reduce_max(self, *local_vals: int, async_op=True) -> int... method close (line 127) | def close(self): FILE: megatron/core/inference/engines/dynamic_engine.py class EngineState (line 114) | class EngineState(Enum): class EngineSuspendedError (line 129) | class EngineSuspendedError(Exception): function format_mem_bytes (line 135) | def format_mem_bytes(mem_bytes): class RequestEntry (line 145) | class RequestEntry: class DynamicInferenceEngine (line 154) | class DynamicInferenceEngine(AbstractEngine): method __init__ (line 185) | def __init__(self, controller: TextGenerationController, context: Dyna... method reset (line 266) | def reset(self) -> None: method wait_until (line 314) | async def wait_until(self, state: EngineState): method create_cuda_graphs (line 326) | def create_cuda_graphs(self, reset_context: bool = True): method start_listening_to_data_parallel_coordinator (line 424) | async def start_listening_to_data_parallel_coordinator( method suspend_resume_ctx (line 627) | def suspend_resume_ctx(key: str, *, unified_memory_level: int) -> None: method suspend (line 688) | def suspend(self): method resume (line 737) | def resume(self): method _notify_cond_for_new_request (line 800) | async def _notify_cond_for_new_request(self): method _handle_failed_request (line 805) | def _handle_failed_request(self, request_id: int): method has_unfinished_requests (line 846) | def has_unfinished_requests(self) -> bool: method get_request (line 850) | def get_request(self, request_id: int) -> DynamicInferenceRequest: method _add_request (line 861) | def _add_request( method add_request (line 950) | def add_request( method post_process_requests (line 1009) | def post_process_requests( method _get_and_clear_stop_word_finished_ids (line 1299) | def _get_and_clear_stop_word_finished_ids(self, active_request_ids: li... method _check_stop_words_for_request_post_append (line 1322) | def _check_stop_words_for_request_post_append( method get_prefix_coordination_metrics (line 1370) | def get_prefix_coordination_metrics(self) -> dict: method _find_mamba_match_count (line 1378) | def _find_mamba_match_count(self, req: DynamicInferenceRequest) -> int: method schedule_waiting_requests (line 1394) | def schedule_waiting_requests(self): method schedule_non_chunked_prefill (line 1411) | def schedule_non_chunked_prefill(self): method schedule_chunked_prefill (line 1466) | def schedule_chunked_prefill(self): method async_forward (line 1594) | async def async_forward(self) -> Tuple[Dict, Dict, float]: method async_bookkeep (line 1666) | async def async_bookkeep( method async_step (line 1898) | async def async_step( method _run_coroutine_sync (line 1917) | def _run_coroutine_sync(self, coro): method step_modern (line 1935) | def step_modern( method step_legacy (line 1941) | def step_legacy( method generate (line 1959) | def generate( method schedule_requests (line 1978) | def schedule_requests(self) -> int: method shutdown (line 2123) | async def shutdown(self): method run_engine (line 2157) | async def run_engine(self, *, loop: Optional[asyncio.AbstractEventLoop... method _ep_establish_consensus (line 2178) | async def _ep_establish_consensus( method _world_barrier (line 2227) | async def _world_barrier(self): method run_engine_with_coordinator (line 2244) | async def run_engine_with_coordinator( FILE: megatron/core/inference/engines/static_engine.py class StaticInferenceEngine (line 35) | class StaticInferenceEngine(AbstractEngine): method __init__ (line 50) | def __init__( method get_new_request_id (line 132) | def get_new_request_id(self) -> str: method add_request (line 136) | def add_request( method get_stream_generator (line 192) | def get_stream_generator( method generate_using_dynamic_engine (line 202) | def generate_using_dynamic_engine( method generate_using_legacy_static_engine (line 250) | def generate_using_legacy_static_engine( method generate (line 305) | def generate( method run_engine (line 351) | def run_engine(self): method _wrapped_run_engine (line 389) | def _wrapped_run_engine(self, cuda_device): method run_engine_async (line 399) | async def run_engine_async(self, loop: Optional[asyncio.AbstractEventL... FILE: megatron/core/inference/headers.py class Headers (line 6) | class Headers(Enum): class UnknownHeaderError (line 25) | class UnknownHeaderError(Exception): method __init__ (line 28) | def __init__(self, header): FILE: megatron/core/inference/inference_client.py class InferenceClient (line 29) | class InferenceClient: method __init__ (line 54) | def __init__(self, inference_coordinator_address: str, deserialize: bo... method add_request (line 87) | def add_request( method _recv_task (line 119) | async def _recv_task(self): method _connect_with_inference_coordinator (line 154) | def _connect_with_inference_coordinator(self): method start (line 166) | def start(self, loop: Optional[asyncio.AbstractEventLoop] = None): method _send_signal_to_engines (line 179) | def _send_signal_to_engines(self, signal, *args): method pause_engines (line 191) | def pause_engines(self): method unpause_engines (line 200) | def unpause_engines(self) -> None: method set_generation_epoch (line 204) | def set_generation_epoch(self, generation_epoch: int): method suspend_engines (line 212) | def suspend_engines(self): method resume_engines (line 219) | def resume_engines(self): method stop_engines (line 226) | def stop_engines(self): method shutdown_coordinator (line 234) | def shutdown_coordinator(self): method stop (line 241) | def stop(self): FILE: megatron/core/inference/inference_request.py function serialize_tensor (line 18) | def serialize_tensor(tensor: torch.Tensor) -> List: function deserialize_tensor (line 36) | def deserialize_tensor(tensor_as_list: List) -> torch.Tensor: function unwrap_serialized_tensors (line 49) | def unwrap_serialized_tensors(serialized_request: dict) -> dict: class Status (line 65) | class Status(Enum): function compute_block_hashes_batched (line 88) | def compute_block_hashes_batched(prompt_tokens: torch.Tensor, block_size... class InferenceRequest (line 131) | class InferenceRequest: method __post_init__ (line 158) | def __post_init__(self): method serialize (line 166) | def serialize(self) -> dict: method deserialize (line 191) | def deserialize(cls, obj: dict) -> "InferenceRequest": method _post_deserialize (line 206) | def _post_deserialize(self, obj: dict): class DynamicInferenceEventType (line 230) | class DynamicInferenceEventType(Enum): class DynamicInferenceEvent (line 245) | class DynamicInferenceEvent: method __post_init__ (line 263) | def __post_init__(self): method __str__ (line 287) | def __str__(self): method serialize (line 296) | def serialize(self) -> dict: method deserialize (line 322) | def deserialize(cls, obj: dict) -> "DynamicInferenceEvent": class DynamicInferenceRequest (line 349) | class DynamicInferenceRequest(InferenceRequest): method __post_init__ (line 377) | def __post_init__(self): method _compute_block_hashes (line 391) | def _compute_block_hashes(self) -> None: method remaining_prompt_length (line 403) | def remaining_prompt_length(self): method __str__ (line 414) | def __str__(self): method serialize (line 425) | def serialize(self): method _post_deserialize (line 450) | def _post_deserialize(self, obj): method tracked_metadata (line 455) | def tracked_metadata(self) -> List[Any]: method get_metadata_types (line 475) | def get_metadata_types() -> List[Tuple[str, torch.dtype, bool]]: method add_event (line 494) | def add_event( method add_event_add_engine (line 502) | def add_event_add_engine(self): method add_event_add_context (line 507) | def add_event_add_context(self): method add_event_generated_token (line 511) | def add_event_generated_token( method add_event_pause (line 547) | def add_event_pause(self): method add_event_evict (line 551) | def add_event_evict(self): method add_event_finish (line 555) | def add_event_finish(self): method add_event_fail (line 559) | def add_event_fail(self): method add_event_error_transient (line 563) | def add_event_error_transient(self, error: Exception): method add_event_error_nontransient (line 567) | def add_event_error_nontransient(self, error: Exception): method succeeded (line 571) | def succeeded(self) -> bool: method failed (line 575) | def failed(self) -> bool: class DynamicInferenceRequestRecord (line 581) | class DynamicInferenceRequestRecord: method from_request (line 589) | def from_request(cls, request: DynamicInferenceRequest) -> "DynamicInf... method __getitem__ (line 602) | def __getitem__(self, idx: int) -> DynamicInferenceRequest: method request_id (line 614) | def request_id(self) -> int: method checkpoint (line 622) | def checkpoint(self, tokenizer: MegatronTokenizer | None = None): method merge (line 679) | def merge(self, tokenizer: MegatronTokenizer | None = None) -> Dynamic... method serialize (line 737) | def serialize(self) -> dict: method deserialize (line 751) | def deserialize(cls, obj: dict) -> "DynamicInferenceRequestRecord": class VLMInferenceRequest (line 766) | class VLMInferenceRequest(InferenceRequest): FILE: megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py class AbstractModelInferenceWrapper (line 23) | class AbstractModelInferenceWrapper(abc.ABC): method __init__ (line 39) | def __init__( method prep_model_for_inference (line 70) | def prep_model_for_inference(self): method prep_inference_input (line 87) | def prep_inference_input(self, prompt_tokens) -> Dict[str, Any]: method get_batch_for_context_window (line 99) | def get_batch_for_context_window(self, *args, **kwargs) -> Dict[str, A... method _forward (line 109) | def _forward(self, inference_input): method dummy_forward (line 130) | def dummy_forward(self): method _get_batch_size_and_seq_len (line 157) | def _get_batch_size_and_seq_len( method _allocate_recv_buffer (line 176) | def _allocate_recv_buffer(self, batch_size, seq_len): method forward_pass_without_pipeline_parallel (line 188) | def forward_pass_without_pipeline_parallel( method forward_pass_with_pipeline_parallel (line 209) | def forward_pass_with_pipeline_parallel( method run_one_forward_step (line 256) | def run_one_forward_step( FILE: megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py class GPTInferenceWrapper (line 19) | class GPTInferenceWrapper(AbstractModelInferenceWrapper): method __init__ (line 31) | def __init__(self, model: GPTModel, inference_context: Optional[BaseIn... method prep_inference_input (line 34) | def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[s... method _build_attention_mask_and_position_ids (line 54) | def _build_attention_mask_and_position_ids( method get_batch_for_context_window (line 91) | def get_batch_for_context_window( FILE: megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py class VLMInferenceWrapper (line 18) | class VLMInferenceWrapper(GPTInferenceWrapper): method prep_model_for_inference (line 21) | def prep_model_for_inference(self, prompts_tokens: Optional[torch.Tens... method prep_inference_input (line 55) | def prep_inference_input( method get_batch_for_context_window (line 89) | def get_batch_for_context_window( method _forward (line 126) | def _forward(self, inference_input: Dict[str, Any]): method run_one_forward_step (line 155) | def run_one_forward_step(self, inference_input: Dict[str, Any]) -> tor... FILE: megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py class T5InferenceWrapper (line 19) | class T5InferenceWrapper(AbstractModelInferenceWrapper): method __init__ (line 33) | def __init__( method prep_inference_input (line 42) | def prep_inference_input( method tokenize_encoder_prompt (line 93) | def tokenize_encoder_prompt(self, encoder_prompt: str, tokenizer) -> t... method pad_encoder_prompts_tokens (line 121) | def pad_encoder_prompts_tokens( method get_batch_for_context_window (line 144) | def get_batch_for_context_window( method forward_pass_without_pipeline_parallel (line 192) | def forward_pass_without_pipeline_parallel( FILE: megatron/core/inference/moe/__init__.py class InferenceGroupedGemmBackend (line 10) | class InferenceGroupedGemmBackend(enum.Enum): function resolve_inference_grouped_gemm_backend (line 18) | def resolve_inference_grouped_gemm_backend( FILE: megatron/core/inference/moe/activations.py function _ceil_div (line 28) | def _ceil_div(a, b): function _squared_relu_kernel (line 33) | def _squared_relu_kernel(input_ptr, output_ptr, src_idx_ptr, M, N, BLOCK... function padded_squared_relu (line 46) | def padded_squared_relu(x: torch.Tensor, permutation_map: torch.Tensor) ... function _squared_relu_quantize_kernel (line 56) | def _squared_relu_quantize_kernel( function squared_relu_and_quantize_mxfp8 (line 121) | def squared_relu_and_quantize_mxfp8( FILE: megatron/core/inference/moe/fused_moe.py class ActivationType (line 40) | class ActivationType(Enum): function _bf16_grouped_mm (line 46) | def _bf16_grouped_mm( function _mxfp8_grouped_mm (line 54) | def _mxfp8_grouped_mm(act: MXFP8Tensor, weight: MXFP8Tensor, offs: torch... function _get_activation_func (line 70) | def _get_activation_func(activation_type: ActivationType, fused_quant: b... function mcore_fused_moe (line 81) | def mcore_fused_moe( FILE: megatron/core/inference/moe/pad.py function _pad_tokens_kernel (line 37) | def _pad_tokens_kernel( function pad_to_alignment (line 92) | def pad_to_alignment( function _unpad_tokens_kernel (line 140) | def _unpad_tokens_kernel( function unpad_from_alignment (line 168) | def unpad_from_alignment( FILE: megatron/core/inference/moe/permute.py function _ceil_div (line 31) | def _ceil_div(a, b): function _count_local_tokens_kernel (line 36) | def _count_local_tokens_kernel( function compute_local_tokens_per_expert (line 60) | def compute_local_tokens_per_expert( function _prefix_sum_kernel (line 79) | def _prefix_sum_kernel( function compute_expert_offsets (line 104) | def compute_expert_offsets(tokens_per_expert: torch.Tensor, alignment: i... function _permute_tokens_kernel (line 121) | def _permute_tokens_kernel( function permute_tokens (line 170) | def permute_tokens( function _unpermute_tokens_kernel (line 243) | def _unpermute_tokens_kernel( function unpermute_tokens (line 271) | def unpermute_tokens( function _permute_quantize_mxfp8_kernel (line 295) | def _permute_quantize_mxfp8_kernel( function permute_and_quantize_mxfp8 (line 375) | def permute_and_quantize_mxfp8( FILE: megatron/core/inference/quantization/mxfp8_quantize.py function _ceil_div (line 33) | def _ceil_div(a, b): function _mxfp8_quant_swizzle_kernel (line 38) | def _mxfp8_quant_swizzle_kernel( function mxfp8_quantize (line 160) | def mxfp8_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: FILE: megatron/core/inference/quantization/mxfp8_tensor.py function _ceil_div (line 20) | def _ceil_div(a, b): class MXFP8Tensor (line 25) | class MXFP8Tensor: method size (line 32) | def size(self, idx: Optional[int] = None): method scale_2d (line 36) | def scale_2d(self, K: Optional[int] = None) -> torch.Tensor: method from_bf16 (line 51) | def from_bf16(cls, x: torch.Tensor, group_size: int = 32, backend: str... FILE: megatron/core/inference/quantization/utils.py function _verify_te_to_mcore_mxfp8_conversion (line 32) | def _verify_te_to_mcore_mxfp8_conversion(te_dequantized, fi_quantized: M... function quantize_model_to_mxfp8 (line 53) | def quantize_model_to_mxfp8(model: torch.nn.Module, backend: str = "flas... function _should_quantize_param (line 97) | def _should_quantize_param(val: torch.Tensor) -> bool: function _to_bf16 (line 114) | def _to_bf16(val: torch.Tensor) -> torch.Tensor: function collect_mxfp8_param_metadata (line 123) | def collect_mxfp8_param_metadata( function quantize_params_to_mxfp8 (line 142) | def quantize_params_to_mxfp8( function _mm_mxfp8_flashinfer (line 215) | def _mm_mxfp8_flashinfer(x_mxfp8: MXFP8Tensor, weight: MXFP8Tensor, out=... function _mm_mxfp8_torch (line 222) | def _mm_mxfp8_torch(x_mxfp8: MXFP8Tensor, weight: MXFP8Tensor, out=None): function mm_mxfp8 (line 241) | def mm_mxfp8(x: torch.Tensor, weight: MXFP8Tensor, out: torch.Tensor = N... FILE: megatron/core/inference/sampling_params.py class SamplingParams (line 9) | class SamplingParams: method __post_init__ (line 38) | def __post_init__(self): method _sync_prompt_logprobs_fields (line 46) | def _sync_prompt_logprobs_fields(self): method add_attributes (line 62) | def add_attributes(self, attribute_value_pair: dict): method serialize (line 79) | def serialize(self) -> dict: method deserialize (line 84) | def deserialize(cls, data: dict) -> "SamplingParams": FILE: megatron/core/inference/scheduler.py class Scheduler (line 17) | class Scheduler: method __init__ (line 28) | def __init__(self, max_batch_size): method get_new_request_id (line 37) | def get_new_request_id(self) -> int: method add_request (line 42) | def add_request( method num_requests_pending (line 124) | def num_requests_pending(self) -> int: method have_requests_pending (line 131) | def have_requests_pending(self) -> bool: method add_earliest_waiting_request_to_active_pool (line 138) | def add_earliest_waiting_request_to_active_pool(self): method update_requests_pools (line 154) | def update_requests_pools( method abort_request (line 184) | def abort_request( FILE: megatron/core/inference/symmetric_memory.py class SymmetricMemoryBuffer (line 34) | class SymmetricMemoryBuffer: method __init__ (line 41) | def __init__(self, size_in_mb, process_group): method _can_allocate (line 60) | def _can_allocate(self, numel, dtype) -> bool: method _allocate (line 71) | def _allocate(self, numel, dtype) -> torch.Tensor: method maybe_get_tensors (line 77) | def maybe_get_tensors(self, tensor_specs, alignment=16): method maybe_get_tensor (line 115) | def maybe_get_tensor(self, tensor_shape, dtype): class SymmetricMemoryManager (line 131) | class SymmetricMemoryManager: method get_buffer (line 144) | def get_buffer( method destroy (line 168) | def destroy(cls, key: Optional[str] = None) -> None: method is_initialized (line 180) | def is_initialized(cls, key: str) -> bool: FILE: megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py class EncoderDecoderTextGenerationController (line 13) | class EncoderDecoderTextGenerationController(TextGenerationController): method prep_inference_input (line 21) | def prep_inference_input( FILE: megatron/core/inference/text_generation_controllers/text_generation_controller.py class TextGenerationController (line 49) | class TextGenerationController: method __init__ (line 60) | def __init__(self, inference_wrapped_model: AbstractModelInferenceWrap... method _get_mtp_num_heads (line 90) | def _get_mtp_num_heads(self) -> int: method set_stop_word_finished_ids_callback (line 97) | def set_stop_word_finished_ids_callback(self, callback): method _init_dynamic_sampling_tensors (line 108) | def _init_dynamic_sampling_tensors(self): method _init_mtp_sampling_tensor (line 144) | def _init_mtp_sampling_tensor(self): method tokenize_prompt (line 161) | def tokenize_prompt(tokenizer, prompt: str, add_BOS: bool = False) -> ... method detokenize (line 187) | def detokenize( method detokenize_generations (line 219) | def detokenize_generations( method _torch_sampling_func (line 272) | def _torch_sampling_func( method sample_from_logits (line 361) | def sample_from_logits( method update_generation_status (line 449) | def update_generation_status( method pad_input_prompt_tokens (line 495) | def pad_input_prompt_tokens( method unpad_input_prompt_tokens (line 531) | def unpad_input_prompt_tokens( method _dynamic_step_context_init (line 542) | def _dynamic_step_context_init( method _dynamic_step_forward_logits (line 618) | def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids... method _dynamic_step_sample_bookkeeping (line 660) | def _dynamic_step_sample_bookkeeping(self): method _rewind_kv_cache (line 684) | def _rewind_kv_cache(self): method _sample_from_logits_2d (line 793) | def _sample_from_logits_2d(self, logits_2d: Tensor) -> Tensor: method _compute_serial_mtp_and_sample (line 818) | def _compute_serial_mtp_and_sample(self): method _get_required_logit_indices (line 893) | def _get_required_logit_indices( method _sample_speculative_logits (line 934) | def _sample_speculative_logits( method _verify_speculative_tokens (line 991) | def _verify_speculative_tokens( method _dynamic_step_sample_logits_and_verify_tokens (line 1075) | def _dynamic_step_sample_logits_and_verify_tokens(self, logits: Tensor... method _dynamic_step_sample_logits (line 1156) | def _dynamic_step_sample_logits(self, logits: Tensor): method _dynamic_step_log_probs_bookkeeping (line 1197) | def _dynamic_step_log_probs_bookkeeping(self) -> Tuple[bool, bool]: method _router_record_bookkeeping (line 1211) | def _router_record_bookkeeping(self) -> Optional[Dict[int, Tensor]]: method _dynamic_step_calculate_log_probs (line 1271) | def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optiona... method _dynamic_step_calculate_log_probs_speculative (line 1282) | def _dynamic_step_calculate_log_probs_speculative( method _dynamic_step_calculate_top_n_logprobs_speculative (line 1380) | def _dynamic_step_calculate_top_n_logprobs_speculative( method _dynamic_step_calculate_top_n_logprobs (line 1477) | def _dynamic_step_calculate_top_n_logprobs( method dummy_forward (line 1561) | def dummy_forward(self): method _dummy_serial_mtp_forward (line 1609) | def _dummy_serial_mtp_forward(self): method _dynamic_step_context_bookkeeping (line 1666) | def _dynamic_step_context_bookkeeping(self) -> Dict[str, Tensor]: method async_generate_output_tokens_dynamic_batch (line 1739) | async def async_generate_output_tokens_dynamic_batch( method generate_output_tokens_dynamic_batch (line 1860) | def generate_output_tokens_dynamic_batch( method _update_top_n_logprobs_dict (line 1867) | def _update_top_n_logprobs_dict( method generate_all_output_tokens_static_batch (line 1899) | def generate_all_output_tokens_static_batch( method prep_inference_input (line 2367) | def prep_inference_input( method stream_tokens (line 2393) | def stream_tokens( FILE: megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py class VLMTextGenerationController (line 13) | class VLMTextGenerationController(TextGenerationController): method prep_inference_input (line 16) | def prep_inference_input( FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/chat_completions.py function _get_field (line 20) | def _get_field(obj, key, default=None): function _normalize_tool_calls (line 27) | def _normalize_tool_calls(tool_calls): function _coerce_arguments_mapping (line 51) | def _coerce_arguments_mapping(arguments): function _sanitize_messages_for_template (line 72) | def _sanitize_messages_for_template(messages): function _sanitize_tools_for_template (line 116) | def _sanitize_tools_for_template(tools): function _reconstruct_reasoning_content (line 143) | def _reconstruct_reasoning_content(messages: list[dict]) -> list[dict]: function _replace_prefix_tokens (line 159) | def _replace_prefix_tokens( function apply_parsers (line 199) | def apply_parsers(message_text, tools, parsers_list, tools_requested): function chat_completions (line 225) | async def chat_completions(): FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/common.py function send_do_generate (line 11) | def send_do_generate(): FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/completions.py function completions (line 20) | async def completions(): FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/health.py function health (line 14) | async def health(): FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/text_generation_server.py function temp_log_level (line 31) | def temp_log_level(level, logger=None): function _run_text_gen_server (line 43) | async def _run_text_gen_server( function _server_process_worker (line 112) | def _server_process_worker( function start_text_gen_server (line 141) | def start_text_gen_server( function stop_text_gen_server (line 184) | def stop_text_gen_server(): FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/tokenization.py function tokenize_prompts (line 12) | def tokenize_prompts( function _tokenize_prompts_and_batch (line 70) | def _tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, ... FILE: megatron/core/inference/text_generation_server/endpoints/common.py function send_do_generate (line 11) | def send_do_generate(): FILE: megatron/core/inference/text_generation_server/endpoints/completions.py function detokenize (line 24) | def detokenize(prompt, tok) -> list[str]: class MegatronCompletions (line 46) | class MegatronCompletions(Resource): method __init__ (line 49) | def __init__(self, engine, args): method post (line 53) | def post(self): FILE: megatron/core/inference/text_generation_server/run_mcore_engine.py function run_mcore_engine (line 12) | def run_mcore_engine( FILE: megatron/core/inference/text_generation_server/text_generation_server.py class MegatronGenerate (line 27) | class MegatronGenerate(Resource): method __init__ (line 30) | def __init__(self, engine, args): method put (line 35) | def put(self): class MegatronServer (line 192) | class MegatronServer(object): method __init__ (line 195) | def __init__(self, model, args=None): method run (line 204) | def run(self, url, port): FILE: megatron/core/inference/text_generation_server/tokenization.py function tokenize_prompts (line 12) | def tokenize_prompts( function _tokenize_prompts_and_batch (line 70) | def _tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, ... FILE: megatron/core/inference/unified_memory.py class CompilationState (line 28) | class CompilationState(Enum): class UnifiedMemoryUnsupportedError (line 36) | class UnifiedMemoryUnsupportedError(Exception): class UnifiedMemoryCompileTimeoutError (line 40) | class UnifiedMemoryCompileTimeoutError(UnifiedMemoryUnsupportedError): function _compile_timeout (line 55) | def _compile_timeout(timeout_s: int): function compile_allocator (line 87) | def compile_allocator(): function create_unified_mempool (line 260) | def create_unified_mempool() -> "MemPool": function _get_ctypes_lib (line 283) | def _get_ctypes_lib() -> "ctypes.CDLL": function prefetch_managed_tensor (line 317) | def prefetch_managed_tensor(tensor, *, device: int, stream=None) -> None: function advise_managed_tensor_preferred_location (line 352) | def advise_managed_tensor_preferred_location(tensor, *, device: int) -> ... function advise_managed_tensor_accessed_by (line 381) | def advise_managed_tensor_accessed_by(tensor, *, device: int) -> None: function prefetch_managed_module_parameters (line 410) | def prefetch_managed_module_parameters( function advise_managed_module_parameters_preferred_location (line 481) | def advise_managed_module_parameters_preferred_location( FILE: megatron/core/inference/utils.py function device_memory_summary (line 20) | def device_memory_summary() -> str: class Counter (line 43) | class Counter: method __init__ (line 49) | def __init__(self, start: int = 0) -> None: method __next__ (line 52) | def __next__(self) -> int: method reset (line 57) | def reset(self) -> None: function get_attention_mask (line 62) | def get_attention_mask(seq_length: int) -> torch.Tensor: function _init_moe_expert_cache (line 78) | def _init_moe_expert_cache(model): function set_decode_expert_padding (line 103) | def set_decode_expert_padding(model, set_to: bool = False, capacity_fact... function check_flashinfer_jit_cache_installed (line 165) | def check_flashinfer_jit_cache_installed(log_version: bool = False): function set_inference_cuda_graphed_iteration_for_ep_inference (line 204) | def set_inference_cuda_graphed_iteration_for_ep_inference(model): function unset_inference_cuda_graphed_iteration_for_ep_inference (line 219) | def unset_inference_cuda_graphed_iteration_for_ep_inference(model): function tensor_swap (line 232) | def tensor_swap(x, src_idxs, dst_idxs): function await_process_call (line 239) | async def await_process_call(call, process: multiprocessing.Process, tim... class asyncio_QueueShutDown (line 267) | class asyncio_QueueShutDown(Exception): class asyncio_Queue (line 272) | class asyncio_Queue(asyncio.Queue): method __init__ (line 275) | def __init__(self, maxsize: int = 0): method get (line 279) | async def get(self): method put_nowait (line 290) | def put_nowait(self, item): method shutdown (line 298) | def shutdown(self): FILE: megatron/core/jit.py function noop_decorator (line 11) | def noop_decorator(func): function enable_jit_fuser (line 16) | def enable_jit_fuser(): function disable_jit_fuser (line 27) | def disable_jit_fuser(): FILE: megatron/core/model_parallel_config.py class ModelParallelConfig (line 11) | class ModelParallelConfig: method __post_init__ (line 401) | def __post_init__(self): FILE: megatron/core/models/T5/t5_model.py class T5LMHead (line 27) | class T5LMHead(MegatronModule): method __init__ (line 39) | def __init__( method forward (line 71) | def forward(self, hidden_states: Tensor, word_embeddings_weight: Tenso... class T5Model (line 86) | class T5Model(LanguageModule): method __init__ (line 136) | def __init__( method forward (line 279) | def forward( method set_input_tensor (line 441) | def set_input_tensor(self, input_tensor): method shared_embedding_or_output_weight (line 471) | def shared_embedding_or_output_weight(self) -> Tensor: method sharded_state_dict (line 480) | def sharded_state_dict( function t5_extended_attention_mask (line 504) | def t5_extended_attention_mask(attention_mask_list: List[Tensor]) -> Lis... function t5_position_ids (line 528) | def t5_position_ids(token_ids: Tensor) -> Tensor: FILE: megatron/core/models/T5/t5_spec.py function encoder_model_with_transformer_engine_default_spec (line 54) | def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec: function decoder_model_with_transformer_engine_default_spec (line 84) | def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec: function encoder_model_with_local_spec (line 126) | def encoder_model_with_local_spec() -> ModuleSpec: function decoder_model_with_local_spec (line 161) | def decoder_model_with_local_spec() -> ModuleSpec: function get_t5_encoder_with_transformer_engine_block_spec (line 208) | def get_t5_encoder_with_transformer_engine_block_spec( function get_t5_decoder_with_transformer_engine_block_spec (line 222) | def get_t5_decoder_with_transformer_engine_block_spec( function get_t5_encoder_with_local_block_spec (line 236) | def get_t5_encoder_with_local_block_spec(num_layers: int) -> Transformer... function get_t5_decoder_with_local_block_spec (line 248) | def get_t5_decoder_with_local_block_spec(num_layers: int) -> Transformer... FILE: megatron/core/models/backends.py class BackendSpecProvider (line 51) | class BackendSpecProvider(Protocol): method column_parallel_linear (line 55) | def column_parallel_linear(self) -> type: method row_parallel_linear (line 60) | def row_parallel_linear(self) -> type: method fuse_layernorm_and_linear (line 65) | def fuse_layernorm_and_linear(self) -> bool: method column_parallel_layer_norm_linear (line 70) | def column_parallel_layer_norm_linear(self) -> Optional[type]: method layer_norm (line 75) | def layer_norm( method core_attention (line 82) | def core_attention(self) -> type: method grouped_mlp_modules (line 87) | def grouped_mlp_modules( method activation_func (line 94) | def activation_func(self) -> TEActivationFunctionBuilder | None: class LocalSpecProvider (line 99) | class LocalSpecProvider(BackendSpecProvider): method column_parallel_linear (line 102) | def column_parallel_linear(self) -> type: method row_parallel_linear (line 106) | def row_parallel_linear(self) -> type: method fuse_layernorm_and_linear (line 110) | def fuse_layernorm_and_linear(self) -> bool: method column_parallel_layer_norm_linear (line 114) | def column_parallel_layer_norm_linear(self) -> Optional[type]: method layer_norm (line 118) | def layer_norm( method core_attention (line 129) | def core_attention(self) -> type: method grouped_mlp_modules (line 133) | def grouped_mlp_modules( method activation_func (line 141) | def activation_func(self) -> TEActivationFunctionBuilder | None: class InferenceSpecProvider (line 146) | class InferenceSpecProvider(BackendSpecProvider): method linear (line 149) | def linear(self) -> type: method column_parallel_linear (line 153) | def column_parallel_linear(self) -> type: method row_parallel_linear (line 157) | def row_parallel_linear(self) -> type: method fuse_layernorm_and_linear (line 161) | def fuse_layernorm_and_linear(self) -> bool: method column_parallel_layer_norm_linear (line 165) | def column_parallel_layer_norm_linear(self) -> type[InferenceLayerNorm... method layer_norm (line 169) | def layer_norm( method core_attention (line 180) | def core_attention(self) -> type[TEDotProductAttention]: method activation_func (line 184) | def activation_func(self) -> TEActivationFunctionBuilder | None: method grouped_mlp_modules (line 190) | def grouped_mlp_modules( FILE: megatron/core/models/bert/bert_layer_specs.py function get_bert_layer_with_transformer_engine_submodules (line 44) | def get_bert_layer_with_transformer_engine_submodules() -> TransformerLa... function get_bert_layer_with_transformer_engine_spec (line 80) | def get_bert_layer_with_transformer_engine_spec(): function __getattr__ (line 91) | def __getattr__(name): FILE: megatron/core/models/bert/bert_lm_head.py class BertLMHead (line 19) | class BertLMHead(MegatronModule): method __init__ (line 27) | def __init__(self, hidden_size: int, config: TransformerConfig): method forward (line 44) | def forward(self, hidden_states: Tensor) -> Tensor: FILE: megatron/core/models/bert/bert_model.py class BertModel (line 31) | class BertModel(LanguageModule): method __init__ (line 54) | def __init__( method _sanity_check_attention_and_get_attn_mask_dimension (line 164) | def _sanity_check_attention_and_get_attn_mask_dimension(self) -> str: method bert_extended_attention_mask (line 240) | def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor: method bert_position_ids (line 272) | def bert_position_ids(self, token_ids): method set_input_tensor (line 281) | def set_input_tensor(self, input_tensor: Tensor) -> None: method forward (line 297) | def forward( FILE: megatron/core/models/bert/pooler.py class Pooler (line 11) | class Pooler(MegatronModule): method __init__ (line 24) | def __init__( method forward (line 38) | def forward(self, hidden_states: Tensor, sequence_index=0): FILE: megatron/core/models/common/embeddings/language_model_embedding.py class LanguageModelEmbedding (line 14) | class LanguageModelEmbedding(MegatronModule): method __init__ (line 29) | def __init__( method zero_parameters (line 88) | def zero_parameters(self): method forward (line 99) | def forward(self, input_ids: Tensor, position_ids: Tensor, tokentype_i... FILE: megatron/core/models/common/embeddings/relative_pos_embedding.py class RelativePositionEmbedding (line 21) | class RelativePositionEmbedding(nn.Module): method __init__ (line 28) | def __init__( method _relative_position_bucket (line 46) | def _relative_position_bucket( method _compute_bias (line 100) | def _compute_bias(self, query_length, key_length): method get_relative_seq_len (line 138) | def get_relative_seq_len( method forward (line 175) | def forward(self, query_seq_length, key_seq_length): FILE: megatron/core/models/common/embeddings/rope_utils.py function get_pos_emb_on_this_cp_rank (line 48) | def get_pos_emb_on_this_cp_rank( function _rotate_half (line 73) | def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: function _apply_rotary_pos_emb_bshd (line 92) | def _apply_rotary_pos_emb_bshd( function _get_thd_freqs_on_this_cp_rank (line 129) | def _get_thd_freqs_on_this_cp_rank( function _apply_rotary_pos_emb_thd (line 178) | def _apply_rotary_pos_emb_thd( function apply_rotary_pos_emb (line 250) | def apply_rotary_pos_emb( function apply_rotary_pos_emb_with_cos_sin (line 319) | def apply_rotary_pos_emb_with_cos_sin( FILE: megatron/core/models/common/embeddings/rotary_pos_embedding.py class RotaryEmbedding (line 36) | class RotaryEmbedding(nn.Module): method __init__ (line 58) | def __init__( method _apply_scaling (line 92) | def _apply_scaling( method get_freqs_non_repeated (line 127) | def get_freqs_non_repeated(self, max_seq_len: int, offset: int = 0) ->... method get_cos_sin (line 142) | def get_cos_sin(self, max_seq_len: int, offset: int = 0) -> (Tensor, T... method get_emb (line 150) | def get_emb(self, max_seq_len: int, offset: int = 0) -> Tensor: method forward (line 179) | def forward( method _load_from_state_dict (line 208) | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): method get_rotary_seq_len (line 212) | def get_rotary_seq_len( class MultimodalRotaryEmbedding (line 266) | class MultimodalRotaryEmbedding(nn.Module): method __init__ (line 285) | def __init__( method forward (line 315) | def forward( FILE: megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py class YarnRotaryEmbedding (line 21) | class YarnRotaryEmbedding(RotaryEmbedding): method __init__ (line 49) | def __init__( method get_emb (line 106) | def get_emb(self, max_seq_len: int, offset: int = 0) -> Tensor: method forward (line 161) | def forward( method _set_cos_sin_cache (line 189) | def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq=False): method get_cached_cos_sin (line 203) | def get_cached_cos_sin( function _yarn_find_correction_dim (line 218) | def _yarn_find_correction_dim( function _yarn_find_correction_range (line 227) | def _yarn_find_correction_range( function _yarn_linear_ramp_mask (line 243) | def _yarn_linear_ramp_mask(min: float, max: float, dim: int, device: tor... function _yarn_get_mscale (line 252) | def _yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: function _yarn_get_concentration_factor (line 259) | def _yarn_get_concentration_factor( function _yarn_get_concentration_factor_from_config (line 274) | def _yarn_get_concentration_factor_from_config(config: TransformerConfig... FILE: megatron/core/models/common/language_module/language_module.py class LanguageModule (line 36) | class LanguageModule(MegatronModule): method __init__ (line 44) | def __init__( method _is_in_embd_group (line 66) | def _is_in_embd_group(self): method _set_attention_backend (line 93) | def _set_attention_backend(self): method compute_language_model_loss (line 129) | def compute_language_model_loss(self, labels: Tensor, logits: Tensor) ... method setup_embeddings_and_output_layer (line 174) | def setup_embeddings_and_output_layer(self) -> None: method _scale_logits (line 287) | def _scale_logits(self, logits: Tensor) -> Tensor: method shared_embedding_or_output_weight (line 306) | def shared_embedding_or_output_weight(self) -> Tensor: method sharded_state_dict (line 326) | def sharded_state_dict( method tie_embeddings_and_output_weights_state_dict (line 382) | def tie_embeddings_and_output_weights_state_dict( FILE: megatron/core/models/common/model_chunk_schedule_plan.py class ModelChunkState (line 19) | class ModelChunkState: class TransformerLayerSchedulePlan (line 29) | class TransformerLayerSchedulePlan: method __init__ (line 56) | def __init__(self, layer, event, chunk_state, comp_stream, comm_stream... method release_state (line 85) | def release_state(self): method _build_callable_nodes (line 108) | def _build_callable_nodes(self, event, comp_stream, comm_stream, extra... method get_fp8_context (line 175) | def get_fp8_context(self): method run (line 189) | def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_... class TransformerModelChunkSchedulePlan (line 256) | class TransformerModelChunkSchedulePlan(AbstractSchedulePlan): method __init__ (line 271) | def __init__( method _build_layer_schedule_plan (line 349) | def _build_layer_schedule_plan(self, module, comp_stream, comm_stream): method event (line 369) | def event(self): method record_current_stream (line 373) | def record_current_stream(self): method wait_current_stream (line 378) | def wait_current_stream(self): method get_layer (line 383) | def get_layer(self, i): method pop_layer (line 388) | def pop_layer(self): method num_layers (line 392) | def num_layers(self): method state (line 397) | def state(self): method release_state (line 401) | def release_state(self): method run (line 412) | def run( FILE: megatron/core/models/common/vision_module/vision_module.py class VisionModule (line 9) | class VisionModule(MegatronModule): method __init__ (line 16) | def __init__(self, config: TransformerConfig) -> None: FILE: megatron/core/models/gpt/experimental_attention_variant_module_specs.py function get_gated_delta_net_module_spec (line 56) | def get_gated_delta_net_module_spec( function get_dsa_module_spec_for_backend (line 77) | def get_dsa_module_spec_for_backend( function get_experimental_attention_variant_module_spec (line 131) | def get_experimental_attention_variant_module_spec( function get_transformer_block_with_experimental_attention_variant_spec (line 152) | def get_transformer_block_with_experimental_attention_variant_spec( function is_linear_attention_variant (line 285) | def is_linear_attention_variant(experimental_attention_variant: Optional... function get_moe_layer_pattern (line 291) | def get_moe_layer_pattern(config: TransformerConfig) -> List[int]: function get_linear_attention_pattern (line 316) | def get_linear_attention_pattern(config: TransformerConfig) -> List[int]: function _get_backend_spec_provider (line 353) | def _get_backend_spec_provider(config: TransformerConfig) -> BackendSpec... function _get_self_attention_module_spec (line 377) | def _get_self_attention_module_spec( function _get_dense_mlp_module_spec (line 411) | def _get_dense_mlp_module_spec( function _get_moe_module_spec (line 430) | def _get_moe_module_spec( FILE: megatron/core/models/gpt/fine_grained_callables.py function weak_method (line 29) | def weak_method(method): function should_free_input (line 47) | def should_free_input(name, is_moe, config, num_local_experts): class TransformerLayerState (line 102) | class TransformerLayerState: class PreProcessNode (line 112) | class PreProcessNode(ScheduleNode): method __init__ (line 119) | def __init__(self, gpt_model, chunk_state, event, stream): method forward_impl (line 132) | def forward_impl(self): class PostProcessNode (line 172) | class PostProcessNode(ScheduleNode): method __init__ (line 179) | def __init__(self, gpt_model, chunk_state, event, stream): method forward_impl (line 192) | def forward_impl(self, hidden_states): class TransformerLayerNode (line 237) | class TransformerLayerNode(ScheduleNode): method __init__ (line 244) | def __init__( method detach (line 302) | def detach(self, t): method forward_impl (line 310) | def forward_impl(self, *args): method backward_impl (line 314) | def backward_impl(self, outputs, output_grad): method backward_dw (line 329) | def backward_dw(self): method __del__ (line 348) | def __del__(self): class _BackwardDWWrapper (line 357) | class _BackwardDWWrapper: method __init__ (line 370) | def __init__(self, layer): method backward_dw (line 388) | def backward_dw(self): method set_graphed_backward_dw_callable (line 401) | def set_graphed_backward_dw_callable(self, graphed_backward_dw_callable): function build_transformer_layer_callables (line 406) | def build_transformer_layer_callables(layer: TransformerLayer): function build_mtp_layer_callables (line 638) | def build_mtp_layer_callables(layer): function build_layer_callables (line 721) | def build_layer_callables(layer): FILE: megatron/core/models/gpt/gpt_layer_specs.py function get_gpt_layer_with_inference_submodules (line 73) | def get_gpt_layer_with_inference_submodules( function get_gpt_layer_with_inference_spec (line 166) | def get_gpt_layer_with_inference_spec(*args, **kwargs) -> ModuleSpec: function get_gpt_layer_with_transformer_engine_submodules (line 173) | def get_gpt_layer_with_transformer_engine_submodules( function get_gpt_layer_with_transformer_engine_spec (line 343) | def get_gpt_layer_with_transformer_engine_spec(*args, **kwargs) -> Modul... function get_gpt_layer_local_submodules (line 351) | def get_gpt_layer_local_submodules( function get_gpt_layer_local_spec (line 459) | def get_gpt_layer_local_spec(*args, **kwargs) -> ModuleSpec: function _get_mlp_module_spec (line 466) | def _get_mlp_module_spec( function get_mlp_module_spec (line 482) | def get_mlp_module_spec( function get_mlp_module_spec_for_backend (line 513) | def get_mlp_module_spec_for_backend( function get_gpt_decoder_layer_specs (line 549) | def get_gpt_decoder_layer_specs( function get_gpt_decoder_block_spec (line 657) | def get_gpt_decoder_block_spec( function get_gpt_mtp_block_spec (line 700) | def get_gpt_mtp_block_spec( function get_gpt_mtp_block_spec_for_backend (line 733) | def get_gpt_mtp_block_spec_for_backend( FILE: megatron/core/models/gpt/gpt_model.py class GPTModel (line 43) | class GPTModel(LanguageModule): method __init__ (line 86) | def __init__( method set_input_tensor (line 275) | def set_input_tensor(self, input_tensor: Tensor) -> None: method _preprocess (line 291) | def _preprocess( method preprocess_for_fine_grained_offloading (line 459) | def preprocess_for_fine_grained_offloading(self): method forward (line 477) | def forward( method _postprocess (line 571) | def _postprocess( method compute_mtp_single_step (line 714) | def compute_mtp_single_step( method build_schedule_plan (line 756) | def build_schedule_plan( method sharded_state_dict (line 820) | def sharded_state_dict( FILE: megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py function _get_layer_norm (line 72) | def _get_layer_norm(config: AttentionConfig | MLPConfig, use_te: bool, n... function _get_qk_layernorm (line 81) | def _get_qk_layernorm(use_te: bool, normalization: str): function _get_heterogenous_attention_spec (line 99) | def _get_heterogenous_attention_spec( function _get_heterogenous_mlp_spec (line 129) | def _get_heterogenous_mlp_spec(mlp_config: MLPConfig, use_te: bool): function _get_sharded_state_dict_keys_map (line 152) | def _get_sharded_state_dict_keys_map(block_config: TransformerBlockConfi... function get_gpt_heterogeneous_layer_spec (line 176) | def get_gpt_heterogeneous_layer_spec( FILE: megatron/core/models/gpt/moe_module_specs.py function get_moe_module_spec (line 18) | def get_moe_module_spec( function get_moe_module_spec_for_backend (line 43) | def get_moe_module_spec_for_backend( function get_inference_optimized_moe_spec (line 80) | def get_inference_optimized_moe_spec() -> ModuleSpec: FILE: megatron/core/models/huggingface/clip_model.py class SiglipHuggingFaceModel (line 19) | class SiglipHuggingFaceModel(HuggingFaceModule): method __init__ (line 27) | def __init__(self, config): method forward (line 37) | def forward(self, *args, **kwargs): FILE: megatron/core/models/huggingface/module.py class HuggingFaceModule (line 15) | class HuggingFaceModule(MegatronModule): method __init__ (line 20) | def __init__(self, config): method set_input_tensor (line 23) | def set_input_tensor(self, input_tensor): method __setattr__ (line 27) | def __setattr__(self, name: str, value): class AutoHuggingFaceModel (line 42) | class AutoHuggingFaceModel(HuggingFaceModule): method __init__ (line 47) | def __init__(self, config): method forward (line 57) | def forward(self, *args, **kwargs): function get_hf_model_type (line 62) | def get_hf_model_type(model_path): function build_hf_model (line 82) | def build_hf_model(config, model_path): FILE: megatron/core/models/huggingface/qwen_model.py class QwenHuggingFaceModel (line 21) | class QwenHuggingFaceModel(HuggingFaceModule): method __init__ (line 29) | def __init__(self, config): method forward (line 39) | def forward(self, *args, **kwargs): method embedding (line 57) | def embedding(self, input_ids, position_ids=None): FILE: megatron/core/models/mamba/mamba_model.py class MambaModel (line 37) | class MambaModel(LanguageModule): method __init__ (line 85) | def __init__( method set_input_tensor (line 284) | def set_input_tensor(self, input_tensor: Tensor) -> None: method forward (line 300) | def forward( method compute_mtp_single_step (line 479) | def compute_mtp_single_step( FILE: megatron/core/models/mimo/config/base_configs.py class MimoModelConfig (line 11) | class MimoModelConfig: FILE: megatron/core/models/mimo/model/base.py class MimoModel (line 18) | class MimoModel(MegatronModule): method __init__ (line 37) | def __init__(self, mimo_config: MimoModelConfig, cp_group=None, tp_gro... method align_embeddings_by_token_positions (line 87) | def align_embeddings_by_token_positions( method _initialize_submodules (line 155) | def _initialize_submodules(self) -> None: method _initialize_language_model (line 171) | def _initialize_language_model(self) -> None: method set_input_tensor (line 178) | def set_input_tensor(self, input_tensor): method get_text_embeddings (line 199) | def get_text_embeddings( method forward (line 233) | def forward( FILE: megatron/core/models/mimo/partition/utils.py class PartitionConfig (line 37) | class PartitionConfig: method is_partitioning_enabled (line 54) | def is_partitioning_enabled(self) -> bool: method from_mp_config (line 59) | def from_mp_config( class PartitionAdapter (line 91) | class PartitionAdapter: method __init__ (line 94) | def __init__(self, cfg: PartitionConfig): method shard (line 101) | def shard( method _apply_context_parallel (line 185) | def _apply_context_parallel( FILE: megatron/core/models/mimo/submodules/audio.py class AudioModalitySubmodules (line 15) | class AudioModalitySubmodules(ModalitySubmodules): method __init__ (line 18) | def __init__( method encode (line 47) | def encode(self, encoders_data_batch: Dict) -> List[torch.Tensor]: method decode (line 90) | def decode(self, embeddings: torch.Tensor, data_batch: Dict) -> torch.... method combine_embeddings (line 94) | def combine_embeddings(self, embeddings: List[torch.Tensor]) -> torch.... method project_embeddings (line 108) | def project_embeddings( method forward (line 129) | def forward(self, encoder_inputs: Dict[str, Any]) -> Optional[torch.Te... FILE: megatron/core/models/mimo/submodules/base.py class ModalitySubmodules (line 17) | class ModalitySubmodules(ABC, nn.Module): method __init__ (line 39) | def __init__( method from_spec (line 62) | def from_spec(cls, module_spec: ModuleSpec) -> 'ModalitySubmodules': method combine_embeddings (line 124) | def combine_embeddings(self, embeddings: List[torch.Tensor]) -> torch.... method encode (line 137) | def encode(self, data_batch: Dict) -> List[torch.Tensor]: method decode (line 150) | def decode(self, embeddings: torch.Tensor, data_batch: Dict) -> torch.... method project_embeddings (line 165) | def project_embeddings( method forward (line 182) | def forward(self, encoder_inputs: Dict[str, Any]) -> Optional[torch.Te... FILE: megatron/core/models/mimo/submodules/vision.py class VisionModalitySubmodules (line 15) | class VisionModalitySubmodules(ModalitySubmodules): method __init__ (line 21) | def __init__( method encode (line 55) | def encode(self, encoders_data_batch: Dict) -> List[torch.Tensor]: method decode (line 99) | def decode(self, embeddings: torch.Tensor, data_batch: Dict) -> torch.... method combine_embeddings (line 112) | def combine_embeddings(self, embeddings: List[torch.Tensor]) -> torch.... method project_embeddings (line 135) | def project_embeddings( method forward (line 163) | def forward(self, encoder_inputs: Dict[str, Any]) -> Optional[torch.Te... FILE: megatron/core/models/multimodal/context_parallel.py function get_padding (line 9) | def get_padding( function get_packed_seq_params (line 62) | def get_packed_seq_params(tokens, img_seq_len, padding_needed, cp_size, ... FILE: megatron/core/models/multimodal/llava_model.py class LLaVAModel (line 51) | class LLaVAModel(MegatronModule): method __init__ (line 91) | def __init__( method shared_embedding_or_output_weight (line 389) | def shared_embedding_or_output_weight(self): method set_input_tensor (line 396) | def set_input_tensor(self, input_tensor) -> None: method freeze (line 413) | def freeze( method _preprocess_data (line 437) | def _preprocess_data( method _process_embedding_token_parallel (line 669) | def _process_embedding_token_parallel( method _apply_tile_tagging (line 762) | def _apply_tile_tagging(self, image_embeddings, num_image_tiles): method forward (line 798) | def forward( function _load_state_dict_hook_ignore_param_names (line 949) | def _load_state_dict_hook_ignore_param_names( function _load_state_dict_hook_ignore_extra_state (line 973) | def _load_state_dict_hook_ignore_extra_state( function pixel_shuffle (line 998) | def pixel_shuffle(x, scale_factor=0.5, version=2): FILE: megatron/core/models/multimodal/llava_spec.py function decoder_model_with_transformer_engine_default_spec (line 37) | def decoder_model_with_transformer_engine_default_spec( function decoder_model_with_local_default_spec (line 65) | def decoder_model_with_local_default_spec( FILE: megatron/core/models/vision/clip_vit_model.py class CLIPViTModel (line 26) | class CLIPViTModel(VisionModule): method __init__ (line 42) | def __init__( method set_input_tensor (line 156) | def set_input_tensor(self, input_tensor: torch.Tensor) -> None: method forward (line 164) | def forward( function get_num_image_embeddings (line 205) | def get_num_image_embeddings( FILE: megatron/core/models/vision/multimodal_projector.py class MultimodalProjector (line 14) | class MultimodalProjector(MegatronModule): method __init__ (line 28) | def __init__( method forward (line 63) | def forward(self, hidden_states): FILE: megatron/core/models/vision/radio.py class RADIOViTModel (line 29) | class RADIOViTModel(VisionModule): method __init__ (line 50) | def __init__( method set_input_tensor (line 168) | def set_input_tensor(self, input_tensor: torch.Tensor) -> None: method forward (line 176) | def forward( method apply_pos_enc (line 237) | def apply_pos_enc( method get_pos_enc (line 257) | def get_pos_enc( method _get_pos_embeddings (line 281) | def _get_pos_embeddings(self, batch_size: int, input_dims: Tuple[int, ... function fp8_pad_hook (line 359) | def fp8_pad_hook( FILE: megatron/core/models/vision/vit_layer_specs.py function get_vit_layer_with_transformer_engine_spec (line 36) | def get_vit_layer_with_transformer_engine_spec() -> ModuleSpec: function get_vit_layer_with_local_spec (line 61) | def get_vit_layer_with_local_spec() -> ModuleSpec: function _get_mlp_module_spec (line 88) | def _get_mlp_module_spec(use_te: bool = True) -> ModuleSpec: FILE: megatron/core/msc_utils.py class _FeatureFlag (line 18) | class _FeatureFlag: method __init__ (line 20) | def __init__(self, default: bool = False): method enable (line 23) | def enable(self) -> None: method disable (line 27) | def disable(self) -> None: method is_enabled (line 31) | def is_enabled(self) -> bool: method import_package (line 35) | def import_package(self) -> Any: method __getstate__ (line 48) | def __getstate__(self): method __setstate__ (line 52) | def __setstate__(self, state): function open_file (line 60) | def open_file(*args, **kwargs): FILE: megatron/core/nccl_allocator.py function _build_nccl_allocator (line 25) | def _build_nccl_allocator(): function get_func_args (line 101) | def get_func_args(func): function create_nccl_mem_pool (line 111) | def create_nccl_mem_pool(symmetric=None): # symmetric: bool | None = No... function init (line 143) | def init() -> None: function register_mem_pool (line 163) | def register_mem_pool(pool, group, symmetric=True): function deregister_mem_pool (line 185) | def deregister_mem_pool(pool, group): class nccl_mem (line 195) | class nccl_mem: method __init__ (line 200) | def __init__(self, pool, enabled=True, device=None, group=None, symmet... method __enter__ (line 225) | def __enter__(self): method __exit__ (line 244) | def __exit__(self, *args): class MultiGroupMemPoolAllocator (line 276) | class MultiGroupMemPoolAllocator: method __init__ (line 301) | def __init__( method __enter__ (line 315) | def __enter__(self): method __exit__ (line 335) | def __exit__(self, *args): class MemPoolAllocatorWithoutRegistration (line 367) | class MemPoolAllocatorWithoutRegistration: method __init__ (line 373) | def __init__(self, pool): method __enter__ (line 377) | def __enter__(self): method __exit__ (line 380) | def __exit__(self, *args): FILE: megatron/core/num_microbatches_calculator.py function get_num_microbatches (line 17) | def get_num_microbatches() -> int: function get_current_global_batch_size (line 22) | def get_current_global_batch_size() -> int: function get_micro_batch_size (line 27) | def get_micro_batch_size() -> int: function get_current_running_global_batch_size (line 32) | def get_current_running_global_batch_size() -> int: function update_num_microbatches (line 38) | def update_num_microbatches( function unset_num_microbatches_calculator (line 54) | def unset_num_microbatches_calculator(): function init_num_microbatches_calculator (line 64) | def init_num_microbatches_calculator( function destroy_num_microbatches_calculator (line 101) | def destroy_num_microbatches_calculator(): function reconfigure_num_microbatches_calculator (line 107) | def reconfigure_num_microbatches_calculator( function _configure_global_num_microbatches_calculator (line 144) | def _configure_global_num_microbatches_calculator( function _build_num_microbatches_calculator (line 191) | def _build_num_microbatches_calculator( function _round (line 261) | def _round(batch_size: int, divisor: int) -> int: class NumMicroBatchesCalculator (line 266) | class NumMicroBatchesCalculator(ABC): method __init__ (line 269) | def __init__(self) -> None: method get (line 275) | def get(self) -> int: method get_current_global_batch_size (line 279) | def get_current_global_batch_size(self) -> int: method get_micro_batch_size (line 283) | def get_micro_batch_size(self) -> int: method get_current_running_global_batch_size (line 287) | def get_current_running_global_batch_size(self) -> int: method update (line 293) | def update(self, consumed_samples, consistency_check, verbose=False) -... class ConstantNumMicroBatchesCalculator (line 298) | class ConstantNumMicroBatchesCalculator(NumMicroBatchesCalculator): method __init__ (line 315) | def __init__( method update (line 356) | def update(self, consumed_samples, consistency_check, verbose=False) -... class RampupBatchsizeNumMicroBatchesCalculator (line 360) | class RampupBatchsizeNumMicroBatchesCalculator(NumMicroBatchesCalculator): method __init__ (line 387) | def __init__( method update (line 441) | def update(self, consumed_samples: int, consistency_check: bool, verbo... FILE: megatron/core/optimizer/__init__.py function get_standard_config_overrides (line 77) | def get_standard_config_overrides(config: OptimizerConfig) -> Dict[Param... function get_mup_config_overrides (line 115) | def get_mup_config_overrides( function _get_param_groups (line 260) | def _get_param_groups( function _get_param_groups_and_buffers (line 384) | def _get_param_groups_and_buffers( function _get_megatron_optimizer_based_on_param_groups (line 419) | def _get_megatron_optimizer_based_on_param_groups( function check_config_overrides_consistency (line 649) | def check_config_overrides_consistency( function get_megatron_optimizer (line 676) | def get_megatron_optimizer( FILE: megatron/core/optimizer/clip_grads.py function get_grad_norm_fp32 (line 51) | def get_grad_norm_fp32( function clip_grad_by_total_norm_fp32 (line 138) | def clip_grad_by_total_norm_fp32( function count_zeros_fp32 (line 180) | def count_zeros_fp32( FILE: megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py function _param_generator (line 8) | def _param_generator(cpu_optimizer): class HybridDeviceOptimizer (line 14) | class HybridDeviceOptimizer(torch.optim.Optimizer): method __init__ (line 45) | def __init__( method _set_sub_optimizer_grads (line 83) | def _set_sub_optimizer_grads(self): method _register_param_copy_back_gpu_hook (line 117) | def _register_param_copy_back_gpu_hook(self): method step (line 150) | def step(self, closure=None): method _init_sub_optimizers (line 181) | def _init_sub_optimizers(self): method build_cpu_optimizer_list (line 227) | def build_cpu_optimizer_list(cpu_optimizer_cls, cpu_param_groups): method _get_sub_optimizer_param_groups (line 251) | def _get_sub_optimizer_param_groups(self, offload_fraction: float): method _sync_sub_optimizers_state_to_hdo (line 302) | def _sync_sub_optimizers_state_to_hdo(self): method _sync_hdo_state_to_sub_optimizers (line 323) | def _sync_hdo_state_to_sub_optimizers(self): method _sync_hdo_param_groups_to_sub_optimizers (line 334) | def _sync_hdo_param_groups_to_sub_optimizers(self): method _move_new_state_to_right_device (line 357) | def _move_new_state_to_right_device(self): method _update_fp32_params_by_new_state (line 369) | def _update_fp32_params_by_new_state(self): method update_fp32_param_by_new_param (line 376) | def update_fp32_param_by_new_param(self): method _register_load_state_dict_hooks (line 383) | def _register_load_state_dict_hooks(self): method zero_grad (line 440) | def zero_grad(self, set_to_none: bool = True): method dummy_step (line 453) | def dummy_step(self): method sub_optimizers (line 466) | def sub_optimizers(self): FILE: megatron/core/optimizer/distrib_optimizer.py class Range (line 62) | class Range: method __init__ (line 72) | def __init__(self, start: int, end: int): method normalize (line 77) | def normalize(self, start: int = 0): method __str__ (line 87) | def __str__(self): method __repr__ (line 90) | def __repr__(self): method __len__ (line 93) | def __len__(self): class DistributedOptimizer (line 97) | class DistributedOptimizer(MixedPrecisionOptimizer): method _build_model_gbuf_param_range_map (line 112) | def _build_model_gbuf_param_range_map( method _build_model_gbuf_range (line 174) | def _build_model_gbuf_range(cls, param_and_grad_buffer: _ParamAndGradB... method _build_gbuf_range_map (line 221) | def _build_gbuf_range_map(cls, param_and_grad_buffer: _ParamAndGradBuf... method _build_model_param_gbuf_map (line 241) | def _build_model_param_gbuf_map( method _build_optimizer_group_ranges (line 260) | def _build_optimizer_group_ranges(cls, param_groups: List[Dict], gbuf_... method _build_model_and_main_param_groups (line 307) | def _build_model_and_main_param_groups( method __init__ (line 459) | def __init__( method _get_model_param_range_map (line 610) | def _get_model_param_range_map(self, param: torch.nn.Parameter): method get_grad_stats_parallel_group (line 620) | def get_grad_stats_parallel_group(self) -> torch.distributed.ProcessGr... method state_dict (line 628) | def state_dict(self): method load_state_dict (line 690) | def load_state_dict(self, state_dict): method _get_main_param_and_optimizer_states (line 883) | def _get_main_param_and_optimizer_states(self, model_param): method _set_main_param_and_optimizer_states (line 916) | def _set_main_param_and_optimizer_states(self, model_param, tensors): method get_parameter_state_dp_reshardable (line 954) | def get_parameter_state_dp_reshardable(self): method get_parameter_state_dp_zero (line 989) | def get_parameter_state_dp_zero( method save_parameter_state (line 1138) | def save_parameter_state(self, filename: str): method _init_optimizer_states_with_dummy_values (line 1149) | def _init_optimizer_states_with_dummy_values(self): method _param_name (line 1166) | def _param_name(self, param: torch.nn.Parameter) -> str: method sharded_state_dict (line 1187) | def sharded_state_dict( method _param_groups_to_param2group_meta (line 1308) | def _param_groups_to_param2group_meta( method _param2group_meta_to_param_groups (line 1320) | def _param2group_meta_to_param_groups( method sharded_param_state_fsdp_dtensor (line 1359) | def sharded_param_state_fsdp_dtensor(self, is_loading: bool = False): method sharded_param_state_dp_zero (line 1383) | def sharded_param_state_dp_zero( method sharded_param_state_fully_reshardable (line 1416) | def sharded_param_state_fully_reshardable( method sharded_param_state_dp_reshardable (line 1551) | def sharded_param_state_dp_reshardable( method sharded_param_state_fs_model_space (line 1701) | def sharded_param_state_fs_model_space( method load_parameter_state_from_dp_reshardable (line 1784) | def load_parameter_state_from_dp_reshardable(self, state_dict): method load_parameter_state_from_fs_model_space (line 1819) | def load_parameter_state_from_fs_model_space(self, state_dict): method _update_legacy_world_tensors (line 1844) | def _update_legacy_world_tensors(cls, old_tensors, new_numels): method load_parameter_state_from_dp_zero_legacy (line 1863) | def load_parameter_state_from_dp_zero_legacy(self, state_dict): method load_parameter_state_from_dp_zero (line 1977) | def load_parameter_state_from_dp_zero(self, state_dict, *, update_lega... method load_parameter_state_from_fully_reshardable (line 2082) | def load_parameter_state_from_fully_reshardable(self, state_dict: dict): method split_state_dict_if_needed (line 2132) | def split_state_dict_if_needed(self, state_dict): method load_parameter_state (line 2265) | def load_parameter_state(self, filename: str, *, update_legacy_format=... method zero_grad (line 2281) | def zero_grad(self, set_to_none: bool = True): method _collect_main_grad_data_for_unscaling (line 2312) | def _collect_main_grad_data_for_unscaling(self): method _get_model_and_main_params_data_float16 (line 2330) | def _get_model_and_main_params_data_float16(self): method _get_fp8_params_and_shard_fp32_from_fp8 (line 2347) | def _get_fp8_params_and_shard_fp32_from_fp8(self): method _copy_model_grads_to_main_grads (line 2398) | def _copy_model_grads_to_main_grads(self): method _copy_main_params_to_model_params (line 2441) | def _copy_main_params_to_model_params(self): method _copy_main_params_to_param_buffer (line 2493) | def _copy_main_params_to_param_buffer(self): method _build_model_param_to_state_dict_param_map (line 2517) | def _build_model_param_to_state_dict_param_map(self, state_dict): method _copy_model_params_to_main_params (line 2553) | def _copy_model_params_to_main_params(self, state_dict=None): method step_with_ready_grads (line 2610) | def step_with_ready_grads(self) -> bool: FILE: megatron/core/optimizer/grad_scaler.py class MegatronGradScaler (line 11) | class MegatronGradScaler(ABC): method __init__ (line 12) | def __init__(self, initial_scale: float): method scale (line 18) | def scale(self): method inv_scale (line 22) | def inv_scale(self): method update (line 26) | def update(self, found_inf: bool): method state_dict (line 30) | def state_dict(self): method load_state_dict (line 34) | def load_state_dict(self, state_dict: Dict): class ConstantGradScaler (line 38) | class ConstantGradScaler(MegatronGradScaler): method update (line 43) | def update(self, found_inf: bool): method state_dict (line 46) | def state_dict(self): method load_state_dict (line 49) | def load_state_dict(self, state_dict): class DynamicGradScaler (line 53) | class DynamicGradScaler(MegatronGradScaler): method __init__ (line 61) | def __init__( method update (line 108) | def update(self, found_inf: bool): method state_dict (line 132) | def state_dict(self): method load_state_dict (line 139) | def load_state_dict(self, state_dict: Dict): FILE: megatron/core/optimizer/layer_wise_optimizer.py class LayerWiseDistributedOptimizer (line 26) | class LayerWiseDistributedOptimizer(ChainedOptimizer): method __init__ (line 42) | def __init__( method shard_params (line 105) | def shard_params(self, optimizers): method set_bucket_layerwise_params_list (line 159) | def set_bucket_layerwise_params_list(self, model_chunks): method allgather_params (line 195) | def allgather_params(self) -> None: method broadcast_params (line 243) | def broadcast_params(self): method get_grad_norm (line 260) | def get_grad_norm(self): method count_zeros (line 269) | def count_zeros(self): method step (line 280) | def step(self): # type: ignore[no-untyped-def] method load_state_dict (line 295) | def load_state_dict(self, state_dict): method sharded_state_dict (line 308) | def sharded_state_dict( method save_state_dict_to_file (line 365) | def save_state_dict_to_file(self, filename: str) -> None: method load_state_dict_from_file (line 372) | def load_state_dict_from_file(self, filename: str) -> None: FILE: megatron/core/optimizer/muon.py class TensorParallelMuon (line 51) | class TensorParallelMuon(OrthogonalizedOptimizer): method __init__ (line 54) | def __init__( method orthogonalize (line 119) | def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs:... function get_megatron_muon_optimizer (line 174) | def get_megatron_muon_optimizer( FILE: megatron/core/optimizer/optimizer.py function _zero_grad_group_helper (line 58) | def _zero_grad_group_helper( function _multi_tensor_copy_this_to_that (line 79) | def _multi_tensor_copy_this_to_that( class MegatronOptimizer (line 100) | class MegatronOptimizer(ABC): method __init__ (line 110) | def __init__( method get_parameters (line 126) | def get_parameters(self) -> List[torch.nn.Parameter]: method get_main_grads_for_grad_norm (line 137) | def get_main_grads_for_grad_norm(self) -> List[torch.Tensor]: method get_grad_stats_parallel_group (line 165) | def get_grad_stats_parallel_group(self) -> torch.distributed.ProcessGr... method prepare_grads (line 186) | def prepare_grads(self) -> bool: method step_with_ready_grads (line 191) | def step_with_ready_grads(self) -> bool: method get_grad_norm (line 196) | def get_grad_norm(self): method clip_grad_norm (line 204) | def clip_grad_norm(self, clip_grad: float) -> float: method count_zeros (line 224) | def count_zeros(self) -> float: method zero_grad (line 235) | def zero_grad(self, set_to_none: bool = True): method get_loss_scale (line 240) | def get_loss_scale(self) -> torch.Tensor: method scale_loss (line 247) | def scale_loss(self, loss: torch.Tensor) -> torch.Tensor: method reload_model_params (line 252) | def reload_model_params(self, state_dict=None): method state_dict (line 269) | def state_dict(self): method load_state_dict (line 274) | def load_state_dict(self, state_dict): method _get_state (line 280) | def _get_state(self): method _set_state (line 283) | def _set_state(self, value): method _get_param_groups (line 291) | def _get_param_groups(self): method _set_param_groups (line 297) | def _set_param_groups(self, value): method step (line 303) | def step(self): method sharded_state_dict (line 308) | def sharded_state_dict( method _extract_common_per_param_step (line 326) | def _extract_common_per_param_step(state_dict) -> Union[int, torch.Ten... method _restore_common_per_param_step (line 341) | def _restore_common_per_param_step(state_dict: Dict, step: Union[int, ... method offload_to_cpu (line 345) | def offload_to_cpu(self): method restore_from_cpu (line 365) | def restore_from_cpu(self): method _filter_and_reorder_param_groups (line 384) | def _filter_and_reorder_param_groups( class MixedPrecisionOptimizer (line 438) | class MixedPrecisionOptimizer(MegatronOptimizer): method __init__ (line 452) | def __init__( method get_loss_scale (line 488) | def get_loss_scale(self): method reload_model_params (line 493) | def reload_model_params(self, state_dict=None): method _unscale_main_grads_and_check_for_nan (line 497) | def _unscale_main_grads_and_check_for_nan(self): method prepare_grads (line 525) | def prepare_grads(self) -> bool: method step_with_ready_grads (line 561) | def step_with_ready_grads(self) -> bool: method step (line 594) | def step(self): class Float16OptimizerWithFloat16Params (line 627) | class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer): method __init__ (line 641) | def __init__( method zero_grad (line 711) | def zero_grad(self, set_to_none=True): method _collect_main_grad_data_for_unscaling (line 726) | def _collect_main_grad_data_for_unscaling(self): method _get_model_and_main_params_data_float16 (line 746) | def _get_model_and_main_params_data_float16(self): method _copy_model_grads_to_main_grads (line 755) | def _copy_model_grads_to_main_grads(self): method _copy_main_params_to_model_params (line 775) | def _copy_main_params_to_model_params(self): method _copy_model_params_to_main_params (line 782) | def _copy_model_params_to_main_params(self, state_dict=None): method state_dict (line 790) | def state_dict(self, is_loading: bool = False): method sharded_state_dict (line 801) | def sharded_state_dict( method load_state_dict (line 850) | def load_state_dict(self, state_dict): class FP32Optimizer (line 891) | class FP32Optimizer(MegatronOptimizer): method __init__ (line 900) | def __init__( method zero_grad (line 911) | def zero_grad(self, set_to_none=True): method get_loss_scale (line 918) | def get_loss_scale(self): method prepare_grads (line 923) | def prepare_grads(self) -> bool: method step_with_ready_grads (line 944) | def step_with_ready_grads(self) -> bool: method step (line 962) | def step(self): method reload_model_params (line 996) | def reload_model_params(self, state_dict=None): method state_dict (line 999) | def state_dict(self): method load_state_dict (line 1002) | def load_state_dict(self, state_dict): method sharded_state_dict (line 1013) | def sharded_state_dict( class ProxyDict (line 1039) | class ProxyDict: method __init__ (line 1051) | def __init__(self, inner_dicts: List[dict]): method __getitem__ (line 1054) | def __getitem__(self, key: Tuple[int, str]): method __setitem__ (line 1058) | def __setitem__(self, key: Tuple[int, str], value: Any): method __len__ (line 1062) | def __len__(self) -> int: method __iter__ (line 1065) | def __iter__(self): method items (line 1070) | def items(self): class ChainedOptimizer (line 1077) | class ChainedOptimizer(MegatronOptimizer): method __init__ (line 1087) | def __init__(self, chained_optimizers: List[MegatronOptimizer]): method optimizer (line 1109) | def optimizer(self): method param_groups (line 1119) | def param_groups(self) -> List[dict]: method get_parameters (line 1127) | def get_parameters(self) -> List[torch.nn.Parameter]: method state (line 1135) | def state(self) -> ProxyDict: method zero_grad (line 1142) | def zero_grad(self, set_to_none=True): method get_loss_scale (line 1146) | def get_loss_scale(self): method _split_state_dict (line 1152) | def _split_state_dict(self, state_dict): method reload_model_params (line 1182) | def reload_model_params(self, state_dict=None): method state_dict (line 1187) | def state_dict(self): method sharded_state_dict (line 1193) | def sharded_state_dict( method load_state_dict (line 1225) | def load_state_dict(self, state_dict): method prepare_grads (line 1242) | def prepare_grads(self) -> bool: method step_with_ready_grads (line 1251) | def step_with_ready_grads(self) -> bool: method grads_states_parallel_group_is_shared (line 1263) | def grads_states_parallel_group_is_shared(self): method get_grad_stats_parallel_group (line 1271) | def get_grad_stats_parallel_group(self) -> torch.distributed.ProcessGr... method get_grad_norm (line 1279) | def get_grad_norm(self): method count_zeros (line 1298) | def count_zeros(self): method step (line 1317) | def step(self): method save_parameter_state (line 1349) | def save_parameter_state(self, filename: str): method load_parameter_state (line 1376) | def load_parameter_state(self, filename: str, *, update_legacy_format:... method _synchronize_steps (line 1401) | def _synchronize_steps(self): method offload_to_cpu (line 1423) | def offload_to_cpu(self): method restore_from_cpu (line 1428) | def restore_from_cpu(self): FILE: megatron/core/optimizer/optimizer_config.py class ParamPredicate (line 13) | class ParamPredicate: method __call__ (line 32) | def __call__(self, param: torch.nn.Parameter) -> bool: class ParamWithNamePredicate (line 37) | class ParamWithNamePredicate: method __call__ (line 60) | def __call__(self, param: torch.nn.Parameter, name: str) -> bool: class ParamKey (line 65) | class ParamKey: method matches (line 89) | def matches(self, param: torch.nn.Parameter, param_name: str) -> bool: class OptimizerConfig (line 139) | class OptimizerConfig: method __post_init__ (line 358) | def __post_init__(self): class AdamOptimizerConfig (line 442) | class AdamOptimizerConfig(OptimizerConfig): class SGDOptimizerConfig (line 463) | class SGDOptimizerConfig(OptimizerConfig): FILE: megatron/core/optimizer/qk_clip.py function clip_qk (line 8) | def clip_qk(model, log_max_only=False) -> float: FILE: megatron/core/optimizer_param_scheduler.py class ParamGroupOverride (line 17) | class ParamGroupOverride(TypedDict): function get_canonical_lr_for_logging (line 37) | def get_canonical_lr_for_logging(param_groups: list[dict]) -> float | None: function param_group_override_to_tuple (line 57) | def param_group_override_to_tuple( function combine_param_group_overrides (line 70) | def combine_param_group_overrides( class OptimizerParamScheduler (line 97) | class OptimizerParamScheduler: method __init__ (line 122) | def __init__( method get_wd (line 181) | def get_wd(self, param_group: Optional[dict] = None) -> float: method get_lr (line 215) | def get_lr(self, param_group: dict) -> float: method step (line 281) | def step(self, increment: int) -> None: method state_dict (line 296) | def state_dict(self) -> dict: method _check_and_set (line 312) | def _check_and_set(self, cls_value: float, sd_value: float, name: str)... method load_state_dict (line 335) | def load_state_dict(self, state_dict: dict) -> None: FILE: megatron/core/packed_seq_params.py class PackedSeqParams (line 10) | class PackedSeqParams: method __post_init__ (line 28) | def __post_init__(self): FILE: megatron/core/parallel_state.py function get_nccl_options (line 150) | def get_nccl_options(pg_name, nccl_comm_cfgs): function update_pg_timeout (line 185) | def update_pg_timeout( function create_group (line 214) | def create_group( function generate_masked_orthogonal_rank_groups (line 251) | def generate_masked_orthogonal_rank_groups( function create_hierarchical_groups (line 360) | def create_hierarchical_groups( function create_hybrid_dp_cp_groups (line 422) | def create_hybrid_dp_cp_groups(rank, ranks, pg_options): class RankGenerator (line 447) | class RankGenerator(object): method __init__ (line 450) | def __init__( method get_mask (line 491) | def get_mask(self, order: str, token: str): method get_ranks (line 506) | def get_ranks(self, token): function default_embedding_ranks (line 525) | def default_embedding_ranks(pp_ranks): function default_position_embedding_ranks (line 534) | def default_position_embedding_ranks(pp_ranks): function overwrite_nccl_comm_cfgs (line 540) | def overwrite_nccl_comm_cfgs(nccl_comm_cfgs, pg_name, key_value_pair): function initialize_model_parallel (line 548) | def initialize_model_parallel( function is_initialized (line 1361) | def is_initialized(): function model_parallel_is_initialized (line 1366) | def model_parallel_is_initialized(): function get_model_parallel_group (line 1377) | def get_model_parallel_group(check_initialized=True): function get_tensor_model_parallel_group (line 1384) | def get_tensor_model_parallel_group(check_initialized=True): function get_pipeline_model_parallel_group (line 1393) | def get_pipeline_model_parallel_group(check_initialized=True): function get_data_parallel_group (line 1402) | def get_data_parallel_group( function has_separate_all_gather_group (line 1427) | def has_separate_all_gather_group() -> bool: function get_data_parallel_group_gloo (line 1436) | def get_data_parallel_group_gloo(with_context_parallel=False, partial_da... function get_context_parallel_group (line 1454) | def get_context_parallel_group(check_initialized=True): function get_context_parallel_global_ranks (line 1461) | def get_context_parallel_global_ranks(check_initialized=True): function get_hierarchical_context_parallel_groups (line 1470) | def get_hierarchical_context_parallel_groups(check_initialized=True): function get_hybrid_data_context_parallel_groups (line 1477) | def get_hybrid_data_context_parallel_groups(check_initialized=True, grou... function get_embedding_group (line 1489) | def get_embedding_group(check_initialized=True): function get_position_embedding_group (line 1496) | def get_position_embedding_group(check_initialized=True): function get_amax_reduction_group (line 1503) | def get_amax_reduction_group(with_context_parallel=False, tp_only_amax_r... function get_tensor_and_data_parallel_group (line 1529) | def get_tensor_and_data_parallel_group(check_initialized=True, with_cont... function get_tensor_and_context_parallel_group (line 1545) | def get_tensor_and_context_parallel_group(check_initialized=True): function set_tensor_model_parallel_world_size (line 1554) | def set_tensor_model_parallel_world_size(world_size): function set_pipeline_model_parallel_world_size (line 1560) | def set_pipeline_model_parallel_world_size(world_size): function set_virtual_pipeline_model_parallel_world_size (line 1566) | def set_virtual_pipeline_model_parallel_world_size(world_size): function get_tensor_model_parallel_world_size (line 1572) | def get_tensor_model_parallel_world_size(): function get_pipeline_model_parallel_world_size (line 1580) | def get_pipeline_model_parallel_world_size(): function set_tensor_model_parallel_rank (line 1588) | def set_tensor_model_parallel_rank(rank): function set_pipeline_model_parallel_rank (line 1594) | def set_pipeline_model_parallel_rank(rank): function get_tensor_model_parallel_rank (line 1600) | def get_tensor_model_parallel_rank(): function get_pipeline_model_parallel_rank (line 1608) | def get_pipeline_model_parallel_rank(): function is_pipeline_first_stage (line 1616) | def is_pipeline_first_stage(ignore_virtual=True, vp_stage=None): function is_pipeline_last_stage (line 1626) | def is_pipeline_last_stage(ignore_virtual=True, vp_stage=None): function is_rank_in_embedding_group (line 1636) | def is_rank_in_embedding_group(ignore_virtual=True, vp_stage=None): function is_rank_in_position_embedding_group (line 1654) | def is_rank_in_position_embedding_group(): function get_virtual_pipeline_model_parallel_rank (line 1661) | def get_virtual_pipeline_model_parallel_rank(): function set_virtual_pipeline_model_parallel_rank (line 1667) | def set_virtual_pipeline_model_parallel_rank(rank): function get_virtual_pipeline_model_parallel_world_size (line 1678) | def get_virtual_pipeline_model_parallel_world_size(): function get_tensor_model_parallel_src_rank (line 1684) | def get_tensor_model_parallel_src_rank(): function get_model_parallel_src_rank (line 1693) | def get_model_parallel_src_rank(): function get_data_parallel_src_rank (line 1700) | def get_data_parallel_src_rank(with_context_parallel=False): function get_pipeline_model_parallel_first_rank (line 1713) | def get_pipeline_model_parallel_first_rank(): function get_pipeline_model_parallel_last_rank (line 1719) | def get_pipeline_model_parallel_last_rank(): function get_pipeline_model_parallel_next_rank (line 1726) | def get_pipeline_model_parallel_next_rank(): function get_pipeline_model_parallel_prev_rank (line 1734) | def get_pipeline_model_parallel_prev_rank(): function get_data_parallel_world_size (line 1742) | def get_data_parallel_world_size(with_context_parallel=False, partial_da... function set_data_parallel_rank (line 1755) | def set_data_parallel_rank(rank): function get_data_parallel_rank (line 1761) | def get_data_parallel_rank(with_context_parallel=False, partial_data_par... function get_context_parallel_world_size (line 1774) | def get_context_parallel_world_size(): function get_context_parallel_rank (line 1782) | def get_context_parallel_rank(): function get_tensor_and_context_parallel_world_size (line 1790) | def get_tensor_and_context_parallel_world_size(): function get_tensor_and_context_parallel_rank (line 1798) | def get_tensor_and_context_parallel_rank(): function get_expert_model_parallel_group (line 1807) | def get_expert_model_parallel_group(check_initialized=True): function get_expert_model_parallel_src_rank (line 1816) | def get_expert_model_parallel_src_rank(): function get_expert_model_parallel_world_size (line 1825) | def get_expert_model_parallel_world_size(): function set_expert_model_parallel_world_size (line 1835) | def set_expert_model_parallel_world_size(world_size): function get_expert_model_parallel_rank (line 1841) | def get_expert_model_parallel_rank(): function set_expert_model_parallel_rank (line 1851) | def set_expert_model_parallel_rank(rank): function get_expert_tensor_parallel_group (line 1857) | def get_expert_tensor_parallel_group(check_initialized=True): function get_expert_tensor_parallel_world_size (line 1866) | def get_expert_tensor_parallel_world_size(): function set_expert_tensor_parallel_world_size (line 1878) | def set_expert_tensor_parallel_world_size(world_size): function get_expert_tensor_parallel_rank (line 1884) | def get_expert_tensor_parallel_rank(): function set_expert_tensor_parallel_rank (line 1896) | def set_expert_tensor_parallel_rank(rank): function get_expert_tensor_and_model_parallel_group (line 1902) | def get_expert_tensor_and_model_parallel_group(check_initialized=True): function get_expert_tensor_and_model_parallel_world_size (line 1911) | def get_expert_tensor_and_model_parallel_world_size(): function get_expert_tensor_and_model_parallel_rank (line 1920) | def get_expert_tensor_and_model_parallel_rank(): function get_expert_tensor_model_pipeline_parallel_group (line 1928) | def get_expert_tensor_model_pipeline_parallel_group(check_initialized=Tr... function get_expert_data_parallel_group (line 1937) | def get_expert_data_parallel_group(check_initialized=True, partial_exper... function get_expert_data_parallel_group_gloo (line 1953) | def get_expert_data_parallel_group_gloo(partial_expert_data_parallel=Fal... function get_expert_data_parallel_rank (line 1967) | def get_expert_data_parallel_rank(partial_expert_data_parallel=False): function get_expert_data_parallel_world_size (line 1977) | def get_expert_data_parallel_world_size(partial_expert_data_parallel=Fal... function get_intra_distributed_optimizer_instance_group (line 1987) | def get_intra_distributed_optimizer_instance_group(check_initialized=True): function get_inter_distributed_optimizer_instance_group (line 1996) | def get_inter_distributed_optimizer_instance_group(check_initialized=True): function _set_global_memory_buffer (line 2012) | def _set_global_memory_buffer(): function get_global_memory_buffer (line 2019) | def get_global_memory_buffer(): function destroy_global_memory_buffer (line 2025) | def destroy_global_memory_buffer(): function get_all_ranks (line 2031) | def get_all_ranks(): function destroy_model_parallel (line 2044) | def destroy_model_parallel(): FILE: megatron/core/pipeline_parallel/bridge_communicator.py class CommRole (line 14) | class CommRole(Enum): class RankCommInfo (line 31) | class RankCommInfo: class BridgeCommunicator (line 39) | class BridgeCommunicator: method destroy_broadcast_pgs (line 53) | def destroy_broadcast_pgs(cls): method __init__ (line 60) | def __init__( method _get_or_create_broadcast_pg (line 161) | def _get_or_create_broadcast_pg(cls, ranks_list: List[List[int]]): method get_leader_rank (line 169) | def get_leader_rank(self, grid: HyperCommGrid, is_src: bool) -> List[i... method get_boundary_pp_stage_ranks (line 205) | def get_boundary_pp_stage_ranks(self, grid: HyperCommGrid, is_src: bool): method is_current_rank_in_grid (line 243) | def is_current_rank_in_grid(self, grid: HyperCommGrid) -> bool: method build_comm_map (line 247) | def build_comm_map(self, src_tp_leaders: List[int], dest_tp_leaders: L... method send_forward (line 312) | def send_forward(self, tensor_to_send: torch.Tensor): method recv_forward (line 340) | def recv_forward(self) -> torch.Tensor: method send_backward (line 436) | def send_backward(self, grad_tensor: torch.Tensor): method recv_backward (line 471) | def recv_backward(self) -> torch.Tensor: method send_forward_recv_backward (line 562) | def send_forward_recv_backward( method send_backward_recv_forward (line 683) | def send_backward_recv_forward( method _communicate_shapes (line 811) | def _communicate_shapes( method _split_tensor_at_batch_dim (line 923) | def _split_tensor_at_batch_dim( FILE: megatron/core/pipeline_parallel/combined_1f1b.py function combined_1f1b_schedule_for_no_pipelining (line 18) | def combined_1f1b_schedule_for_no_pipelining( function combined_1f1b_schedule_for_interleaved_pipelining (line 111) | def combined_1f1b_schedule_for_interleaved_pipelining( function combined_forward_backward_step (line 237) | def combined_forward_backward_step( FILE: megatron/core/pipeline_parallel/fine_grained_activation_offload.py function debug_rank (line 16) | def debug_rank(message): function print_offload_summary_table (line 26) | def print_offload_summary_table(total_offload_bytes: Dict[str, int]): class GPUTensorPool (line 97) | class GPUTensorPool: method __init__ (line 114) | def __init__(self, device: str = 'cuda', pin_memory: bool = False): method _get_pool_key (line 141) | def _get_pool_key(self, shape: Tuple, dtype: torch.dtype) -> Tuple: method _calculate_memory_size (line 146) | def _calculate_memory_size(shape: Tuple, dtype: torch.dtype) -> int: method allocate (line 154) | def allocate(self, shape: Tuple, dtype: torch.dtype = torch.float32) -... method free (line 208) | def free(self, tensor: torch.Tensor): method get_pool_status (line 251) | def get_pool_status(self, shape: Tuple = None, dtype: torch.dtype = No... method reset (line 294) | def reset(self): method clear (line 308) | def clear(self): method __del__ (line 326) | def __del__(self): class OffloadTensorGroup (line 331) | class OffloadTensorGroup: method __init__ (line 336) | def __init__(self, name): method push_tensor (line 352) | def push_tensor(self, tag, tensor): method pop_tensor (line 356) | def pop_tensor(self, tag): method record_offload_event (line 360) | def record_offload_event(self, stream): method wait_offload_event (line 364) | def wait_offload_event(self, stream): method record_reload_event (line 368) | def record_reload_event(self, stream): method wait_reload_event (line 372) | def wait_reload_event(self, stream): method update_offload_info (line 376) | def update_offload_info(self, tensor): class PipelineOffloadManager (line 382) | class PipelineOffloadManager: method get_instance (line 392) | def get_instance(cls): method reset_instance (line 399) | def reset_instance(cls): method __init__ (line 404) | def __init__(self): method d2h_stream (line 437) | def d2h_stream(self): method h2d_stream (line 442) | def h2d_stream(self): method cpu_tensor_pool (line 447) | def cpu_tensor_pool(self): method push_offload_groups (line 451) | def push_offload_groups(self, group_hook, forced_released_tensors): method flush_delayed_groups (line 456) | def flush_delayed_groups(self): method reset (line 464) | def reset(self): method offload_summary_bytes (line 484) | def offload_summary_bytes(self) -> Dict[str, int]: method offload_summary_total_bytes (line 489) | def offload_summary_total_bytes(self) -> int: method flush (line 493) | def flush(self): method disable_offload (line 508) | def disable_offload(self): method enable_offload (line 515) | def enable_offload(self): method post_warmup_callback (line 522) | def post_warmup_callback(self): method push (line 575) | def push(self, handler): method pop_backward_chunk (line 582) | def pop_backward_chunk(self, name=None): method front_backward_chunk (line 599) | def front_backward_chunk(self, name=None): method init_model_chunk_offload_handler (line 609) | def init_model_chunk_offload_handler( method pop_forward_chunk (line 649) | def pop_forward_chunk(self, name=None): method cur_forward_chunk (line 665) | def cur_forward_chunk(self): method cur_backward_chunk (line 669) | def cur_backward_chunk(self): method mark_not_offloadable (line 673) | def mark_not_offloadable(self, tensor: torch.Tensor): method __enter__ (line 678) | def __enter__(self): method __exit__ (line 695) | def __exit__(self, *args: Any): method on_save_for_backward (line 709) | def on_save_for_backward(self, tensor: torch.Tensor) -> Any: method on_get_saved_tensor (line 718) | def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor: class ChunkOffloadHandler (line 727) | class ChunkOffloadHandler: method offload (line 733) | def offload(self, src_tensor, pin_memory=True, use_cpu_pool=True): method reload (line 751) | def reload(self, state, non_blocking=None): method __init__ (line 765) | def __init__(self, min_offloaded_tensor_size, cpu_tensor_pool): method reset (line 789) | def reset(self): method find_group_with_name (line 797) | def find_group_with_name(self, name: str, start_index: int = 0): method is_empty_chunk (line 803) | def is_empty_chunk(self, name=None): method finish_all_groups (line 810) | def finish_all_groups(self, name=None) -> bool: method find_next_group (line 826) | def find_next_group(self, name=None): method tensor_push (line 831) | def tensor_push(self, tensor): method tensor_pop (line 849) | def tensor_pop(self, tensor_tag): method tensor_need_offloading_checker (line 860) | def tensor_need_offloading_checker(self, tensor): method bulk_offload_group (line 872) | def bulk_offload_group(self): method get_max_deduplicated_groups (line 891) | def get_max_deduplicated_groups(self): method bulk_reload_group (line 899) | def bulk_reload_group(self): method pre_reload_last_layer (line 920) | def pre_reload_last_layer(self): method should_bulk_offload (line 928) | def should_bulk_offload(self): method bulk_offload (line 952) | def bulk_offload(self, forced_released_tensors): method on_group_commit_forward (line 967) | def on_group_commit_forward(self, forced_released_tensors): method bulk_reload (line 976) | def bulk_reload(self): method on_group_commit_backward (line 993) | def on_group_commit_backward(self, name): method on_group_start_forward (line 1015) | def on_group_start_forward(self, name): method on_group_start_backward (line 1037) | def on_group_start_backward(self): function fine_grained_offloading_disable_offload (line 1050) | def fine_grained_offloading_disable_offload(): function fine_grained_offloading_enable_offload (line 1056) | def fine_grained_offloading_enable_offload(): class FineGrainedOffloadingGroupCommitFunction (line 1062) | class FineGrainedOffloadingGroupCommitFunction(torch.autograd.Function): method forward (line 1069) | def forward(ctx, tensor, cur_forward_chunk, name, forced_released_tens... method backward (line 1084) | def backward(ctx, *grad_output): function fine_grained_offloading_group_commit (line 1093) | def fine_grained_offloading_group_commit( function fine_grained_offloading_group_flush_delayed_groups (line 1136) | def fine_grained_offloading_group_flush_delayed_groups(): class FineGrainedOffloadingGroupStartFunction (line 1142) | class FineGrainedOffloadingGroupStartFunction(torch.autograd.Function): method forward (line 1149) | def forward(ctx, tensor, cpu_offload_handler, name): method backward (line 1159) | def backward(ctx, grad_output): function fine_grained_offloading_group_start (line 1167) | def fine_grained_offloading_group_start(tensor, name=None): function fine_grained_offloading_forward_record (line 1175) | def fine_grained_offloading_forward_record(event: torch.cuda.Event) -> N... class FineGrainedOffloadingBackwardRecordFunction (line 1182) | class FineGrainedOffloadingBackwardRecordFunction(torch.autograd.Function): method forward (line 1189) | def forward(ctx, tensor, event: torch.cuda.Event) -> torch.Tensor: method backward (line 1195) | def backward(ctx, grad_output): function fine_grained_offloading_backward_record (line 1203) | def fine_grained_offloading_backward_record(tensor, event: torch.cuda.Ev... class FineGrainedActivationOffloadingInterface (line 1208) | class FineGrainedActivationOffloadingInterface: method __init__ (line 1211) | def __init__(self, offload: bool, tensor: torch.Tensor, name: str): method __enter__ (line 1216) | def __enter__(self): method __exit__ (line 1223) | def __exit__(self, *args: Any): method init_chunk_handler (line 1229) | def init_chunk_handler(vp_size, vp_stage, min_offloaded_tensor_size): method get_context (line 1236) | def get_context(flag): method group_commit (line 1241) | def group_commit(tensor, name, forced_released_tensors=None, delay_off... method mark_not_offloadable (line 1248) | def mark_not_offloadable(tensor: torch.Tensor): method forward_record (line 1253) | def forward_record(event: torch.cuda.Event) -> None: method reset (line 1260) | def reset(): method reset_instance (line 1265) | def reset_instance(): FILE: megatron/core/pipeline_parallel/hybrid_cp_schedule.py class BalancedCPScheduler (line 14) | class BalancedCPScheduler: method __init__ (line 20) | def __init__(self, max_seq_len_per_rank: int, dp_cp_group: torch.distr... method get_total_workload (line 28) | def get_total_workload(self, seq_length: int, cp_size: Optional[int] =... method gpus_needed (line 44) | def gpus_needed(self, seq_len: int) -> int: method make_buckets_equal (line 55) | def make_buckets_equal( method next_hdp_group (line 104) | def next_hdp_group( method get_groups_and_subsamples (line 456) | def get_groups_and_subsamples(self, sample_id_seqlens, config): function hybrid_context_parallel_forward_backward (line 477) | def hybrid_context_parallel_forward_backward( FILE: megatron/core/pipeline_parallel/multimodule_communicator.py class RankModuleInfo (line 20) | class RankModuleInfo: function _prepare_tensor_for_comm (line 48) | def _prepare_tensor_for_comm( function _restore_tensor_from_comm (line 81) | def _restore_tensor_from_comm( class MultiModulePipelineCommunicator (line 104) | class MultiModulePipelineCommunicator: method __init__ (line 107) | def __init__( method _build_bridge_comms (line 153) | def _build_bridge_comms(self): method is_pp_first_stage (line 171) | def is_pp_first_stage(self): method is_pp_last_stage (line 187) | def is_pp_last_stage(self): method _is_source_module (line 202) | def _is_source_module(self, module_name: str) -> bool: method _is_sink_module (line 210) | def _is_sink_module(self, module_name: str) -> bool: method is_current_rank_in_grid (line 214) | def is_current_rank_in_grid(self, grid: HyperCommGrid) -> bool: method total_stages (line 219) | def total_stages(self) -> int: method current_stage (line 231) | def current_stage(self) -> int: method _build_rank_module_info_map (line 263) | def _build_rank_module_info_map(self): method recv_forward (line 307) | def recv_forward( method send_forward (line 341) | def send_forward(self, output_dict: Dict[str, torch.Tensor], is_last_s... method send_forward_recv_backward (line 359) | def send_forward_recv_backward( method send_backward_recv_forward (line 393) | def send_backward_recv_forward( method recv_backward (line 429) | def recv_backward( method send_backward (line 460) | def send_backward(self, grad_dict: Dict[str, torch.Tensor], is_first_s... method compute_total_pipeline_stages (line 479) | def compute_total_pipeline_stages( FILE: megatron/core/pipeline_parallel/p2p_communication.py function _batched_p2p_ops (line 17) | def _batched_p2p_ops( function _p2p_ops (line 55) | def _p2p_ops( function is_single_shape (line 131) | def is_single_shape(x) -> bool: class P2PCommunicator (line 140) | class P2PCommunicator: method __init__ (line 147) | def __init__(self, pp_group: dist.ProcessGroup, config: ModelParallelC... method is_pp_first_stage (line 167) | def is_pp_first_stage(self) -> bool: method is_pp_last_stage (line 172) | def is_pp_last_stage(self) -> bool: method total_stages (line 177) | def total_stages(self) -> int: method current_stage (line 182) | def current_stage(self) -> int: method _communicate_shapes (line 186) | def _communicate_shapes(self, tensor_send_next, tensor_send_prev, recv... method _communicate (line 275) | def _communicate( method recv_forward (line 424) | def recv_forward( method recv_backward (line 455) | def recv_backward( method send_forward (line 486) | def send_forward(self, output_tensors, is_last_stage: bool) -> None: method send_backward (line 507) | def send_backward(self, input_tensor_grads, is_first_stage: bool) -> N... method send_forward_recv_backward (line 527) | def send_forward_recv_backward( method send_backward_recv_forward (line 560) | def send_backward_recv_forward( method send_forward_recv_forward (line 593) | def send_forward_recv_forward( method send_backward_recv_backward (line 619) | def send_backward_recv_backward( method send_forward_backward_recv_forward_backward (line 645) | def send_forward_backward_recv_forward_backward( FILE: megatron/core/pipeline_parallel/schedules.py function get_forward_backward_func (line 48) | def get_forward_backward_func(pp_size: Optional[int] = None, vp_size: Op... function deallocate_output_tensor (line 157) | def deallocate_output_tensor(out, deallocate_pipeline_outputs=False): function custom_backward (line 190) | def custom_backward(output, grad_output): function forward_step_calc_loss (line 222) | def forward_step_calc_loss( function forward_step (line 316) | def forward_step( function backward_step (line 451) | def backward_step(input_tensor, output_tensor, output_tensor_grad, config): function backward_step_multimodule (line 514) | def backward_step_multimodule( function check_first_val_step (line 575) | def check_first_val_step(first_val_step, forward_only, cond): function forward_backward_no_pipelining (line 583) | def forward_backward_no_pipelining( function clear_embedding_activation_buffer (line 751) | def clear_embedding_activation_buffer(config, model, is_last_stage): function finish_embedding_wgrad_compute (line 770) | def finish_embedding_wgrad_compute(config, embedding_module, is_last_sta... function get_pp_rank_microbatches (line 786) | def get_pp_rank_microbatches( function get_schedule_table (line 846) | def get_schedule_table(num_microbatches, num_model_chunks, microbatch_gr... function forward_backward_pipelining_with_interleaving (line 876) | def forward_backward_pipelining_with_interleaving( function get_tensor_shapes (line 1975) | def get_tensor_shapes( function forward_backward_pipelining_without_interleaving (line 2007) | def forward_backward_pipelining_without_interleaving( FILE: megatron/core/pipeline_parallel/utils.py function is_pp_first_stage (line 16) | def is_pp_first_stage(pp_group: torch.distributed.ProcessGroup): function is_pp_last_stage (line 21) | def is_pp_last_stage(pp_group: torch.distributed.ProcessGroup): function is_vp_first_stage (line 26) | def is_vp_first_stage(vp_stage: int, vp_size: int | None): function is_vp_last_stage (line 37) | def is_vp_last_stage(vp_stage: int, vp_size: int | None): function get_pp_first_rank (line 48) | def get_pp_first_rank(pp_group: torch.distributed.ProcessGroup): function get_pp_last_rank (line 54) | def get_pp_last_rank(pp_group: torch.distributed.ProcessGroup): function get_pp_next_rank (line 60) | def get_pp_next_rank(pp_group: torch.distributed.ProcessGroup): function get_pp_prev_rank (line 70) | def get_pp_prev_rank(pp_group: torch.distributed.ProcessGroup): function make_viewless (line 80) | def make_viewless(e): function set_ideal_affinity_for_current_gpu (line 86) | def set_ideal_affinity_for_current_gpu(): class NoopScheduleNode (line 119) | class NoopScheduleNode: method forward (line 128) | def forward(self, inputs): method backward (line 132) | def backward(self, outgrads): class ScheduleNode (line 137) | class ScheduleNode: method __init__ (line 144) | def __init__( method default_backward_func (line 181) | def default_backward_func(self, outputs, output_grad): method forward (line 194) | def forward(self, inputs=()): method _forward (line 200) | def _forward(self, *inputs): method get_output (line 227) | def get_output(self): method backward (line 231) | def backward(self, output_grad): method _backward (line 237) | def _backward(self, *output_grad): method get_grad (line 265) | def get_grad(self): method stream_acquire_context (line 274) | def stream_acquire_context(self, name=None): method _release_state (line 297) | def _release_state(self): class AbstractSchedulePlan (line 305) | class AbstractSchedulePlan(ABC): method run (line 311) | def run( function set_streams (line 330) | def set_streams(comp_stream=None, comm_stream=None): function get_comp_stream (line 348) | def get_comp_stream(): function get_comm_stream (line 354) | def get_comm_stream(): FILE: megatron/core/post_training/modelopt/gpt/model_specs.py function get_gpt_modelopt_spec (line 35) | def get_gpt_modelopt_spec( FILE: megatron/core/post_training/modelopt/gpt/state_dict_hooks.py function mcore_gpt_load_te_state_dict_pre_hook (line 11) | def mcore_gpt_load_te_state_dict_pre_hook( FILE: megatron/core/post_training/modelopt/layers.py class Norm (line 50) | class Norm: method __new__ (line 58) | def __new__( class Linear (line 105) | class Linear(torch.nn.Linear): method __init__ (line 108) | def __init__( method sharded_state_dict (line 158) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... method forward (line 175) | def forward(self, x): class RealQuantTransformerLayer (line 184) | class RealQuantTransformerLayer(TransformerLayer): method __init__ (line 200) | def __init__(self, *args, **kwargs): method _collect_original_tensor_info (line 235) | def _collect_original_tensor_info(self): method _report_quantize_tensor_info (line 241) | def _report_quantize_tensor_info(self): class FP8WeightTransformerLayer (line 256) | class FP8WeightTransformerLayer(RealQuantTransformerLayer): class BlockwiseFP8WeightTransformerLayer (line 262) | class BlockwiseFP8WeightTransformerLayer(RealQuantTransformerLayer): FILE: megatron/core/post_training/modelopt/mamba/model_specs.py function get_mamba_stack_modelopt_spec (line 20) | def get_mamba_stack_modelopt_spec( function _get_mamba_stack_local_spec (line 55) | def _get_mamba_stack_local_spec( FILE: megatron/core/process_groups_config.py class ProcessGroupHelperMeta (line 14) | class ProcessGroupHelperMeta(type): method __setattr__ (line 17) | def __setattr__(cls, name, value): class ProcessGroupCollection (line 27) | class ProcessGroupCollection: method __init__ (line 136) | def __init__(self, **kwargs): method __repr__ (line 143) | def __repr__(self): method use_mpu_process_groups (line 161) | def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = No... method setup_process_groups_for_optimizer (line 253) | def setup_process_groups_for_optimizer( method setup_process_groups_for_ddp (line 444) | def setup_process_groups_for_ddp( class MultiModuleProcessGroupCollection (line 575) | class MultiModuleProcessGroupCollection: method __post_init__ (line 615) | def __post_init__(self): method get_language_model_collection (line 625) | def get_language_model_collection(self) -> ProcessGroupCollection: method get_language_model_cp_size (line 638) | def get_language_model_cp_size(self) -> int: method has_language_model (line 649) | def has_language_model(self) -> bool: method get_module_collection (line 657) | def get_module_collection(self, module_name: str) -> ProcessGroupColle... method __len__ (line 676) | def __len__(self): method __getitem__ (line 680) | def __getitem__(self, module_name: str): method __iter__ (line 684) | def __iter__(self): method keys (line 688) | def keys(self): method values (line 692) | def values(self): method items (line 696) | def items(self): method __repr__ (line 700) | def __repr__(self): FILE: megatron/core/quantization/quant_config.py class MatchContext (line 66) | class MatchContext: class QuantizationConfig (line 73) | class QuantizationConfig: method __init__ (line 76) | def __init__(self, config: dict, match_input: MatchContext, config_key... method __repr__ (line 87) | def __repr__(self) -> str: class Matcher (line 94) | class Matcher(ABC): method match (line 98) | def match(self, context: MatchContext) -> Optional[str]: class GlobMatcher (line 108) | class GlobMatcher(Matcher): method __init__ (line 117) | def __init__(self, pattern: str, config_key: str): method match (line 121) | def match(self, context: MatchContext) -> Optional[str]: method __repr__ (line 127) | def __repr__(self) -> str: class RecipeConfig (line 131) | class RecipeConfig: method __init__ (line 134) | def __init__(self, matchers: List[Matcher], config_dict: Dict[str, Dic... method _build_matchers (line 139) | def _build_matchers(matchers_dict: Dict | None) -> List[Matcher]: method from_yaml_file (line 172) | def from_yaml_file(recipe_yaml_path: str) -> "RecipeConfig": method from_config_dict (line 190) | def from_config_dict(config: Dict) -> "RecipeConfig": method match_to_config_key (line 199) | def match_to_config_key(self, operator_context: MatchContext) -> str |... method match (line 218) | def match(self, operator_context: MatchContext) -> QuantizationConfig ... method __repr__ (line 230) | def __repr__(self) -> str: FILE: megatron/core/quantization/utils.py function get_quant_config_or_none (line 9) | def get_quant_config_or_none( function load_quantization_recipe (line 23) | def load_quantization_recipe(recipe_path: str) -> RecipeConfig: function kitchen_quantization_recipe_config (line 29) | def kitchen_quantization_recipe_config(recipe_idx: int) -> RecipeConfig: FILE: megatron/core/rerun_state_machine.py class Caller (line 46) | class Caller(NamedTuple): class Call (line 53) | class Call(NamedTuple): class RerunDiagnostic (line 60) | class RerunDiagnostic(str, Enum): class RerunMode (line 74) | class RerunMode(str, Enum): class RerunState (line 82) | class RerunState(Enum): class RerunValidationStatus (line 114) | class RerunValidationStatus(str, Enum): class RerunStateMachine (line 129) | class RerunStateMachine: method __init__ (line 185) | def __init__( method set_mode (line 241) | def set_mode(self, mode: RerunMode) -> None: method get_mode (line 247) | def get_mode(self) -> RerunMode: method _reduce_any (line 252) | def _reduce_any(self, value: Union[bool, List[bool]]) -> Union[bool, T... method should_run_forward_backward (line 270) | def should_run_forward_backward(self, data_iterator: DataIteratorArgTy... method should_checkpoint_and_exit (line 398) | def should_checkpoint_and_exit(self) -> Tuple[bool, bool, int]: method validate_result (line 463) | def validate_result( method is_unexpectedly_large (line 697) | def is_unexpectedly_large( method state_dict (line 767) | def state_dict( method validate_state_dict (line 846) | def validate_state_dict(self, state_dict: dict[str, Any]) -> bool: method load_state_dict (line 862) | def load_state_dict(self, state_dict: dict[str, Any]) -> None: method _sanitize_data_iterators (line 910) | def _sanitize_data_iterators( method _get_validation_call_info (line 927) | def _get_validation_call_info(self, message: str) -> Call: method _save_state (line 943) | def _save_state(self) -> None: method _restore_state (line 964) | def _restore_state(self) -> None: method _maybe_report_stats (line 975) | def _maybe_report_stats(self) -> None: method _log_validation_error_to_file (line 1002) | def _log_validation_error_to_file( method get_skipped_iterations_from_tracker_file (line 1022) | def get_skipped_iterations_from_tracker_file(cls, tracker_file_name: s... class RerunDataIterator (line 1090) | class RerunDataIterator: method __init__ (line 1109) | def __init__(self, iterable: Iterable[Any]) -> None: method __next__ (line 1115) | def __next__(self) -> Any: method rewind (line 1130) | def rewind(self) -> None: method advance (line 1136) | def advance(self) -> None: method state_dict (line 1142) | def state_dict(self) -> SerializableStateType: method load_state_dict (line 1151) | def load_state_dict(self, state_dict: SerializableStateType) -> None: class QuickStats (line 1159) | class QuickStats: method __init__ (line 1166) | def __init__(self, max_size: int = 100000) -> None: method record (line 1173) | def record(self, data: float) -> None: method combine (line 1187) | def combine(self, others: list["QuickStats"]) -> None: method reset (line 1200) | def reset(self) -> None: method print_stats (line 1208) | def print_stats(self) -> str: method __getstate_ (line 1230) | def __getstate_(self) -> Any: method __setstate (line 1235) | def __setstate(self, state: Any) -> Any: class RerunErrorInjector (line 1244) | class RerunErrorInjector: method __init__ (line 1253) | def __init__( method maybe_inject (line 1268) | def maybe_inject(self) -> bool: method maybe_miscompare (line 1286) | def maybe_miscompare( method state_dict (line 1323) | def state_dict(self) -> SerializableStateType: method load_state_dict (line 1333) | def load_state_dict(self, state_dict: SerializableStateType) -> None: function initialize_rerun_state_machine (line 1343) | def initialize_rerun_state_machine(*args, **kwargs) -> None: function destroy_rerun_state_machine (line 1353) | def destroy_rerun_state_machine() -> None: function get_rerun_state_machine (line 1360) | def get_rerun_state_machine() -> RerunStateMachine: function _set_rerun_state_machine (line 1370) | def _set_rerun_state_machine(rerun_state_machine) -> None: function _compare_floats (line 1378) | def _compare_floats(a: torch.Tensor, b: torch.Tensor) -> float: FILE: megatron/core/resharding/copy_services/base.py class CopyService (line 9) | class CopyService(ABC): method submit_send (line 13) | def submit_send(self, src_tensor: torch.Tensor, dest_rank: int): method submit_recv (line 18) | def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int): method run (line 23) | def run(self): FILE: megatron/core/resharding/copy_services/gloo_copy_service.py class SendOp (line 17) | class SendOp: class RecvOp (line 26) | class RecvOp: class GlooCopyService (line 34) | class GlooCopyService(CopyService): method __init__ (line 40) | def __init__(self, group=None): method submit_send (line 57) | def submit_send(self, src_tensor: torch.Tensor, dest_rank: int): method submit_send_with_id (line 60) | def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, ... method submit_recv (line 64) | def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int): method submit_recv_with_id (line 73) | def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor,... method run (line 80) | def run(self): FILE: megatron/core/resharding/copy_services/nccl_copy_service.py class SendOp (line 17) | class SendOp: class RecvOp (line 26) | class RecvOp: class NCCLCopyService (line 34) | class NCCLCopyService(CopyService): method __init__ (line 40) | def __init__(self, group=None): method submit_send (line 53) | def submit_send(self, src_tensor: torch.Tensor, dest_rank: int): method submit_send_with_id (line 56) | def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, ... method submit_recv (line 60) | def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int): method submit_recv_with_id (line 64) | def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor,... method run (line 68) | def run(self): FILE: megatron/core/resharding/copy_services/nvshmem_copy_service.py class NVSHMEMCopyService (line 17) | class NVSHMEMCopyService(CopyService): method __init__ (line 20) | def __init__(self, group=None): method _ensure_initialized (line 38) | def _ensure_initialized(self): method submit_send (line 46) | def submit_send(self, src_tensor: torch.Tensor, dest_rank: int): method submit_recv (line 60) | def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int): method submit_send_with_id (line 72) | def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, ... method submit_recv_with_id (line 100) | def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor,... method run (line 127) | def run(self): FILE: megatron/core/resharding/execution.py function _is_mxfp8_tensor (line 17) | def _is_mxfp8_tensor(param): function execute_reshard_plan (line 26) | def execute_reshard_plan( FILE: megatron/core/resharding/nvshmem_copy_service/compat.py function _patch_cuda_core_experimental (line 19) | def _patch_cuda_core_experimental(): function get_cuda_core_device_class (line 31) | def get_cuda_core_device_class(): function ensure_nvshmem_compat (line 47) | def ensure_nvshmem_compat(): FILE: megatron/core/resharding/nvshmem_copy_service/core/gpu_resource_manager.py class GPUResourceManager (line 32) | class GPUResourceManager: method __init__ (line 35) | def __init__(self): method init (line 56) | def init(self, group=None) -> None: method get_stream (line 154) | def get_stream(self, name: str): method get_torch_stream (line 172) | def get_torch_stream(self, name: str) -> Optional[torch.cuda.ExternalS... method create_events (line 184) | def create_events(self, num_events: int = 2): method finalize (line 200) | def finalize(self) -> None: FILE: megatron/core/resharding/nvshmem_copy_service/core/kernel_launcher.py class KernelLauncher (line 23) | class KernelLauncher: method __init__ (line 26) | def __init__(self): method load_kernels (line 32) | def load_kernels(self) -> None: method set_streams (line 47) | def set_streams(self, pack_stream, unpack_stream) -> None: method launch_pack (line 63) | def launch_pack( method launch_unpack (line 106) | def launch_unpack( FILE: megatron/core/resharding/nvshmem_copy_service/core/pipeline_executor.py class PipelineExecutor (line 31) | class PipelineExecutor: method __init__ (line 34) | def __init__( method set_streams (line 65) | def set_streams( method set_events (line 87) | def set_events(self, pack_events: List, unpack_events: List, barrier_e... method execute_pipeline (line 93) | def execute_pipeline( method _launch_pack (line 244) | def _launch_pack(self, iteration: int, batch: ScheduledBatch) -> None: method _launch_unpack (line 256) | def _launch_unpack(self, iteration: int, batch: ScheduledBatch) -> None: method process_self_moves (line 268) | def process_self_moves( FILE: megatron/core/resharding/nvshmem_copy_service/logger.py class ColoredFormatter (line 25) | class ColoredFormatter(logging.Formatter): method __init__ (line 28) | def __init__(self, fmt: str, pe_id: int, use_color: bool = True): method formatTime (line 46) | def formatTime(self, record, datefmt=None): method format (line 58) | def format(self, record): class PELogger (line 74) | class PELogger: method init (line 82) | def init(cls, pe_id: int, level: str = "INFO", logs_dir: str = "logs"): method set_level (line 136) | def set_level(cls, level: str): method trace (line 155) | def trace(cls, msg: str): method debug (line 161) | def debug(cls, msg: str): method info (line 167) | def info(cls, msg: str): method summary (line 173) | def summary(cls, msg: str): method warn (line 179) | def warn(cls, msg: str): method warning (line 185) | def warning(cls, msg: str): method error (line 190) | def error(cls, msg: str): method critical (line 196) | def critical(cls, msg: str): method shutdown (line 202) | def shutdown(cls): FILE: megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py class DoubleBufferManager (line 25) | class DoubleBufferManager: method __init__ (line 28) | def __init__(self, slot_size: int = MAX_SEGMENT_SIZE): method allocate (line 39) | def allocate(self) -> None: method get_send_slot (line 58) | def get_send_slot(self, iteration: int): method get_recv_slot (line 70) | def get_recv_slot(self, iteration: int): method free (line 82) | def free(self) -> None: FILE: megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py class TensorPointerExtractor (line 14) | class TensorPointerExtractor: method get_pointer (line 18) | def get_pointer(tensor: Any) -> int: FILE: megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py class SendRequest (line 12) | class SendRequest: class ReceiveRequest (line 23) | class ReceiveRequest: class WorkloadGroup (line 34) | class WorkloadGroup: class ScheduledBatch (line 43) | class ScheduledBatch: class WorkloadSummary (line 58) | class WorkloadSummary: class TransferMetadata (line 67) | class TransferMetadata: FILE: megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py class CommunicationScheduler (line 9) | class CommunicationScheduler: method __init__ (line 16) | def __init__(self): method build_schedule (line 19) | def build_schedule( method _collect_all_batches (line 65) | def _collect_all_batches( method _assign_iterations (line 110) | def _assign_iterations(self, batches: List[ScheduledBatch]): method _exchange_workload_summaries (line 195) | def _exchange_workload_summaries( FILE: megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py class GPUExecutionPlanner (line 26) | class GPUExecutionPlanner: method __init__ (line 29) | def __init__(self): method create_gpu_plans (line 33) | def create_gpu_plans( method _plan_kernel_args (line 158) | def _plan_kernel_args( FILE: megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py class TaskSegmenter (line 17) | class TaskSegmenter: method _encode_segment_id (line 23) | def _encode_segment_id(self, task_id: int, segment_index: int) -> int: method _calculate_num_segments (line 26) | def _calculate_num_segments(self, size: int) -> int: method _validate_segmentation (line 29) | def _validate_segmentation(self, task_id: int, size: int) -> bool: method segment_send_request (line 42) | def segment_send_request(self, req: SendRequest) -> List[SendRequest]: method segment_receive_request (line 72) | def segment_receive_request(self, req: ReceiveRequest) -> List[Receive... FILE: megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py class WorkloadPacker (line 9) | class WorkloadPacker: method pack_workloads (line 15) | def pack_workloads( method _pack_single_destination (line 53) | def _pack_single_destination( FILE: megatron/core/resharding/nvshmem_copy_service/service.py class RemoteCopyService (line 34) | class RemoteCopyService: method __init__ (line 42) | def __init__(self, group=None): method my_pe (line 70) | def my_pe(self) -> int: method n_pes (line 75) | def n_pes(self) -> int: method device (line 80) | def device(self): method initialized (line 85) | def initialized(self) -> bool: method init (line 89) | def init(self, log_level: str = "INFO") -> None: method register_send (line 169) | def register_send( method register_receive (line 189) | def register_receive( method schedule (line 209) | def schedule(self) -> None: method run (line 281) | def run(self) -> None: method clear_requests (line 322) | def clear_requests(self) -> None: method finalize (line 336) | def finalize(self) -> None: method _segment_tasks (line 357) | def _segment_tasks(self) -> None: method _prepare_iter_schedules (line 381) | def _prepare_iter_schedules( FILE: megatron/core/resharding/nvshmem_copy_service/validation.py class ValidationResult (line 19) | class ValidationResult: class ValidationSummary (line 36) | class ValidationSummary: method all_passed (line 46) | def all_passed(self) -> bool: function generate_deterministic_data (line 51) | def generate_deterministic_data(task_id: int, size: int, device: str = "... function validate_received_data (line 71) | def validate_received_data( function log_validation_summary (line 113) | def log_validation_summary(summary: ValidationSummary) -> None: FILE: megatron/core/resharding/planner.py function _build_descriptors_for_param (line 24) | def _build_descriptors_for_param( function _plan_multi_dim_lcm (line 70) | def _plan_multi_dim_lcm( function _plan_block_interleaved (line 155) | def _plan_block_interleaved( function _finalize_dp_transfers (line 275) | def _finalize_dp_transfers( function _determine_source_ranks_for_dst_param (line 322) | def _determine_source_ranks_for_dst_param( function build_centralized_reshard_plan (line 354) | def build_centralized_reshard_plan( FILE: megatron/core/resharding/refit.py class _PlanCacheKey (line 36) | class _PlanCacheKey: function _get_config_tuple (line 48) | def _get_config_tuple(core) -> Optional[Tuple[int, int, int, int, int]]: function _build_plan_cache_key (line 75) | def _build_plan_cache_key( function get_or_create_service (line 103) | def get_or_create_service(backend: RefitBackendName, group=None) -> Copy... function clear_service_cache (line 129) | def clear_service_cache(): function clear_plan_cache (line 149) | def clear_plan_cache(): function clear_all_caches (line 157) | def clear_all_caches(): function _unwrap_model_cores (line 165) | def _unwrap_model_cores(src_model, target_model): function _build_or_get_plan (line 199) | def _build_or_get_plan(src_core, tgt_core, num_experts, group, src_rank_... function _needs_mxfp8_conversion (line 219) | def _needs_mxfp8_conversion(model) -> bool: function _setup_mxfp8_transform_on_plan (line 231) | def _setup_mxfp8_transform_on_plan(plan, target_model) -> None: function prepare_swap_model_weights (line 275) | def prepare_swap_model_weights( function swap_model_weights (line 322) | def swap_model_weights( function reshard_model_weights (line 379) | def reshard_model_weights( FILE: megatron/core/resharding/transforms.py class ReshardTransform (line 17) | class ReshardTransform: method should_transform (line 33) | def should_transform(self, param_name: str) -> bool: method prepare_send (line 37) | def prepare_send( method prepare_recv (line 48) | def prepare_recv(self, param_name: str, dst_slice: tuple[slice, ...]) ... method finalize_recv (line 52) | def finalize_recv( function _scale_slice_from_data_slice (line 69) | def _scale_slice_from_data_slice( function _ensure_sendable (line 100) | def _ensure_sendable(param: torch.Tensor) -> torch.Tensor: class MXFP8ReshardTransform (line 117) | class MXFP8ReshardTransform(ReshardTransform): method __init__ (line 161) | def __init__( method should_transform (line 178) | def should_transform(self, param_name: str) -> bool: method prepare_send (line 183) | def prepare_send(self, param_name, src_slice, src_param): method prepare_recv (line 196) | def prepare_recv(self, param_name, dst_slice): method finalize_recv (line 220) | def finalize_recv(self, param_name, dst_slice, recv_buffers): FILE: megatron/core/resharding/utils.py class TransferOp (line 17) | class TransferOp: class ParameterMetadata (line 35) | class ParameterMetadata: class ShardingDescriptor (line 86) | class ShardingDescriptor: class ReshardPlan (line 98) | class ReshardPlan: method __str__ (line 104) | def __str__(self): function _get_rank_in_group (line 113) | def _get_rank_in_group(global_rank: int, group_ranks: list[int]) -> int: function _detect_expert_index_from_param_name (line 123) | def _detect_expert_index_from_param_name(param_name: str) -> Optional[int]: function assign_ep_resolved_name_inplace (line 137) | def assign_ep_resolved_name_inplace( function assign_resolved_name_inplace (line 183) | def assign_resolved_name_inplace( function _build_layer_module_prefix_map (line 201) | def _build_layer_module_prefix_map(module: torch.nn.Module) -> dict[str,... function _resolve_global_layer_number_in_name (line 225) | def _resolve_global_layer_number_in_name( function extract_param_metadata (line 252) | def extract_param_metadata( function select_src_metadata_balanced (line 370) | def select_src_metadata_balanced( FILE: megatron/core/safe_globals.py function register_safe_globals (line 39) | def register_safe_globals(): FILE: megatron/core/ssm/gated_delta_net.py class GatedDeltaNetSubmodules (line 60) | class GatedDeltaNetSubmodules: class GatedDeltaNet (line 70) | class GatedDeltaNet(MegatronModule): method __init__ (line 77) | def __init__( method reset_parameters (line 231) | def reset_parameters(self): method forward (line 253) | def forward( method _apply_gated_norm (line 416) | def _apply_gated_norm(self, x, gate): method sharded_state_dict (line 427) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... method backward_dw (line 515) | def backward_dw(self): method _backward_in_proj (line 520) | def _backward_in_proj(self): method _backward_out_proj (line 524) | def _backward_out_proj(self): function _split_tensor_factory (line 529) | def _split_tensor_factory( function torch_chunk_gated_delta_rule (line 589) | def torch_chunk_gated_delta_rule( FILE: megatron/core/ssm/mamba_block.py class MambaStackSubmodules (line 36) | class MambaStackSubmodules: class MambaStack (line 48) | class MambaStack(GraphableMegatronModule, MegatronModule): method __init__ (line 73) | def __init__( method set_input_tensor (line 168) | def set_input_tensor(self, input_tensor: Tensor): method mamba_state_shapes_per_request (line 178) | def mamba_state_shapes_per_request(self) -> Optional[Tuple[Tuple[int],... method _should_call_local_cudagraph (line 188) | def _should_call_local_cudagraph(self, *args, **kwargs): method __call__ (line 211) | def __call__(self, *args, **kwargs): method forward (line 221) | def forward( method sharded_state_dict (line 354) | def sharded_state_dict( FILE: megatron/core/ssm/mamba_context_parallel.py class MambaContextParallel (line 31) | class MambaContextParallel: method __init__ (line 61) | def __init__( method pre_conv_ssm (line 133) | def pre_conv_ssm( method post_conv_ssm (line 194) | def post_conv_ssm( method conv1d (line 206) | def conv1d(self, input_: torch.Tensor) -> torch.Tensor: method conv1d_channels (line 225) | def conv1d_channels(self): method get_conv1d_weight (line 231) | def get_conv1d_weight(self) -> torch.Tensor: method get_conv1d_bias (line 236) | def get_conv1d_bias(self) -> torch.Tensor: method get_dt_bias (line 241) | def get_dt_bias(self) -> torch.Tensor: method get_A_log (line 245) | def get_A_log(self) -> torch.Tensor: method get_D (line 249) | def get_D(self) -> torch.Tensor: method _slice_conv_param (line 253) | def _slice_conv_param(self, param: torch.Tensor) -> torch.Tensor: method _slice_vector_param (line 288) | def _slice_vector_param(self, param: torch.Tensor, has_hdim: bool = Fa... function _all_to_all_cp2hp (line 304) | def _all_to_all_cp2hp( function _all_to_all_hp2cp (line 343) | def _all_to_all_hp2cp( function _undo_attention_load_balancing (line 379) | def _undo_attention_load_balancing( function _redo_attention_load_balancing (line 417) | def _redo_attention_load_balancing( FILE: megatron/core/ssm/mamba_hybrid_layer_allocation.py class Symbols (line 14) | class Symbols: class ParsedHybridPattern (line 27) | class ParsedHybridPattern: function pattern_from_ratios (line 59) | def pattern_from_ratios( function get_hybrid_total_layer_count (line 113) | def get_hybrid_total_layer_count(pattern: str) -> int: function get_hybrid_total_pipeline_segment_count (line 130) | def get_hybrid_total_pipeline_segment_count(pattern: str) -> int: function get_hybrid_layer_counts (line 146) | def get_hybrid_layer_counts(pattern: str) -> Dict[str, int]: function parse_hybrid_pattern (line 185) | def parse_hybrid_pattern(pattern: Optional[str]) -> ParsedHybridPattern: function _validate_pattern (line 262) | def _validate_pattern(pattern: str, pattern_name: str, allow_pipe: bool ... function validate_segment_layers (line 282) | def validate_segment_layers(segment: str) -> List[str]: function select_pipeline_segment (line 307) | def select_pipeline_segment( function get_layer_maps_from_layer_type_list (line 464) | def get_layer_maps_from_layer_type_list( FILE: megatron/core/ssm/mamba_layer.py class LayerNormBuilder (line 29) | class LayerNormBuilder(Protocol): method __call__ (line 32) | def __call__(self, config: TransformerConfig, hidden_size: int, /) -> ... class MambaLayerSubmodules (line 36) | class MambaLayerSubmodules: class MambaLayer (line 59) | class MambaLayer(GraphableMegatronModule): method __init__ (line 67) | def __init__( method create_mcore_cudagraph_manager (line 95) | def create_mcore_cudagraph_manager(self, config): method mamba_state_shapes_per_request (line 102) | def mamba_state_shapes_per_request(self) -> Tuple[Tuple[int], Tuple[in... method forward (line 106) | def forward( method sharded_state_dict (line 154) | def sharded_state_dict( method _te_cuda_graph_replay (line 177) | def _te_cuda_graph_replay(self, *args, **kwargs): method _should_call_local_cudagraph (line 190) | def _should_call_local_cudagraph(self, *args, **kwargs): FILE: megatron/core/ssm/mamba_mixer.py class ExtendedRMSNorm (line 93) | class ExtendedRMSNorm(RMSNormGated): method sharded_state_dict (line 98) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... class MambaMixerSubmodules (line 114) | class MambaMixerSubmodules: class MambaMixer (line 123) | class MambaMixer(MegatronModule): method __init__ (line 154) | def __init__( method forward (line 406) | def forward( method _dynamic_inference (line 457) | def _dynamic_inference(self, hidden_states: torch.Tensor, context: Dyn... method _dynamic_inference_prefill (line 576) | def _dynamic_inference_prefill( method _decode (line 637) | def _decode( method _ssm_training (line 671) | def _ssm_training( method _ssm_prefill (line 727) | def _ssm_prefill( method _ssm_decode (line 1122) | def _ssm_decode( method mamba_state_shapes_per_request (line 1299) | def mamba_state_shapes_per_request(self) -> Tuple[Tuple[int], Tuple[in... method _get_states_from_cache (line 1305) | def _get_states_from_cache(self, inference_context, batch_size, *, inf... method sharded_state_dict (line 1345) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... function _split_tensor_factory (line 1434) | def _split_tensor_factory( function _check_mamba_sequence_packing_support (line 1494) | def _check_mamba_sequence_packing_support( FILE: megatron/core/ssm/mlp_layer.py class MLPLayer (line 13) | class MLPLayer(TransformerLayer): method __init__ (line 16) | def __init__( FILE: megatron/core/ssm/ops/causal_conv1d_triton.py function causal_conv1d_update_kernel (line 26) | def causal_conv1d_update_kernel( function causal_conv1d_update (line 194) | def causal_conv1d_update( FILE: megatron/core/ssm/ops/causal_conv1d_varlen.py function _causal_conv1d_varlen_kernel (line 28) | def _causal_conv1d_varlen_kernel( function causal_conv1d_varlen_fn (line 119) | def causal_conv1d_varlen_fn( function _causal_conv1d_varlen_simple (line 211) | def _causal_conv1d_varlen_simple( FILE: megatron/core/ssm/ops/determinism.py function use_deterministic_mode (line 27) | def use_deterministic_mode(): function set_deterministic_mode (line 37) | def set_deterministic_mode(value): function _estimate_config_cost (line 43) | def _estimate_config_cost(cfg): function _filter_configs_by_block_sizes (line 58) | def _filter_configs_by_block_sizes(configs): function autotune_configs (line 81) | def autotune_configs(configs): function alloc_tile_workspace (line 106) | def alloc_tile_workspace(base_shape, tile_dim, dtype, device, determinis... function finalize_tile_workspace (line 117) | def finalize_tile_workspace(tensor, deterministic): FILE: megatron/core/ssm/ops/mamba_ssm.py function softplus (line 32) | def softplus(dt): function softplus (line 39) | def softplus(dt): function _selective_scan_update_kernel (line 53) | def _selective_scan_update_kernel( function selective_state_update (line 274) | def selective_state_update( FILE: megatron/core/ssm/ops/ssd_bmm.py function _bmm_chunk_fwd_kernel (line 65) | def _bmm_chunk_fwd_kernel( function _bmm_chunk_fwd (line 143) | def _bmm_chunk_fwd(a, b, chunk_size, cu_chunk_seqlens, causal=False, out... FILE: megatron/core/ssm/ops/ssd_chunk_scan.py function _chunk_scan_fwd_kernel (line 78) | def _chunk_scan_fwd_kernel( function _chunk_scan_fwd (line 317) | def _chunk_scan_fwd( FILE: megatron/core/ssm/ops/ssd_chunk_state.py function softplus (line 21) | def softplus(dt): # pylint: disable=C0116 function softplus (line 28) | def softplus(dt): # pylint: disable=C0116 function _chunk_cumsum_fwd_kernel (line 47) | def _chunk_cumsum_fwd_kernel( function _chunk_state_fwd_kernel (line 175) | def _chunk_state_fwd_kernel( function _chunk_cumsum_fwd (line 272) | def _chunk_cumsum_fwd( function _chunk_state_fwd (line 319) | def _chunk_state_fwd(B, x, dt, dA_cumsum, cu_chunk_seqlens, states=None,... function _chunk_state_varlen_kernel (line 427) | def _chunk_state_varlen_kernel( function chunk_state_varlen (line 583) | def chunk_state_varlen( FILE: megatron/core/ssm/ops/ssd_combined.py function is_int_pow_2 (line 18) | def is_int_pow_2(n): function _mamba_chunk_scan_combined_fwd (line 23) | def _mamba_chunk_scan_combined_fwd( function mamba_chunk_scan_combined_varlen (line 163) | def mamba_chunk_scan_combined_varlen( FILE: megatron/core/ssm/ops/ssd_state_passing.py function _state_passing_fwd_kernel (line 27) | def _state_passing_fwd_kernel( function _state_passing_fwd (line 105) | def _state_passing_fwd( FILE: megatron/core/ssm/triton_cache_manager.py function _version_no_greater_than (line 20) | def _version_no_greater_than(version, version_limit): function default_cache_dir (line 26) | def default_cache_dir(): class ParallelFileCacheManager (line 31) | class ParallelFileCacheManager(FileCacheManager): method put (line 51) | def put(self, data, filename, binary=True) -> str: FILE: megatron/core/tensor_parallel/cross_entropy.py class VocabParallelCrossEntropy (line 16) | class VocabParallelCrossEntropy: method calculate_logits_max (line 23) | def calculate_logits_max( method calculate_predicted_logits (line 35) | def calculate_predicted_logits( method calculate_cross_entropy_loss (line 71) | def calculate_cross_entropy_loss( method prepare_gradient_calculation_operands (line 85) | def prepare_gradient_calculation_operands( method calculate_gradients (line 104) | def calculate_gradients( class _VocabParallelCrossEntropy (line 122) | class _VocabParallelCrossEntropy(torch.autograd.Function): method forward (line 124) | def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): method backward (line 192) | def backward(ctx, grad_output): function vocab_parallel_cross_entropy (line 219) | def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_sm... FILE: megatron/core/tensor_parallel/data.py function _check_data_types (line 10) | def _check_data_types(keys, data, target_dtype): function _build_key_size_numel_dictionaries (line 20) | def _build_key_size_numel_dictionaries(keys, data, tp_group=None): function broadcast_data (line 64) | def broadcast_data(keys, data, datatype, tp_group=None): FILE: megatron/core/tensor_parallel/inference_layers.py function _te_rms_norm_kernel (line 38) | def _te_rms_norm_kernel(x: torch.Tensor, weight: torch.Tensor, eps: float): function _apply_linear (line 48) | def _apply_linear( class InferenceLayerNormColumnParallelLinear (line 63) | class InferenceLayerNormColumnParallelLinear(TELayerNormColumnParallelLi... method __init__ (line 68) | def __init__( method _maybe_allocate_symmetric_buffer (line 121) | def _maybe_allocate_symmetric_buffer(self, x: torch.Tensor): method _all_gather (line 131) | def _all_gather(self, x: torch.Tensor, symm_mem_buffer: dict) -> None: method forward (line 155) | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, None]: class InferenceColumnParallelLinear (line 190) | class InferenceColumnParallelLinear(TEColumnParallelLinear): method __init__ (line 195) | def __init__( method _maybe_allocate_symmetric_buffer (line 240) | def _maybe_allocate_symmetric_buffer(self, x: torch.Tensor): method _all_gather (line 250) | def _all_gather(self, x: torch.Tensor, symm_mem_buffer: dict) -> None: method forward (line 270) | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, None]: class InferenceRowParallelLinear (line 288) | class InferenceRowParallelLinear(TERowParallelLinear): method __init__ (line 293) | def __init__( method _matmul_reduce_scatter (line 340) | def _matmul_reduce_scatter(self, x, residual=None): method _set_next_layer_norm_weights (line 404) | def _set_next_layer_norm_weights(self, weights: torch.Tensor): method _set_residual (line 410) | def _set_residual(self, residual: torch.Tensor): method forward (line 416) | def forward( FILE: megatron/core/tensor_parallel/layers.py function param_is_not_tensor_parallel_duplicate (line 91) | def param_is_not_tensor_parallel_duplicate(param, tp_group=None): function set_tensor_model_parallel_attributes (line 103) | def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): function set_defaults_if_not_set_tensor_model_parallel_attributes (line 114) | def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor): function copy_tensor_model_parallel_attributes (line 125) | def copy_tensor_model_parallel_attributes(destination_tensor, source_ten... function _initialize_affine_weight_gpu (line 136) | def _initialize_affine_weight_gpu(weight, init_method, partition_dim, st... function _initialize_affine_weight_cpu (line 151) | def _initialize_affine_weight_cpu( class VocabParallelEmbedding (line 197) | class VocabParallelEmbedding(torch.nn.Module): method __init__ (line 212) | def __init__( method forward (line 270) | def forward(self, input_): method sharded_state_dict (line 305) | def sharded_state_dict( class LinearWithFrozenWeight (line 327) | class LinearWithFrozenWeight(torch.autograd.Function): method forward (line 338) | def forward(ctx, input, weight, bias, allreduce_dgrad, tp_group): method backward (line 350) | def backward(ctx, grad_output): function linear_with_frozen_weight (line 362) | def linear_with_frozen_weight( class LinearWithGradAccumulationAndAsyncCommunication (line 433) | class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Fun... method forward (line 438) | def forward( method backward (line 485) | def backward(ctx, grad_output): function linear_with_grad_accumulation_and_async_allreduce (line 619) | def linear_with_grad_accumulation_and_async_allreduce( class ColumnParallelLinear (line 731) | class ColumnParallelLinear(torch.nn.Module): method __init__ (line 780) | def __init__( method _forward_impl (line 928) | def _forward_impl(self, input, weight, *args, **kwargs): method forward (line 934) | def forward( method backward_dw (line 1033) | def backward_dw(self) -> None: method sharded_state_dict (line 1040) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... method set_extra_state (line 1052) | def set_extra_state(self, state: Any): method get_extra_state (line 1055) | def get_extra_state(self) -> None: method extra_repr (line 1060) | def extra_repr(self) -> str: class RowParallelLinear (line 1072) | class RowParallelLinear(torch.nn.Module): method __init__ (line 1108) | def __init__( method _forward_impl (line 1223) | def _forward_impl(self, input, weight, *args, **kwargs): method forward (line 1229) | def forward(self, input_: torch.Tensor) -> tuple[torch.Tensor, torch.T... method backward_dw (line 1287) | def backward_dw(self) -> None: method sharded_state_dict (line 1294) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N... method set_extra_state (line 1306) | def set_extra_state(self, state: Any): method get_extra_state (line 1309) | def get_extra_state(self) -> None: method extra_repr (line 1314) | def extra_repr(self) -> str: FILE: megatron/core/tensor_parallel/mappings.py function _reduce (line 22) | def _reduce(input_, group): function _split_along_last_dim (line 36) | def _split_along_last_dim(input_, group): function _split_along_first_dim (line 56) | def _split_along_first_dim(input_, group): function _gather_along_last_dim (line 80) | def _gather_along_last_dim(input_, group): function _reduce_scatter_along_last_dim (line 99) | def _reduce_scatter_along_last_dim(input_, group): function _gather_along_first_dim (line 114) | def _gather_along_first_dim(input_, group, output_split_sizes=None, use_... function _reduce_scatter_along_first_dim (line 155) | def _reduce_scatter_along_first_dim(input_, group, input_split_sizes=Non... class _CopyToModelParallelRegion (line 197) | class _CopyToModelParallelRegion(torch.autograd.Function): method symbolic (line 201) | def symbolic(graph, input_, group): method forward (line 206) | def forward(ctx, input_, group): method backward (line 212) | def backward(ctx, grad_output): class _ReduceFromModelParallelRegion (line 217) | class _ReduceFromModelParallelRegion(torch.autograd.Function): method symbolic (line 221) | def symbolic(graph, input_, group): method forward (line 226) | def forward(ctx, input_, group): method backward (line 231) | def backward(ctx, grad_output): class _ScatterToModelParallelRegion (line 236) | class _ScatterToModelParallelRegion(torch.autograd.Function): method symbolic (line 240) | def symbolic(graph, input_, group): method forward (line 245) | def forward(ctx, input_, group): method backward (line 251) | def backward(ctx, grad_output): class _GatherFromModelParallelRegion (line 256) | class _GatherFromModelParallelRegion(torch.autograd.Function): method symbolic (line 260) | def symbolic(graph, input_, group): method forward (line 265) | def forward(ctx, input_, group): method backward (line 271) | def backward(ctx, grad_output): class _ScatterToSequenceParallelRegion (line 276) | class _ScatterToSequenceParallelRegion(torch.autograd.Function): method symbolic (line 280) | def symbolic(graph, input_, group): method forward (line 285) | def forward(ctx, input_, group): method backward (line 291) | def backward(ctx, grad_output): class _GatherFromSequenceParallelRegion (line 296) | class _GatherFromSequenceParallelRegion(torch.autograd.Function): method symbolic (line 300) | def symbolic( method forward (line 312) | def forward( method backward (line 328) | def backward(ctx, grad_output): class _ReduceScatterToSequenceParallelRegion (line 351) | class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): method symbolic (line 355) | def symbolic(graph, input_, group, input_split_sizes=None, use_global_... method forward (line 360) | def forward(ctx, input_, group, input_split_sizes=None, use_global_buf... method backward (line 368) | def backward(ctx, grad_output): class _AllGatherFromTensorParallelRegion (line 380) | class _AllGatherFromTensorParallelRegion(torch.autograd.Function): method symbolic (line 384) | def symbolic(graph, input_, group): method forward (line 389) | def forward(ctx, input_, group): method backward (line 395) | def backward(ctx, grad_output): class _ReduceScatterToTensorParallelRegion (line 400) | class _ReduceScatterToTensorParallelRegion(torch.autograd.Function): method symbolic (line 404) | def symbolic(graph, input_, group): method forward (line 409) | def forward(ctx, input_, group): method backward (line 415) | def backward(ctx, grad_output): class _AllToAll (line 420) | class _AllToAll(torch.autograd.Function): method forward (line 422) | def forward(ctx, group, input, output_split_sizes, input_split_sizes): method backward (line 454) | def backward(ctx, *grad_output): function copy_to_tensor_model_parallel_region (line 469) | def copy_to_tensor_model_parallel_region(input_, group=None): function reduce_from_tensor_model_parallel_region (line 475) | def reduce_from_tensor_model_parallel_region(input_, group=None): function scatter_to_tensor_model_parallel_region (line 481) | def scatter_to_tensor_model_parallel_region(input_, group=None): function gather_from_tensor_model_parallel_region (line 487) | def gather_from_tensor_model_parallel_region(input_, group=None): function scatter_to_sequence_parallel_region (line 493) | def scatter_to_sequence_parallel_region(input_, group=None): function gather_from_sequence_parallel_region (line 499) | def gather_from_sequence_parallel_region( function reduce_scatter_to_sequence_parallel_region (line 513) | def reduce_scatter_to_sequence_parallel_region( function all_gather_last_dim_from_tensor_parallel_region (line 523) | def all_gather_last_dim_from_tensor_parallel_region(input_, group=None): function reduce_scatter_last_dim_to_tensor_parallel_region (line 529) | def reduce_scatter_last_dim_to_tensor_parallel_region(input_, group=None): function all_to_all (line 535) | def all_to_all(group, input_, output_split_sizes_=None, input_split_size... function all_to_all_sp2hp (line 541) | def all_to_all_sp2hp(input_, group=None): function all_to_all_hp2sp (line 570) | def all_to_all_hp2sp(input_, group=None): FILE: megatron/core/tensor_parallel/random.py function _get_share_storage (line 65) | def _get_share_storage(): function _get_cuda_rng_state (line 96) | def _get_cuda_rng_state( function _set_cuda_rng_state (line 127) | def _set_cuda_rng_state(new_state: torch.Tensor, device: int = -1, graph... function convert_cuda_rng_state (line 169) | def convert_cuda_rng_state( function get_expert_parallel_rng_tracker_name (line 204) | def get_expert_parallel_rng_tracker_name(): function get_data_parallel_rng_tracker_name (line 210) | def get_data_parallel_rng_tracker_name(): class CudaRNGStatesTracker (line 216) | class CudaRNGStatesTracker: method __init__ (line 225) | def __init__(self, use_cudagraphable_rng=False, is_inference_rng_track... method is_initialized (line 238) | def is_initialized(self): method reset (line 242) | def reset(self): method get_states (line 258) | def get_states(self): method set_states (line 266) | def set_states(self, states): method add (line 272) | def add(self, name, seed): method fork (line 298) | def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): function initialize_rng_tracker (line 341) | def initialize_rng_tracker( function get_cuda_rng_tracker (line 401) | def get_cuda_rng_tracker( function get_all_rng_states (line 411) | def get_all_rng_states(): function model_parallel_cuda_manual_seed (line 433) | def model_parallel_cuda_manual_seed( function is_graph_safe_cuda_rng_tracker (line 487) | def is_graph_safe_cuda_rng_tracker(cuda_rng_tracker): function _get_all_rng_states (line 499) | def _get_all_rng_states(): function _set_all_rng_states (line 509) | def _set_all_rng_states(cpu_rng_state, cuda_rng_state, cuda_rng_state_tr... function _fork_rng (line 519) | def _fork_rng(): function _set_checkpointing (line 534) | def _set_checkpointing(): function _unset_checkpointing (line 540) | def _unset_checkpointing(): function is_checkpointing (line 546) | def is_checkpointing(): class CheckpointFunction (line 555) | class CheckpointFunction(torch.autograd.Function): method forward (line 565) | def forward( method backward (line 599) | def backward(ctx, *args): function checkpoint (line 637) | def checkpoint( class CheckpointWithoutOutputFunction (line 645) | class CheckpointWithoutOutputFunction(torch.autograd.Function): method forward (line 652) | def forward( method backward (line 678) | def backward(ctx, *args): class CheckpointWithoutOutput (line 692) | class CheckpointWithoutOutput(object): method __init__ (line 706) | def __init__(self, fp8=False): method checkpoint (line 715) | def checkpoint(self, run_function: Callable[[Unpack[_Ts]], _R], *args:... method _recompute (line 735) | def _recompute(self, _): method discard_output_and_register_recompute (line 797) | def discard_output_and_register_recompute(self, hook_tensor): FILE: megatron/core/tensor_parallel/utils.py function split_tensor_along_last_dim (line 22) | def split_tensor_along_last_dim( function split_tensor_into_1d_equal_chunks (line 48) | def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False, tp_group... function gather_split_1d_tensor (line 79) | def gather_split_1d_tensor(tensor, tp_group=None): class VocabUtility (line 97) | class VocabUtility: method vocab_range_from_per_partition_vocab_size (line 105) | def vocab_range_from_per_partition_vocab_size( method vocab_range_from_global_vocab_size (line 114) | def vocab_range_from_global_vocab_size( FILE: megatron/core/timers.py class TimerBase (line 35) | class TimerBase(ABC): method __init__ (line 38) | def __init__(self, name): method start (line 42) | def start(self, barrier=False): method stop (line 51) | def stop(self, barrier=False): method reset (line 60) | def reset(self): method elapsed (line 65) | def elapsed(self, reset=True, barrier=False): class DummyTimer (line 78) | class DummyTimer(TimerBase): method __init__ (line 81) | def __init__(self): method start (line 84) | def start(self, barrier=False): method stop (line 87) | def stop(self, barrier=False): method reset (line 90) | def reset(self): method elapsed (line 93) | def elapsed(self, reset=True, barrier=False): method active_time (line 99) | def active_time(self): class Timer (line 109) | class Timer(TimerBase): method __init__ (line 121) | def __init__(self, name): method set_barrier_group (line 135) | def set_barrier_group(self, barrier_group): method start (line 143) | def start(self, barrier=False): method stop (line 156) | def stop(self, barrier=False): method reset (line 171) | def reset(self): method set_elapsed (line 177) | def set_elapsed(self, value): method elapsed (line 188) | def elapsed(self, reset=True, barrier=False): method active_time (line 212) | def active_time(self): class Timers (line 217) | class Timers: method __init__ (line 220) | def __init__(self, log_level, log_option): method __call__ (line 241) | def __call__(self, name, log_level=None): method _get_elapsed_time_all_ranks (line 270) | def _get_elapsed_time_all_ranks(self, names, reset, barrier): method _get_global_min_max_time (line 318) | def _get_global_min_max_time(self, names, reset, barrier, normalizer): method _get_global_min_max_time_string (line 338) | def _get_global_min_max_time_string(self, names, reset, barrier, norma... method _get_all_ranks_time_string (line 357) | def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): method get_all_timers_string (line 378) | def get_all_timers_string( method log (line 422) | def log( method write (line 452) | def write( FILE: megatron/core/tokenizers/base_tokenizer.py class MegatronTokenizerBase (line 6) | class MegatronTokenizerBase(ABC): method __init__ (line 9) | def __init__(self, path: str, config: dict, **kwargs) -> None: method tokenize (line 26) | def tokenize(self): method detokenize (line 31) | def detokenize(self): method vocab (line 36) | def vocab(self): method vocab_size (line 41) | def vocab_size(self): method apply_chat_template (line 46) | def apply_chat_template(self): FILE: megatron/core/tokenizers/megatron_tokenizer.py class MegatronTokenizer (line 37) | class MegatronTokenizer: method __init__ (line 40) | def __init__(self) -> None: method from_pretrained (line 46) | def from_pretrained( method write_metadata (line 104) | def write_metadata( function _get_metadata_path (line 170) | def _get_metadata_path(tokenizer_path: str) -> str: function _get_tokenizer_model_class (line 188) | def _get_tokenizer_model_class(library: str, metadata: dict) -> Megatron... FILE: megatron/core/tokenizers/text/libraries/abstract_tokenizer.py class MegatronTokenizerTextAbstract (line 7) | class MegatronTokenizerTextAbstract(ABC): method text_to_tokens (line 13) | def text_to_tokens(self, text: str) -> List[str]: method tokens_to_text (line 26) | def tokens_to_text(self, tokens: List[str]) -> str: method tokens_to_ids (line 39) | def tokens_to_ids(self, tokens: List[str]) -> List[int]: method ids_to_tokens (line 52) | def ids_to_tokens(self, ids: List[int]) -> List[str]: method text_to_ids (line 65) | def text_to_ids(self, text: str) -> List[int]: method ids_to_text (line 78) | def ids_to_text(self, ids: List[int]) -> str: method add_special_tokens (line 91) | def add_special_tokens(self): method cls_id (line 96) | def cls_id(self) -> int: method sep_id (line 103) | def sep_id(self) -> int: method pad_id (line 110) | def pad_id(self) -> int: method eod (line 117) | def eod(self) -> int: method bos_id (line 129) | def bos_id(self) -> int: method eos_id (line 136) | def eos_id(self) -> int: method mask_id (line 143) | def mask_id(self) -> int: FILE: megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py class ByteLevelTokenizer (line 8) | class ByteLevelTokenizer(MegatronTokenizerTextAbstract): method __init__ (line 15) | def __init__( method text_to_tokens (line 61) | def text_to_tokens(self, text): method tokens_to_text (line 67) | def tokens_to_text(self, tokens): method text_to_ids (line 73) | def text_to_ids(self, text): method ids_to_text (line 79) | def ids_to_text(self, ids): method tokens_to_ids (line 87) | def tokens_to_ids(self, tokens): method ids_to_tokens (line 98) | def ids_to_tokens(self, ids): method token_to_id (line 109) | def token_to_id(self, token): method id_to_token (line 118) | def id_to_token(self, id): method add_special_tokens (line 127) | def add_special_tokens(self, special_tokens: Union[list, dict]) -> None: method pad_id (line 132) | def pad_id(self): method bos_id (line 139) | def bos_id(self): method eos_id (line 146) | def eos_id(self): method eod (line 153) | def eod(self): method unk_id (line 160) | def unk_id(self): FILE: megatron/core/tokenizers/text/libraries/chat_template.py class MegatronTokenizerChatTemplate (line 13) | class MegatronTokenizerChatTemplate: method apply_chat_template (line 16) | def apply_chat_template( FILE: megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py class HuggingFaceTokenizer (line 20) | class HuggingFaceTokenizer(MegatronTokenizerTextAbstract): method __init__ (line 26) | def __init__( method add_special_tokens (line 183) | def add_special_tokens(self, special_tokens_dict: dict) -> int: method additional_special_tokens_ids (line 213) | def additional_special_tokens_ids(self): method text_to_tokens (line 220) | def text_to_tokens(self, text: str) -> List[str]: method tokens_to_text (line 225) | def tokens_to_text(self, tokens: List[str]) -> str: method token_to_id (line 230) | def token_to_id(self, token: str) -> int: method tokens_to_ids (line 234) | def tokens_to_ids(self, tokens: List[str]) -> List[int]: method ids_to_tokens (line 239) | def ids_to_tokens(self, ids: List[int]) -> List[str]: method text_to_ids (line 244) | def text_to_ids(self, text: str) -> List[int]: method ids_to_text (line 252) | def ids_to_text(self, ids: List[int], remove_special_tokens: Optional[... method apply_chat_template (line 271) | def apply_chat_template(self, conversation, chat_template, **kwargs): method vocab (line 278) | def vocab(self) -> list: method inv_vocab (line 284) | def inv_vocab(self) -> dict: method vocab_size (line 291) | def vocab_size(self) -> int: method pad_id (line 296) | def pad_id(self) -> int: method bos_id (line 303) | def bos_id(self) -> int: method eos_id (line 310) | def eos_id(self) -> int: method eod (line 315) | def eod(self) -> int: method sep_id (line 322) | def sep_id(self) -> int: method cls_id (line 329) | def cls_id(self) -> int: method unk_id (line 336) | def unk_id(self) -> int: method mask_id (line 343) | def mask_id(self) -> int: method save_vocabulary (line 349) | def save_vocabulary(self, save_directory: str, filename_prefix: str = ... method save_pretrained (line 355) | def save_pretrained(self, save_directory: str): FILE: megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py class MegatronHFTokenizer (line 79) | class MegatronHFTokenizer(HuggingFaceTokenizer): method __init__ (line 82) | def __init__( method _get_vocab_file (line 102) | def _get_vocab_file(self, tokenizer_name: str, vocab_file: str = None)... method _get_merges_file (line 122) | def _get_merges_file(self, tokenizer_name: str, merges_file: str = Non... method _get_available_models_list (line 144) | def _get_available_models_list(self) -> list: method _download (line 149) | def _download(self, path: str, url: str): FILE: megatron/core/tokenizers/text/libraries/null_tokenizer.py class NullTokenizer (line 6) | class NullTokenizer: method __init__ (line 14) | def __init__(self, vocab_size): method text_to_ids (line 19) | def text_to_ids(self, text): method ids_to_text (line 23) | def ids_to_text(self, ids): method tokens_to_ids (line 28) | def tokens_to_ids(self, tokens): method ids_to_tokens (line 32) | def ids_to_tokens(self, ids): method offsets (line 36) | def offsets(self, ids: list[int], text: str) -> list[int]: method unique_identifiers (line 45) | def unique_identifiers(self) -> OrderedDict: method vocab_size (line 50) | def vocab_size(self): method vocab (line 55) | def vocab(self): method inv_vocab (line 60) | def inv_vocab(self): method cls (line 65) | def cls(self): method sep (line 70) | def sep(self): method mask (line 75) | def mask(self): method eod (line 80) | def eod(self): method additional_special_tokens_ids (line 85) | def additional_special_tokens_ids(self): FILE: megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py class SentencePieceTokenizer (line 22) | class SentencePieceTokenizer(MegatronTokenizerTextAbstract, MegatronToke... method __init__ (line 25) | def __init__( method text_to_tokens (line 92) | def text_to_tokens(self, text: str) -> List[str]: method text_to_ids (line 141) | def text_to_ids(self, text, sample_alpha=None) -> List[int]: method _text_to_ids (line 148) | def _text_to_ids(self, text, sample_alpha=None) -> List[int]: method _text_to_ids_extra_space (line 204) | def _text_to_ids_extra_space(self, text, sample_alpha=None) -> List[int]: method tokens_to_text (line 219) | def tokens_to_text(self, tokens: List[str]) -> str: method ids_to_text (line 226) | def ids_to_text(self, ids: List[int]) -> str: method token_to_id (line 246) | def token_to_id(self, token: str) -> int: method ids_to_tokens (line 253) | def ids_to_tokens(self, ids: List[int]) -> List[str]: method tokens_to_ids (line 263) | def tokens_to_ids( method add_special_tokens (line 275) | def add_special_tokens(self, special_tokens: Union[list, dict]) -> None: method offsets (line 313) | def offsets(self, ids: list[int], text: str) -> list[int]: method pad_id (line 318) | def pad_id(self) -> int: method bos_id (line 327) | def bos_id(self) -> int: method eos_id (line 336) | def eos_id(self) -> int: method sep_id (line 345) | def sep_id(self) -> int: method cls_id (line 356) | def cls_id(self) -> int: method mask_id (line 367) | def mask_id(self) -> int: method unk_id (line 378) | def unk_id(self) -> int: method additional_special_tokens_ids (line 383) | def additional_special_tokens_ids(self) -> list: method vocab (line 401) | def vocab(self) -> list: method inv_vocab (line 413) | def inv_vocab(self) -> dict: FILE: megatron/core/tokenizers/text/libraries/sft_tokenizer.py class PromptConfig (line 27) | class PromptConfig: class SFTTokenizer (line 46) | class SFTTokenizer: method __init__ (line 49) | def __init__(self, tokenizer_path: str, prompt_format: str): method tokenize_conversation (line 111) | def tokenize_conversation( method text_to_ids (line 188) | def text_to_ids(self, text: Union[str, List[Dict]]): method tokens_to_ids (line 198) | def tokens_to_ids(self, tokens: List[str]): method ids_to_text (line 202) | def ids_to_text(self, tokens: List[int]): method ids_to_tokens (line 206) | def ids_to_tokens(self): method text_to_tokens (line 210) | def text_to_tokens(self): method tokens_to_text (line 214) | def tokens_to_text(self): method get_special_tokens (line 218) | def get_special_tokens(self): method add_special_tokens (line 222) | def add_special_tokens(self): method pad_id (line 227) | def pad_id(self): method bos_id (line 232) | def bos_id(self): method eod (line 237) | def eod(self): method vocab (line 242) | def vocab(self): method inv_vocab (line 247) | def inv_vocab(self): method vocab_size (line 252) | def vocab_size(self): FILE: megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py function reload_mergeable_ranks (line 31) | def reload_mergeable_ranks( class TikTokenTokenizer (line 76) | class TikTokenTokenizer(MegatronTokenizerTextAbstract, MegatronTokenizer... method __init__ (line 79) | def __init__( method text_to_tokens (line 172) | def text_to_tokens(self, text: str) -> List[str]: method tokens_to_text (line 177) | def tokens_to_text(self, tokens: List[int]) -> str: method token_to_id (line 182) | def token_to_id(self, token: str) -> int: method tokens_to_ids (line 189) | def tokens_to_ids(self, tokens: List[str]) -> List[int]: method id_to_token (line 193) | def id_to_token(self, token_id: int) -> str: method ids_to_tokens (line 201) | def ids_to_tokens(self, token_ids: List[int]) -> List[str]: method text_to_ids (line 209) | def text_to_ids(self, text: str) -> List[int]: method ids_to_text (line 214) | def ids_to_text(self, tokens: List[int], remove_special_tokens: bool =... method add_special_tokens (line 232) | def add_special_tokens(self, special_tokens_dict: dict): method offsets (line 236) | def offsets(self, ids: list[int], text: str) -> list[int]: method additional_special_tokens_ids (line 255) | def additional_special_tokens_ids(self) -> list: method bos_id (line 271) | def bos_id(self) -> int: method eos_id (line 276) | def eos_id(self) -> int: method eod (line 281) | def eod(self) -> int: method unk_id (line 286) | def unk_id(self) -> int: method mask_id (line 291) | def mask_id(self) -> int: method pad_id (line 296) | def pad_id(self) -> int: method cls_id (line 301) | def cls_id(self) -> int: method sep_id (line 306) | def sep_id(self) -> int: method vocab (line 311) | def vocab(self): method decoder (line 316) | def decoder(self): method encoder (line 321) | def encoder(self): method vocab_size (line 326) | def vocab_size(self) -> int: method inv_vocab (line 331) | def inv_vocab(self) -> dict: FILE: megatron/core/tokenizers/text/models/bert_tokenizer.py class BertTokenizer (line 6) | class BertTokenizer(MegatronTokenizerText): method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->... FILE: megatron/core/tokenizers/text/models/default_tokenizer.py class DefaultTokenizerText (line 6) | class DefaultTokenizerText(MegatronTokenizerText): method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->... FILE: megatron/core/tokenizers/text/models/gpt_tokenizer.py class GPTTokenizer (line 6) | class GPTTokenizer(MegatronTokenizerText): method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->... FILE: megatron/core/tokenizers/text/models/mamba_tokenizer.py class MambaTokenizer (line 6) | class MambaTokenizer(MegatronTokenizerText): method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->... FILE: megatron/core/tokenizers/text/models/t5_tokenizer.py class T5Tokenizer (line 6) | class T5Tokenizer(MegatronTokenizerText): method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->... FILE: megatron/core/tokenizers/text/parsers/base_parser.py class BaseParser (line 5) | class BaseParser: method parse (line 9) | def parse(text: str, **kwargs) -> tuple[str, dict[str, Any]]: FILE: megatron/core/tokenizers/text/parsers/deepseek_r1_reasoning_parser.py class DeepSeekR1ReasoningParser (line 5) | class DeepSeekR1ReasoningParser(BaseParser): method parse (line 9) | def parse(text: str, **kwargs) -> tuple[str, dict[str, str]]: FILE: megatron/core/tokenizers/text/parsers/qwen3_coder_tool_parser.py class _Qwen3CoderToolParser (line 25) | class _Qwen3CoderToolParser: method _generate_tool_call_id (line 40) | def _generate_tool_call_id(self) -> str: method _get_arguments_config (line 44) | def _get_arguments_config( method _convert_param_value (line 69) | def _convert_param_value( method _parse_xml_function_call (line 172) | def _parse_xml_function_call( method _get_function_calls (line 202) | def _get_function_calls(self, model_output: str) -> list[str]: method extract_tool_calls (line 218) | def extract_tool_calls( class Qwen3CoderToolParser (line 259) | class Qwen3CoderToolParser(BaseParser): method parse (line 263) | def parse(text: str, **kwargs) -> tuple[str, dict[str, list[dict]]]: FILE: megatron/core/tokenizers/text/text_tokenizer.py class MegatronTokenizerText (line 22) | class MegatronTokenizerText(MegatronTokenizerBase): method __init__ (line 25) | def __init__(self, path: str, config: dict, **kwargs) -> None: method _restore_model (line 53) | def _restore_model(self, **kwargs) -> MegatronTokenizerTextAbstract: method tokenize (line 65) | def tokenize(self, text: str) -> List[int]: method detokenize (line 78) | def detokenize(self, ids: List[int]) -> str: method apply_chat_template (line 91) | def apply_chat_template( method tokenize_conversation (line 116) | def tokenize_conversation( method save_pretrained (line 142) | def save_pretrained(self, path: str) -> None: method add_special_tokens (line 157) | def add_special_tokens(self, special_tokens: Union[list, dict]) -> None: method offsets (line 171) | def offsets(self, ids: list[int], text: str) -> list[int]: method space_sensitive (line 176) | def space_sensitive(self): method additional_special_tokens_ids (line 186) | def additional_special_tokens_ids(self) -> list: method vocab_size (line 191) | def vocab_size(self) -> int: method vocab (line 196) | def vocab(self): method unique_identifiers (line 201) | def unique_identifiers(self) -> OrderedDict: method pad (line 212) | def pad(self) -> int: method pad_id (line 217) | def pad_id(self) -> int: method eod (line 222) | def eod(self) -> int: method bos (line 227) | def bos(self) -> int: method bos_id (line 232) | def bos_id(self) -> int: method eos_id (line 237) | def eos_id(self) -> int: method eos (line 242) | def eos(self) -> int: method unk (line 247) | def unk(self) -> int: method unk_id (line 252) | def unk_id(self) -> int: method mask (line 257) | def mask(self) -> int: method mask_id (line 262) | def mask_id(self) -> int: method cls (line 267) | def cls(self) -> int: method cls_id (line 272) | def cls_id(self) -> int: method sep (line 277) | def sep(self) -> int: method sep_id (line 282) | def sep_id(self) -> int: method vocab_file (line 287) | def vocab_file(self) -> str: method merges_file (line 292) | def merges_file(self) -> str: method inv_vocab (line 297) | def inv_vocab(self) -> dict: FILE: megatron/core/tokenizers/utils/build_tokenizer.py function build_tokenizer (line 15) | def build_tokenizer(args, **kwargs): function vocab_size_with_padding (line 98) | def vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True): function _set_padded_vocab_size (line 114) | def _set_padded_vocab_size(args, tokenizer): FILE: megatron/core/tokenizers/vision/libraries/multimodal_tokenizer.py class MegatronMultimodalTokenizer (line 55) | class MegatronMultimodalTokenizer: method __init__ (line 58) | def __init__( method _apply_image_tag (line 190) | def _apply_image_tag(self, text: Union[str, List[Dict]]): method tokenize (line 205) | def tokenize(self, text: Union[str, List[Dict]]): method _encode (line 213) | def _encode(self, text: str): method tokenize_conversation (line 218) | def tokenize_conversation( method convert_tokens_to_ids (line 305) | def convert_tokens_to_ids(self, tokens: List[str]): method detokenize (line 309) | def detokenize(self, tokens: List[int]): method add_special_tokens (line 313) | def add_special_tokens(self, special_tokens: List[str]): method get_special_tokens (line 317) | def get_special_tokens(self): method pad (line 322) | def pad(self): method eod (line 327) | def eod(self): method vocab_size (line 332) | def vocab_size(self): method vocab (line 337) | def vocab(self): FILE: megatron/core/tokenizers/vision/libraries/null_multimodal_tokenizer.py class MegatronNullMultimodalTokenizer (line 4) | class MegatronNullMultimodalTokenizer: method __init__ (line 7) | def __init__(self, vocab_size, image_token=None, image_token_id=None): method tokenize (line 22) | def tokenize(self, text): method detokenize (line 34) | def detokenize(self, ids): method offsets (line 47) | def offsets(self, ids: list[int], text: str) -> list[int]: method convert_tokens_to_ids (line 55) | def convert_tokens_to_ids(self, tokens): method vocab_size (line 63) | def vocab_size(self): method cls (line 68) | def cls(self): method sep (line 73) | def sep(self): method mask (line 78) | def mask(self): method eod (line 83) | def eod(self): method additional_special_tokens_ids (line 88) | def additional_special_tokens_ids(self): FILE: megatron/core/tokenizers/vision/models/default_tokenizer.py class DefaultTokenizerVision (line 6) | class DefaultTokenizerVision(MegatronTokenizerVision): method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->... FILE: megatron/core/tokenizers/vision/vision_tokenizer.py class MegatronTokenizerVision (line 16) | class MegatronTokenizerVision(MegatronTokenizerBase): method __init__ (line 19) | def __init__(self, path: str, config: dict, **kwargs) -> None: method _restore_model (line 34) | def _restore_model(self, **kwargs): method tokenize (line 46) | def tokenize(self, text: Union[str, List[Dict]]) -> List[int]: method detokenize (line 59) | def detokenize(self, ids: List[int]) -> str: method tokenize_conversation (line 72) | def tokenize_conversation( method add_special_tokens (line 94) | def add_special_tokens(self, special_tokens: Union[list, dict]) -> None: method convert_tokens_to_ids (line 108) | def convert_tokens_to_ids(self, tokens: List[str]): method apply_chat_template (line 112) | def apply_chat_template(self): method get_special_tokens (line 116) | def get_special_tokens(self) -> list: method offsets (line 120) | def offsets(self, ids: list[int], text: str) -> list[int]: method vocab (line 125) | def vocab(self): method vocab_size (line 130) | def vocab_size(self) -> int: method pad (line 135) | def pad(self): method eod (line 140) | def eod(self): FILE: megatron/core/transformer/attention.py class LinearQkv (line 118) | class LinearQkv(Protocol): method forward (line 121) | def forward(self, input: Tensor, /) -> tuple[Tensor, object]: method backward_dw (line 125) | def backward_dw(self) -> None: class LinearQkvBuilder (line 130) | class LinearQkvBuilder(Protocol): method __call__ (line 133) | def __call__( class LinearLayer (line 150) | class LinearLayer(Protocol): method forward (line 153) | def forward(self, input: Tensor, /) -> Tuple[Tensor, object]: class LinearLayerBuilder (line 158) | class LinearLayerBuilder(Protocol): method __call__ (line 161) | def __call__( class CoreAttention (line 176) | class CoreAttention(Protocol): method forward (line 179) | def forward( class CoreAttentionBuilder (line 195) | class CoreAttentionBuilder(Protocol): method __call__ (line 198) | def __call__( class SelfAttentionSubmodules (line 212) | class SelfAttentionSubmodules: class CrossAttentionSubmodules (line 225) | class CrossAttentionSubmodules: class Attention (line 236) | class Attention(MegatronModule, ABC): method __init__ (line 243) | def __init__( method _checkpointed_attention_forward (line 381) | def _checkpointed_attention_forward( method _allocate_memory (line 421) | def _allocate_memory(self, inference_max_sequence_length, batch_size, ... method _get_pp_layer_offset_for_inference (line 433) | def _get_pp_layer_offset_for_inference(self): method _adjust_key_value_for_inference (line 455) | def _adjust_key_value_for_inference( method get_query_key_value_tensors (line 635) | def get_query_key_value_tensors( method flash_decode (line 651) | def flash_decode( method _flash_attention_3_forward_wrapper (line 697) | def _flash_attention_3_forward_wrapper( method flash_decode_and_prefill (line 765) | def flash_decode_and_prefill( method forward (line 884) | def forward( method _apply_output_gate (line 1227) | def _apply_output_gate(self, x, gate): method set_for_recompute_input_layernorm (line 1235) | def set_for_recompute_input_layernorm(self): method clip_qk (line 1239) | def clip_qk(self): class SelfAttention (line 1247) | class SelfAttention(Attention): method __init__ (line 1254) | def __init__( method run_realtime_tests (line 1309) | def run_realtime_tests(self): method get_query_key_value_tensors (line 1380) | def get_query_key_value_tensors( method backward_dw (line 1512) | def backward_dw(self) -> None: method _backward_qkv_proj (line 1517) | def _backward_qkv_proj(self): method _backward_output_proj (line 1521) | def _backward_output_proj(self): method set_for_recompute_input_layernorm (line 1525) | def set_for_recompute_input_layernorm(self): method clip_qk (line 1531) | def clip_qk(self): method _clip_linear_qkv (line 1580) | def _clip_linear_qkv(self, weight): class CrossAttention (line 1629) | class CrossAttention(Attention): method __init__ (line 1636) | def __init__( method get_query_key_value_tensors (line 1681) | def get_query_key_value_tensors( FILE: megatron/core/transformer/cuda_graphs.py function is_graph_capturing (line 84) | def is_graph_capturing(): function _set_capture_start (line 89) | def _set_capture_start(): function _set_capture_end (line 95) | def _set_capture_end(): function is_graph_warmup (line 101) | def is_graph_warmup(): function _set_warmup_start (line 106) | def _set_warmup_start(): function _set_warmup_end (line 112) | def _set_warmup_end(): class CudagraphBufferMetadata (line 119) | class CudagraphBufferMetadata: class ArgMetadata (line 135) | class ArgMetadata: method __init__ (line 138) | def __init__(self, arg): method zeros_like (line 152) | def zeros_like(self): class TensorReusePool (line 161) | class TensorReusePool: method insert (line 179) | def insert(self, tensor: torch.Tensor): method owns (line 184) | def owns(self, tensor: torch.Tensor): method get (line 188) | def get(self, meta: ArgMetadata): function tree_map (line 204) | def tree_map(func, tree): function _check_supported_type (line 223) | def _check_supported_type(meta): function _determine_if_first_last_layer_of_this_vp_chunk (line 249) | def _determine_if_first_last_layer_of_this_vp_chunk(base_module): function _clone_nested_tensors (line 278) | def _clone_nested_tensors(value: Any) -> Any: function _ensure_generator_state_is_cudagraph_safe (line 293) | def _ensure_generator_state_is_cudagraph_safe(gen: torch.Generator) -> t... class _CudagraphGlobalRecord (line 320) | class _CudagraphGlobalRecord: method record_fwd_graph (line 338) | def record_fwd_graph(cls, runner, args, kwargs, out): method record_bwd_graph (line 343) | def record_bwd_graph(cls, runner): method create_cudagraphs (line 348) | def create_cudagraphs(cls): function create_cudagraphs (line 478) | def create_cudagraphs(): function delete_cuda_graphs (line 491) | def delete_cuda_graphs(): class _GraphStatus (line 521) | class _GraphStatus(Enum): class _CudagraphRecordNode (line 528) | class _CudagraphRecordNode(torch.autograd.Function): method forward (line 533) | def forward(ctx, runner, inputs): method backward (line 544) | def backward(ctx, grads): class _CudagraphReplayNode (line 560) | class _CudagraphReplayNode(torch.autograd.Function): method forward (line 565) | def forward(ctx, runner, is_first_microbatch, *inputs): method backward (line 618) | def backward(ctx, *grads): class _CudaGraphRunner (line 670) | class _CudaGraphRunner(torch.nn.Module): method __init__ (line 675) | def __init__( method __str__ (line 746) | def __str__(self): method get_quantization_context (line 752) | def get_quantization_context(self): method get_connected_params (line 765) | def get_connected_params(self, outputs): method create_fwd_graph (line 789) | def create_fwd_graph(self, args, kwargs, outputs=None, clone_inputs=Tr... method create_bwd_graph (line 1023) | def create_bwd_graph(self): method apply_cudagraph_record_metadata (line 1170) | def apply_cudagraph_record_metadata(self, args, kwargs, outputs): method record_graph_capture (line 1188) | def record_graph_capture(self, args, kwargs): method replay_graph_capture (line 1245) | def replay_graph_capture(self, is_first_microbatch, args, kwargs): method get_mismatch_errors (line 1268) | def get_mismatch_errors(self, args, kwargs): method get_arg_metas (line 1333) | def get_arg_metas(self, args, kwargs=None): method get_tensors (line 1348) | def get_tensors(self, args, kwargs=None, check_types=True): method to_list (line 1379) | def to_list(self, x): class CudaGraphManager (line 1384) | class CudaGraphManager(torch.nn.Module): method __init__ (line 1390) | def __init__( method call_ddp_preforward_hook (line 1452) | def call_ddp_preforward_hook(self, module): method get_cudagraph_runner (line 1467) | def get_cudagraph_runner(self, megatron_module, args, kwargs, reuse_cu... method __call__ (line 1546) | def __call__(self, megatron_module, args, kwargs): function _layer_is_graphable (line 1641) | def _layer_is_graphable(layer, config): class TECudaGraphHelper (line 1684) | class TECudaGraphHelper: method __init__ (line 1693) | def __init__(self, model, config, seq_length, micro_batch_size, optimi... method _discover_layers (line 1729) | def _discover_layers(self): method capture_finished (line 1804) | def capture_finished(self): method graphs_created (line 1814) | def graphs_created(self): method _get_sample_arguments (line 1823) | def _get_sample_arguments(self, order, chunk_id_list=None): method _get_cuda_graph_input_data (line 2068) | def _get_cuda_graph_input_data(self): method _start_capturing (line 2217) | def _start_capturing(self): method _reset_after_capture (line 2233) | def _reset_after_capture(self): method _finish_capturing (line 2247) | def _finish_capturing(self, start_time): method create_cudagraphs (line 2268) | def create_cudagraphs(self): method cuda_graph_set_manual_hooks (line 2315) | def cuda_graph_set_manual_hooks(self): method delete_cuda_graphs (line 2325) | def delete_cuda_graphs(self): function convert_schedule_table_to_order (line 2356) | def convert_schedule_table_to_order(num_warmup_microbatches, num_model_c... function get_overlap_moe_expert_parallel_comm_order (line 2382) | def get_overlap_moe_expert_parallel_comm_order(order, num_layers_per_chu... function set_current_microbatch (line 2491) | def set_current_microbatch(model, microbatch_id): function _wrap_graph_for_vision (line 2540) | def _wrap_graph_for_vision(graph_fn): function get_vision_cuda_graph_seq_length (line 2562) | def get_vision_cuda_graph_seq_length(vision_config, default_seq_length: ... class VisionTECudaGraphHelper (line 2593) | class VisionTECudaGraphHelper(TECudaGraphHelper): method __init__ (line 2624) | def __init__( method _discover_layers (line 2639) | def _discover_layers(self): method _reset_after_capture (line 2693) | def _reset_after_capture(self): method _finish_capturing (line 2702) | def _finish_capturing(self, start_time): method _get_sample_arguments (line 2715) | def _get_sample_arguments(self, order, chunk_id_list=None): method cuda_graph_set_manual_hooks (line 2757) | def cuda_graph_set_manual_hooks(self): FILE: megatron/core/transformer/custom_layers/batch_invariant_kernels.py function _matmul_launch_metadata (line 42) | def _matmul_launch_metadata( function _compute_pid (line 63) | def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM... function matmul_kernel_persistent (line 73) | def matmul_kernel_persistent( function get_compute_units (line 154) | def get_compute_units(): function matmul_persistent (line 178) | def matmul_persistent(a: torch.Tensor, b: torch.Tensor, bias: torch.Tens... function _log_softmax_kernel (line 251) | def _log_softmax_kernel( function log_softmax (line 308) | def log_softmax(input: torch.Tensor, dim: int = -1) -> torch.Tensor: function mean_kernel (line 344) | def mean_kernel( function mean_dim (line 391) | def mean_dim( function mm_batch_invariant (line 481) | def mm_batch_invariant(a, b): function addmm_batch_invariant (line 486) | def addmm_batch_invariant(bias, a, b): function _log_softmax_batch_invariant (line 491) | def _log_softmax_batch_invariant(input, dim, _half_to_float): function mean_batch_invariant (line 496) | def mean_batch_invariant(input, dim, keepdim=False, dtype: torch.dtype |... function get_batch_invariant_attention_block_size (line 516) | def get_batch_invariant_attention_block_size() -> AttentionBlockSize: function _import_module_if_available (line 530) | def _import_module_if_available(name: str): function _te_patch_for_batch_invariant (line 537) | def _te_patch_for_batch_invariant(): function _te_unpatch_for_batch_invariant (line 621) | def _te_unpatch_for_batch_invariant(): function _extract_te_gemm_args (line 694) | def _extract_te_gemm_args(args: tuple, kwargs: Dict[str, Any]): function _is_supported_dtype_for_bik (line 709) | def _is_supported_dtype_for_bik(t: torch.dtype) -> bool: class BatchInvariantTEGemmFn (line 713) | class BatchInvariantTEGemmFn(torch.autograd.Function): method forward (line 717) | def forward( method backward (line 775) | def backward(ctx, grad_output: torch.Tensor): function _te_general_gemm_patched (line 830) | def _te_general_gemm_patched(*args, **kwargs) -> List[torch.Tensor]: class BatchInvariantRMSNormFn (line 877) | class BatchInvariantRMSNormFn(torch.autograd.Function): method forward (line 881) | def forward(ctx, x: torch.Tensor, weight: torch.Tensor, eps: float, ze... method backward (line 912) | def backward(ctx, grad_output: torch.Tensor): function rmsnorm_batch_invariant (line 938) | def rmsnorm_batch_invariant(x: torch.Tensor, weight: torch.Tensor, eps: ... function _te_rmsnorm_forward_patched (line 948) | def _te_rmsnorm_forward_patched(self, x: torch.Tensor) -> torch.Tensor: function is_batch_invariant_mode_enabled (line 960) | def is_batch_invariant_mode_enabled(): function enable_batch_invariant_mode (line 965) | def enable_batch_invariant_mode(): function disable_batch_invariant_mode (line 981) | def disable_batch_invariant_mode(): function set_batch_invariant_mode (line 993) | def set_batch_invariant_mode(enabled: bool = True): FILE: megatron/core/transformer/dot_product_attention.py class DotProductAttention (line 26) | class DotProductAttention(MegatronModule): method __init__ (line 42) | def __init__( method forward (line 142) | def forward( method sharded_state_dict (line 253) | def sharded_state_dict( FILE: megatron/core/transformer/enums.py class ModelType (line 8) | class ModelType(enum.Enum): class LayerType (line 17) | class LayerType(enum.Enum): class AttnType (line 33) | class AttnType(enum.Enum): class AttnMaskType (line 40) | class AttnMaskType(enum.Enum): class AttnBackend (line 51) | class AttnBackend(enum.Enum): class CudaGraphScope (line 61) | class CudaGraphScope(enum.Enum): FILE: megatron/core/transformer/experimental_attention_variant/absorbed_mla.py class AbsorbedMLASelfAttentionSubmodules (line 62) | class AbsorbedMLASelfAttentionSubmodules: class AbsorbedMLASelfAttention (line 79) | class AbsorbedMLASelfAttention(Attention): method __init__ (line 93) | def __init__( method get_query_key_value_tensors (line 339) | def get_query_key_value_tensors( method _checkpointed_attention_forward (line 633) | def _checkpointed_attention_forward( method forward (line 692) | def forward( method backward_dw (line 805) | def backward_dw(self) -> NoReturn: method _backward_kv_proj (line 811) | def _backward_kv_proj(self): method _backward_q_proj (line 817) | def _backward_q_proj(self): method _backward_output_proj (line 825) | def _backward_output_proj(self): method set_for_recompute_input_layernorm (line 829) | def set_for_recompute_input_layernorm(self): method clip_qk (line 837) | def clip_qk(self): method _combine_kv_weights (line 845) | def _combine_kv_weights(self, k_weight, v_weight): method _split_kv_weights (line 882) | def _split_kv_weights(self, combined_weight): method _load_from_state_dict (line 917) | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): FILE: megatron/core/transformer/experimental_attention_variant/dsa.py function rotate_activation (line 30) | def rotate_activation(x: torch.Tensor) -> torch.Tensor: class DSAIndexerLossLoggingHelper (line 49) | class DSAIndexerLossLoggingHelper: method save_loss_to_tracker (line 55) | def save_loss_to_tracker( method clean_loss_in_tracker (line 83) | def clean_loss_in_tracker(): method reduce_loss_in_tracker (line 92) | def reduce_loss_in_tracker(): method track_indexer_metrics (line 116) | def track_indexer_metrics( function compute_dsa_indexer_loss (line 161) | def compute_dsa_indexer_loss( function _compute_index_scores (line 255) | def _compute_index_scores(q: torch.Tensor, weights: torch.Tensor, k: tor... function fused_qk_topk_naive (line 298) | def fused_qk_topk_naive( function fwd_fused_indexer_loss_naive (line 326) | def fwd_fused_indexer_loss_naive( function bwd_fused_indexer_loss_naive (line 346) | def bwd_fused_indexer_loss_naive( class FusedDSAIndexerLoss (line 510) | class FusedDSAIndexerLoss(torch.autograd.Function): method forward (line 514) | def forward( method backward (line 555) | def backward(ctx, grad_topk_indices, grad_loss): class DSAIndexerLossAutoScaler (line 579) | class DSAIndexerLossAutoScaler(torch.autograd.Function): method forward (line 589) | def forward(ctx, output: torch.Tensor, indexer_loss: torch.Tensor): method backward (line 603) | def backward(ctx, grad_output: torch.Tensor): method set_loss_scale (line 623) | def set_loss_scale(scale: torch.Tensor): class DSAIndexerSubmodules (line 636) | class DSAIndexerSubmodules: class DSAttentionSubmodules (line 654) | class DSAttentionSubmodules: class DSAIndexer (line 665) | class DSAIndexer(MegatronModule): method __init__ (line 676) | def __init__( method _apply_rope (line 779) | def _apply_rope(self, x: torch.Tensor, rotary_pos_emb: torch.Tensor, m... method forward_before_topk (line 798) | def forward_before_topk( method forward_with_scores (line 863) | def forward_with_scores( method forward (line 897) | def forward( function unfused_dsa_fn (line 920) | def unfused_dsa_fn(query, key, value, topk_indices, softmax_scale): class DSAttention (line 973) | class DSAttention(MegatronModule): method __init__ (line 982) | def __init__( method forward (line 1010) | def forward( FILE: megatron/core/transformer/fsdp_dtensor_checkpoint.py function get_ep_layer_offset (line 51) | def get_ep_layer_offset(num_experts: int | None = None) -> int: function get_expert_index_from_key (line 69) | def get_expert_index_from_key(key): function handle_experts_in_state_dict (line 93) | def handle_experts_in_state_dict(state_dict, num_experts: int | None = N... function expert_param_local_key (line 151) | def expert_param_local_key(key: str, num_experts: int | None = None) -> ... function handle_swiglu_in_state_dict (line 180) | def handle_swiglu_in_state_dict(model, model_state_dict, optimizer_state... function handle_fp8_extra_state_case (line 332) | def handle_fp8_extra_state_case(model_state_dict): function flatten_state_dict (line 343) | def flatten_state_dict(obj, parent_key="", sep="."): function print_diff_in_state_dicts (line 361) | def print_diff_in_state_dicts(state_dict_metadata, load_state_dict, limi... function validate_loaded_state_dict (line 394) | def validate_loaded_state_dict(state_dict, checkpoint_path): function get_global_unique_param_name (line 443) | def get_global_unique_param_name(model_chunks, param): FILE: megatron/core/transformer/heterogeneous/heterogeneous_config.py class AttentionConfig (line 11) | class AttentionConfig: method build_config_from_dict (line 25) | def build_config_from_dict( class MLPConfig (line 60) | class MLPConfig: method build_config_from_dict (line 74) | def build_config_from_dict(cls, block_config_dict: dict, hidden_size: ... method ffn_mult_to_intermediate_size (line 101) | def ffn_mult_to_intermediate_size(ffn_mult: float, hidden_size: int) -... method find_multiple (line 117) | def find_multiple(n: int, k: int) -> int: class TransformerBlockConfig (line 134) | class TransformerBlockConfig: class HeterogeneousTransformerConfig (line 147) | class HeterogeneousTransformerConfig(TransformerConfig): method __post_init__ (line 201) | def __post_init__(self): method get_config_for_layer (line 229) | def get_config_for_layer(self, layer_number: int) -> TransformerConfig: FILE: megatron/core/transformer/heterogeneous/linear_replacements.py function _gather_from_tensor_parallel_region (line 25) | def _gather_from_tensor_parallel_region(x: Tensor, config: TransformerCo... class TELayerNormColumnParallelLinearGathered (line 49) | class TELayerNormColumnParallelLinearGathered(TELayerNormColumnParallelL... method __init__ (line 55) | def __init__(self, config: TransformerConfig, tp_comm_buffer_name: str... method forward (line 68) | def forward(self, x, **kwargs): class ColumnParallelLinearGathered (line 78) | class ColumnParallelLinearGathered(ColumnParallelLinear): method __init__ (line 84) | def __init__(self, config: TransformerConfig, *args, **kwargs): method forward (line 96) | def forward( FILE: megatron/core/transformer/identity_op.py class IdentityOp (line 9) | class IdentityOp(torch.nn.Module): method __init__ (line 14) | def __init__(self, *args: object, **kwargs: object): method forward (line 17) | def forward(self, x: T, *args: object, **kwargs: object) -> T: class IdentityFuncOp (line 25) | class IdentityFuncOp(IdentityOp): method __init__ (line 32) | def __init__(self, *args: object, **kwargs: object): method forward (line 35) | def forward(self, *args: object, **kwargs: object): FILE: megatron/core/transformer/mlp.py class LinearFc1Interface (line 48) | class LinearFc1Interface(Protocol): method forward (line 51) | def forward(self, hidden_states: torch.Tensor, /) -> tuple[torch.Tenso... method backward_dw (line 55) | def backward_dw(self) -> None: class LinearFc1Builder (line 60) | class LinearFc1Builder(Protocol): method __call__ (line 63) | def __call__( class TEActivationFunctionInterface (line 83) | class TEActivationFunctionInterface(Protocol): method forward (line 86) | def forward(self, input_: torch.Tensor, /) -> torch.Tensor: class TEActivationFunctionBuilder (line 91) | class TEActivationFunctionBuilder(Protocol): method __call__ (line 94) | def __call__(self, *, config: TransformerConfig) -> TEActivationFuncti... class LinearFc2Interface (line 99) | class LinearFc2Interface(Protocol): method forward (line 102) | def forward(self, hidden_states: torch.Tensor, /) -> tuple[torch.Tenso... method backward_dw (line 106) | def backward_dw(self) -> None: class LinearFc2Builder (line 111) | class LinearFc2Builder(Protocol): method __call__ (line 114) | def __call__( class MLPSubmodules (line 134) | class MLPSubmodules: class MLP (line 150) | class MLP(MegatronModule): method __init__ (line 167) | def __init__( method forward (line 246) | def forward( method sharded_state_dict (line 348) | def sharded_state_dict( method backward_dw (line 365) | def backward_dw(self): function apply_swiglu_sharded_factory (line 371) | def apply_swiglu_sharded_factory( FILE: megatron/core/transformer/module.py function param_is_not_shared (line 26) | def param_is_not_shared(param): # pylint: disable=missing-function-docs... class MegatronModule (line 30) | class MegatronModule(torch.nn.Module): method __init__ (line 41) | def __init__(self, config: TransformerConfig): method state_dict_for_save_checkpoint (line 45) | def state_dict_for_save_checkpoint(self, prefix: str = '', keep_vars: ... method sharded_state_dict (line 59) | def sharded_state_dict( method set_is_first_microbatch (line 106) | def set_is_first_microbatch(self): method set_symmetric_ar (line 124) | def set_symmetric_ar(self, set_to: Optional[str] = None) -> None: class GraphableMegatronModule (line 158) | class GraphableMegatronModule(MegatronModule): method __init__ (line 166) | def __init__(self, config: TransformerConfig, vp_stage: Optional[int] ... method init_backward_dw_wrapper (line 201) | def init_backward_dw_wrapper(self): method set_te_cuda_graph_backward_dw_wrapper (line 211) | def set_te_cuda_graph_backward_dw_wrapper(self): method _te_cuda_graph_backward_dw_graph (line 220) | def _te_cuda_graph_backward_dw_graph(self, microbatch_idx): method get_layer_static_inputs (line 229) | def get_layer_static_inputs(self, seq_length, micro_batch_size): method setup_manual_hooks (line 257) | def setup_manual_hooks(self, make_hook_func): method _get_submodules_under_cudagraphs (line 276) | def _get_submodules_under_cudagraphs(self): method _te_cuda_graph_capture (line 283) | def _te_cuda_graph_capture(self, *args, **kwargs): method _te_cuda_graph_replay (line 290) | def _te_cuda_graph_replay(self, *args, **kwargs): method _get_te_cuda_graph_replay_args (line 311) | def _get_te_cuda_graph_replay_args(self, *args, **kwargs): method _should_call_local_cudagraph (line 327) | def _should_call_local_cudagraph(self, *args, **kwargs): method _should_call_te_cudagraph (line 333) | def _should_call_te_cudagraph(self, *args, **kwargs): method __call__ (line 345) | def __call__(self, *args, **kwargs): function conversion_helper (line 359) | def conversion_helper(val, conversion): function fp32_to_float16 (line 379) | def fp32_to_float16(val, float16_convertor): function float16_to_fp32 (line 398) | def float16_to_fp32(val): class Float16Module (line 416) | class Float16Module(MegatronModule): method __init__ (line 428) | def __init__(self, config: TransformerConfig, module: torch.nn.Module): method set_input_tensor (line 454) | def set_input_tensor(self, input_tensor): # pylint: disable=missing-f... method forward (line 457) | def forward(self, *inputs, fp32_output=True, **kwargs): method state_dict (line 502) | def state_dict( method state_dict_for_save_checkpoint (line 507) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method sharded_state_dict (line 511) | def sharded_state_dict(self, prefix='', *args, **kwargs): method load_state_dict (line 515) | def load_state_dict( FILE: megatron/core/transformer/moe/experts.py class GroupedLinearFc1Interface (line 67) | class GroupedLinearFc1Interface(Protocol): method forward (line 70) | def forward( method backward_dw (line 76) | def backward_dw(self) -> None: class GroupedLinearFc1Builder (line 81) | class GroupedLinearFc1Builder(Protocol): method __call__ (line 84) | def __call__( class GroupedLinearFc2Interface (line 103) | class GroupedLinearFc2Interface(Protocol): method forward (line 106) | def forward( method backward_dw (line 112) | def backward_dw(self) -> None: class GroupedLinearFc2Builder (line 117) | class GroupedLinearFc2Builder(Protocol): method __call__ (line 120) | def __call__( class TEGroupedMLPSubmodules (line 140) | class TEGroupedMLPSubmodules: class TEGroupedMLP (line 156) | class TEGroupedMLP(MegatronModule): method __init__ (line 163) | def __init__( method _apply_bias (line 251) | def _apply_bias(intermediate_parallel, bias_parallel, tokens_per_exper... method bias_act_func (line 270) | def bias_act_func(self, intermediate_parallel, bias_parallel, permuted... method forward (line 327) | def forward( method sharded_state_dict (line 408) | def sharded_state_dict( method backward_dw (line 448) | def backward_dw(self): class InferenceGroupedMLP (line 460) | class InferenceGroupedMLP(TEGroupedMLP): method __init__ (line 470) | def __init__( method _resolve_flashinfer_activation_type (line 497) | def _resolve_flashinfer_activation_type(self): method _resolve_mcore_activation_type (line 513) | def _resolve_mcore_activation_type(self): method set_inference_cuda_graphed_iteration (line 520) | def set_inference_cuda_graphed_iteration(self): method unset_inference_cuda_graphed_iteration (line 524) | def unset_inference_cuda_graphed_iteration(self): method _build_concatenated_mxfp8_weights (line 528) | def _build_concatenated_mxfp8_weights(self): method _build_concatenated_weights (line 571) | def _build_concatenated_weights(self): method _flashinfer_forward (line 614) | def _flashinfer_forward(self, hidden_states, routing_map, probs): method _mcore_fused_moe_forward (line 632) | def _mcore_fused_moe_forward( method forward (line 652) | def forward( class SequentialMLP (line 716) | class SequentialMLP(MegatronModule): method __init__ (line 723) | def __init__( method _pad_tensor_for_quantization (line 758) | def _pad_tensor_for_quantization(self, hidden, probs): method forward (line 772) | def forward( method backward_dw (line 826) | def backward_dw(self): method sharded_state_dict (line 831) | def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=N... FILE: megatron/core/transformer/moe/fused_a2a.py function get_hidden_bytes (line 21) | def get_hidden_bytes(x: torch.Tensor) -> int: function get_buffer (line 33) | def get_buffer(group: torch.distributed.ProcessGroup, hidden_bytes: int): class FusedDispatch (line 69) | class FusedDispatch(torch.autograd.Function): method forward (line 73) | def forward( method backward (line 140) | def backward( class FusedCombine (line 163) | class FusedCombine(torch.autograd.Function): method forward (line 167) | def forward(ctx, x, group, handle, async_finish=False, allocate_on_com... method backward (line 191) | def backward(ctx, grad_output, previous_event=None): function fused_dispatch (line 212) | def fused_dispatch( function fused_combine (line 244) | def fused_combine(x, group, handle, async_finish=False, allocate_on_comm... function set_deepep_num_sms (line 258) | def set_deepep_num_sms(num_sms): function init_hybrid_ep_buffer (line 278) | def init_hybrid_ep_buffer( function reset_hybrid_ep_buffer (line 324) | def reset_hybrid_ep_buffer(): class HybridEPDispatch (line 332) | class HybridEPDispatch(torch.autograd.Function): method forward (line 338) | def forward( method backward (line 397) | def backward(ctx, grad_x, grad_probs, grad_scaling_factor, grad_tokens... class HybridEPCombine (line 409) | class HybridEPCombine(torch.autograd.Function): method forward (line 415) | def forward(ctx, x, handle, num_permuted_tokens=None, pad_multiple=None): method backward (line 428) | def backward(ctx, grad_x): function hybrid_ep_dispatch (line 446) | def hybrid_ep_dispatch( function hybrid_ep_combine (line 497) | def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple): FILE: megatron/core/transformer/moe/moe_layer.py class RouterInterface (line 58) | class RouterInterface(Protocol): method forward (line 61) | def forward(self, input: torch.Tensor, /) -> tuple[torch.Tensor, torch... method set_layer_number (line 69) | def set_layer_number(self, layer_number: int) -> None: class RouterBuilder (line 77) | class RouterBuilder(Protocol): method __call__ (line 80) | def __call__( class MoESubmodules (line 86) | class MoESubmodules: class BaseMoELayer (line 94) | class BaseMoELayer(MegatronModule, ABC): method __init__ (line 101) | def __init__( method forward (line 137) | def forward(self, hidden_states): method set_layer_number (line 141) | def set_layer_number(self, layer_number: int): class MoELayer (line 147) | class MoELayer(BaseMoELayer): method __init__ (line 155) | def __init__( method _setup_inference_mode (line 293) | def _setup_inference_mode(self, pg_collection): method set_inference_cuda_graphed_iteration (line 313) | def set_inference_cuda_graphed_iteration(self): method unset_inference_cuda_graphed_iteration (line 331) | def unset_inference_cuda_graphed_iteration(self): method route (line 347) | def route(self, hidden_states: torch.Tensor, padding_mask: Optional[to... method preprocess (line 357) | def preprocess( method dispatch (line 376) | def dispatch(self, hidden_states: torch.Tensor, probs: torch.Tensor): method shared_experts_compute (line 386) | def shared_experts_compute(self, hidden_states: torch.Tensor): method routed_experts_compute (line 414) | def routed_experts_compute(self, hidden_states: torch.Tensor, probs: t... method combine (line 441) | def combine(self, output: torch.Tensor): method postprocess (line 450) | def postprocess(self, output: torch.Tensor, shared_expert_output: Opti... method router_and_preprocess (line 462) | def router_and_preprocess(self, hidden_states: torch.Tensor): method forward (line 469) | def forward( method backward_dw (line 564) | def backward_dw(self, routed_experts: bool = True, shared_experts: boo... method set_for_recompute_pre_mlp_layernorm (line 586) | def set_for_recompute_pre_mlp_layernorm(self): FILE: megatron/core/transformer/moe/moe_utils.py function switch_load_balancing_loss_func (line 55) | def switch_load_balancing_loss_func( function z_loss_func (line 145) | def z_loss_func( function sinkhorn (line 177) | def sinkhorn(cost: torch.Tensor, tol: float = 0.0001) -> torch.Tensor: function get_capacity (line 202) | def get_capacity( function get_tokens_per_expert_and_token_count (line 223) | def get_tokens_per_expert_and_token_count( class MoEAuxLossAutoScaler (line 245) | class MoEAuxLossAutoScaler(torch.autograd.Function): method forward (line 251) | def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor) -> torc... method backward (line 265) | def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, to... method set_loss_scale (line 285) | def set_loss_scale(scale: torch.Tensor) -> None: function permute (line 298) | def permute( function unpermute (line 427) | def unpermute( function sort_chunks_by_idxs (line 529) | def sort_chunks_by_idxs( function group_limited_topk (line 574) | def group_limited_topk( function pad_routing_map (line 632) | def pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) -> tor... function topk_routing_with_score_function (line 667) | def topk_routing_with_score_function( function compute_routing_scores_for_aux_loss (line 822) | def compute_routing_scores_for_aux_loss( function apply_router_token_dropping (line 873) | def apply_router_token_dropping( function save_to_aux_losses_tracker (line 939) | def save_to_aux_losses_tracker( function clear_aux_losses_tracker (line 976) | def clear_aux_losses_tracker() -> None: function reduce_aux_losses_tracker_across_ranks (line 983) | def reduce_aux_losses_tracker_across_ranks( function track_moe_metrics (line 1028) | def track_moe_metrics( function get_updated_expert_bias (line 1132) | def get_updated_expert_bias( function maybe_move_tensor_to_cpu (line 1158) | def maybe_move_tensor_to_cpu( function get_moe_layer_wise_logging_tracker (line 1183) | def get_moe_layer_wise_logging_tracker() -> dict: class RandomSTE (line 1190) | class RandomSTE(torch.autograd.Function): method forward (line 1199) | def forward(ctx, logits: torch.Tensor) -> torch.Tensor: method backward (line 1214) | def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: function apply_random_logits (line 1227) | def apply_random_logits(logits: torch.Tensor) -> torch.Tensor: class RouterGatingLinearFunction (line 1240) | class RouterGatingLinearFunction(torch.autograd.Function): method forward (line 1246) | def forward( method backward (line 1286) | def backward( function router_gating_linear (line 1323) | def router_gating_linear( function get_align_size_for_quantization (line 1343) | def get_align_size_for_quantization(config: TransformerConfig) -> int: function get_default_pg_collection (line 1361) | def get_default_pg_collection() -> ProcessGroupCollection: class MoECudaGraphPartialCaptureSignal (line 1381) | class MoECudaGraphPartialCaptureSignal(Exception): method __init__ (line 1389) | def __init__(self, moe_layer, return_step: str, **kwargs): method get_early_return_outputs (line 1394) | def get_early_return_outputs( class MoECudaGraphTensorStore (line 1447) | class MoECudaGraphTensorStore: method is_empty (line 1467) | def is_empty(self) -> bool: method set (line 1478) | def set(self, **kwargs): method clear (line 1493) | def clear(self): function maybe_skip_or_early_return_by_cudagraph (line 1499) | def maybe_skip_or_early_return_by_cudagraph(step_condition): FILE: megatron/core/transformer/moe/router.py class Router (line 28) | class Router(ABC, MegatronModule): method __init__ (line 31) | def __init__( method reset_parameters (line 72) | def reset_parameters(self): method gating (line 84) | def gating(self, input: torch.Tensor): method routing (line 109) | def routing(self, logits: torch.Tensor): method forward (line 122) | def forward(self, input: torch.Tensor): method set_layer_number (line 131) | def set_layer_number(self, layer_number: int): class TopKRouter (line 136) | class TopKRouter(Router): method __init__ (line 152) | def __init__( method _maintain_float32_expert_bias (line 218) | def _maintain_float32_expert_bias(self): method sinkhorn_load_balancing (line 229) | def sinkhorn_load_balancing(self, logits: torch.Tensor): method get_aux_loss_coeff (line 262) | def get_aux_loss_coeff(self, aux_loss_type: str) -> float: method is_aux_loss_enabled (line 277) | def is_aux_loss_enabled(self) -> bool: method _apply_aux_loss (line 284) | def _apply_aux_loss( method _apply_seq_aux_loss (line 324) | def _apply_seq_aux_loss( method _apply_global_aux_loss (line 379) | def _apply_global_aux_loss( method attach_and_log_load_balancing_loss (line 425) | def attach_and_log_load_balancing_loss( method apply_z_loss (line 496) | def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = ... method apply_input_jitter (line 551) | def apply_input_jitter(self, input: torch.Tensor): method _apply_expert_bias (line 573) | def _apply_expert_bias( method routing (line 586) | def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.T... method reset_global_aux_loss_tracker (line 674) | def reset_global_aux_loss_tracker(self): method forward (line 680) | def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Te... method _load_from_state_dict (line 704) | def _load_from_state_dict(self, *args, **kwargs): method _save_to_state_dict (line 709) | def _save_to_state_dict(self, *args, **kwargs): class InferenceTopKRouter (line 715) | class InferenceTopKRouter(TopKRouter): method __init__ (line 727) | def __init__( method set_inference_cuda_graphed_iteration (line 753) | def set_inference_cuda_graphed_iteration(self): method unset_inference_cuda_graphed_iteration (line 757) | def unset_inference_cuda_graphed_iteration(self): method _compiled_topk_routing (line 763) | def _compiled_topk_routing( method _forward (line 790) | def _forward(self, input: torch.Tensor, padding_mask: Optional[torch.T... method forward (line 808) | def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Te... FILE: megatron/core/transformer/moe/router_replay.py class RouterReplayAction (line 8) | class RouterReplayAction(Enum): class RouterReplay (line 18) | class RouterReplay: method set_replay_data (line 29) | def set_replay_data(all_layers_topk_indices: List[torch.Tensor]): method get_recorded_data (line 45) | def get_recorded_data() -> List[torch.Tensor]: method clear_global_indices (line 55) | def clear_global_indices(): method set_global_router_replay_action (line 61) | def set_global_router_replay_action(router_replay_action: RouterReplay... method clear_global_router_replay_action (line 67) | def clear_global_router_replay_action(): method clear_global_router_replay_instances (line 73) | def clear_global_router_replay_instances(): method set_global_static_buffers (line 78) | def set_global_static_buffers(static_buffer: torch.Tensor): method clear_global_static_buffers (line 95) | def clear_global_static_buffers(): method __init__ (line 100) | def __init__(self): method set_target_indices (line 113) | def set_target_indices(self, topk_indices: torch.Tensor): method get_recorded_indices (line 118) | def get_recorded_indices(self) -> Optional[torch.Tensor]: method clear_indices (line 122) | def clear_indices(self): method set_router_replay_action (line 128) | def set_router_replay_action(self, router_replay_action: RouterReplayA... method clear_router_replay_action (line 132) | def clear_router_replay_action(self): method get_replay_topk (line 136) | def get_replay_topk( method set_static_buffer (line 183) | def set_static_buffer(self, buffer: torch.Tensor): method clear_static_buffer (line 191) | def clear_static_buffer(self): method record_indices (line 195) | def record_indices(self, topk_indices: torch.Tensor): FILE: megatron/core/transformer/moe/shared_experts.py class SharedExpertMLP (line 37) | class SharedExpertMLP(MLP): method __init__ (line 46) | def __init__( method forward (line 121) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: method sharded_state_dict (line 130) | def sharded_state_dict( method pre_forward_comm (line 150) | def pre_forward_comm(self, input): method linear_fc1_forward_and_act (line 171) | def linear_fc1_forward_and_act(self, overlapped_comm_output=None): method linear_fc2_forward (line 224) | def linear_fc2_forward(self, overlapped_comm_output=None): method post_forward_comm (line 239) | def post_forward_comm(self): method get_output (line 259) | def get_output(self): function set_tensor_grad_fn_sequence_sr (line 279) | def set_tensor_grad_fn_sequence_sr(tensor, value): FILE: megatron/core/transformer/moe/token_dispatcher.py class MoETokenDispatcher (line 53) | class MoETokenDispatcher: method __init__ (line 58) | def __init__( method dispatch_preprocess (line 86) | def dispatch_preprocess( method token_dispatch (line 110) | def token_dispatch(self, hidden_states: torch.Tensor, probs: torch.Ten... method dispatch_postprocess (line 126) | def dispatch_postprocess(self, hidden_states: torch.Tensor, probs: tor... method combine_preprocess (line 148) | def combine_preprocess(self, hidden_states): method token_combine (line 168) | def token_combine(self, hidden_states): method combine_postprocess (line 183) | def combine_postprocess(self, hidden_states): method set_shared_experts (line 202) | def set_shared_experts(self, shared_experts): class MoEAllGatherTokenDispatcher (line 208) | class MoEAllGatherTokenDispatcher(MoETokenDispatcher): method __init__ (line 214) | def __init__( method dispatch_preprocess (line 246) | def dispatch_preprocess( method token_dispatch (line 256) | def token_dispatch(self, hidden_states, probs): method dispatch_postprocess (line 280) | def dispatch_postprocess(self, hidden_states, probs): method combine_preprocess (line 312) | def combine_preprocess(self, hidden_states): method token_combine (line 330) | def token_combine(self, hidden_states): method combine_postprocess (line 345) | def combine_postprocess(self, hidden_states): class MoEAlltoAllTokenDispatcher (line 350) | class MoEAlltoAllTokenDispatcher(MoETokenDispatcher): method __init__ (line 367) | def __init__( method set_shared_experts (line 464) | def set_shared_experts(self, shared_experts): method preprocess (line 471) | def preprocess(self, routing_map: torch.Tensor) -> torch.Tensor: method dispatch_preprocess (line 594) | def dispatch_preprocess( method token_dispatch (line 651) | def token_dispatch(self, permutated_local_input_tokens, permuted_probs): method dispatch_postprocess (line 680) | def dispatch_postprocess(self, global_input_tokens, global_probs): method combine_preprocess (line 751) | def combine_preprocess(self, hidden_states): method token_combine (line 792) | def token_combine( method combine_postprocess (line 820) | def combine_postprocess(self, permutated_local_input_tokens): method _maybe_update_cuda_sync_point (line 856) | def _maybe_update_cuda_sync_point(self, point: str): method _maybe_dtoh_and_synchronize (line 867) | def _maybe_dtoh_and_synchronize( class _DispatchManager (line 909) | class _DispatchManager(ABC): method setup_metadata (line 922) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): method dispatch (line 927) | def dispatch(self, hidden_states: torch.Tensor) -> torch.Tensor: method combine (line 932) | def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: method get_permuted_hidden_states_by_experts (line 937) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T... method get_restored_hidden_states_by_experts (line 942) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T... class _HybridEPManager (line 947) | class _HybridEPManager(_DispatchManager): method __init__ (line 962) | def __init__( method setup_metadata (line 1007) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): method dispatch (line 1027) | def dispatch( method combine (line 1063) | def combine( method get_permuted_hidden_states_by_experts (line 1083) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T... method get_restored_hidden_states_by_experts (line 1086) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T... method get_number_of_tokens_per_expert (line 1089) | def get_number_of_tokens_per_expert(self) -> torch.Tensor: class _DeepepManager (line 1096) | class _DeepepManager(_DispatchManager): method __init__ (line 1118) | def __init__( method setup_metadata (line 1160) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): method dispatch (line 1172) | def dispatch( method _indices_to_multihot (line 1203) | def _indices_to_multihot(self, indices, probs): method get_number_of_tokens_per_expert (line 1233) | def get_number_of_tokens_per_expert(self) -> torch.Tensor: method combine (line 1239) | def combine( method _pad_routing_map (line 1256) | def _pad_routing_map( method get_permuted_hidden_states_by_experts (line 1287) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T... method get_restored_hidden_states_by_experts (line 1322) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T... class MoEFlexTokenDispatcher (line 1334) | class MoEFlexTokenDispatcher(MoETokenDispatcher): method __init__ (line 1340) | def __init__( method set_shared_experts (line 1385) | def set_shared_experts(self, shared_experts): method _initialize_metadata (line 1390) | def _initialize_metadata(self, routing_map: torch.Tensor, probs: torch... method dispatch_preprocess (line 1418) | def dispatch_preprocess( method token_dispatch (line 1444) | def token_dispatch( method dispatch_postprocess (line 1473) | def dispatch_postprocess(self, hidden_states: torch.Tensor, probs: tor... method combine_preprocess (line 1492) | def combine_preprocess(self, hidden_states: torch.Tensor): method token_combine (line 1501) | def token_combine( method combine_postprocess (line 1521) | def combine_postprocess(self, hidden_states: torch.Tensor): FILE: megatron/core/transformer/moe/token_dispatcher_inference.py class InferenceCUDAGraphTokenDispatcher (line 33) | class InferenceCUDAGraphTokenDispatcher(MoEAllGatherTokenDispatcher): method __init__ (line 47) | def __init__( method _maybe_allocate_ag_buffers (line 73) | def _maybe_allocate_ag_buffers( method _maybe_allocate_rs_buffer (line 141) | def _maybe_allocate_rs_buffer(self, x: torch.Tensor) -> dict: method token_dispatch (line 160) | def token_dispatch(self, hidden_states, probs): method dispatch_postprocess (line 246) | def dispatch_postprocess(self, hidden_states, probs): method combine_preprocess (line 266) | def combine_preprocess(self, expert_output): method token_combine (line 281) | def token_combine(self, hidden_states): FILE: megatron/core/transformer/moe/upcycling_utils.py function _get_keys_endswith (line 15) | def _get_keys_endswith(model, suffix): function _find_submodule (line 22) | def _find_submodule(model, submodule_name): function _get_config (line 32) | def _get_config(moe_model, dense_model): function _convert_to_moe_state_dict (line 108) | def _convert_to_moe_state_dict(moe_model, dense_model): function upcycle_state_dict (line 301) | def upcycle_state_dict(moe_model, dense_model): function load_and_upcycle_model (line 329) | def load_and_upcycle_model( FILE: megatron/core/transformer/multi_latent_attention.py class MLASelfAttentionSubmodules (line 81) | class MLASelfAttentionSubmodules: class MultiLatentAttention (line 98) | class MultiLatentAttention(Attention): method __init__ (line 105) | def __init__( method forward (line 216) | def forward( class MLASelfAttention (line 377) | class MLASelfAttention(MultiLatentAttention): method __init__ (line 384) | def __init__( method _qkv_down_projection (line 525) | def _qkv_down_projection(self, hidden_states): method get_query_key_value_tensors (line 553) | def get_query_key_value_tensors( method uncompress_kv_from_cache (line 877) | def uncompress_kv_from_cache(self, kv_cached): method prepare_for_absorption (line 903) | def prepare_for_absorption(self): method backward_dw (line 964) | def backward_dw(self) -> NoReturn: method _backward_kv_proj (line 970) | def _backward_kv_proj(self): method _backward_q_proj (line 975) | def _backward_q_proj(self): method _backward_output_proj (line 983) | def _backward_output_proj(self): method set_for_recompute_input_layernorm (line 987) | def set_for_recompute_input_layernorm(self): method clip_qk (line 993) | def clip_qk(self): method _clip_q_proj_weight (line 1060) | def _clip_q_proj_weight(self, weight): method _clip_kv_proj_weight (line 1087) | def _clip_kv_proj_weight(self, weight): class FusedMLASelfAttention (line 1114) | class FusedMLASelfAttention(MLASelfAttention): method __init__ (line 1117) | def __init__( method _qkv_down_projection (line 1216) | def _qkv_down_projection(self, hidden_states): method sharded_state_dict (line 1226) | def sharded_state_dict(self, prefix: str = "", sharded_offsets: tuple ... method _load_from_state_dict (line 1307) | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): FILE: megatron/core/transformer/multi_token_prediction.py function tie_word_embeddings_state_dict (line 60) | def tie_word_embeddings_state_dict( function tie_output_layer_state_dict (line 95) | def tie_output_layer_state_dict( function roll_tensor (line 130) | def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None, packed_seq_pa... function _roll_tensor_packed_seq (line 228) | def _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_... class MTPLossLoggingHelper (line 332) | class MTPLossLoggingHelper: method save_loss_to_tracker (line 338) | def save_loss_to_tracker( method clean_loss_in_tracker (line 364) | def clean_loss_in_tracker(): method reduce_loss_in_tracker (line 371) | def reduce_loss_in_tracker(): method track_mtp_metrics (line 385) | def track_mtp_metrics(loss_scale, iteration, writer, wandb_writer=None... class MultiTokenPredictionLayerSubmodules (line 410) | class MultiTokenPredictionLayerSubmodules: function get_mtp_layer_spec (line 433) | def get_mtp_layer_spec( function get_mtp_layer_spec_for_backend (line 447) | def get_mtp_layer_spec_for_backend( function mtp_on_this_rank (line 470) | def mtp_on_this_rank( function get_mtp_ranks (line 514) | def get_mtp_ranks(pp_ranks: List[int], config: TransformerConfig) -> Lis... function get_mtp_layer_offset (line 530) | def get_mtp_layer_offset(config: TransformerConfig, vp_stage: Optional[i... function get_mtp_num_layers_to_build (line 544) | def get_mtp_num_layers_to_build( class MTPLossAutoScaler (line 566) | class MTPLossAutoScaler(torch.autograd.Function): method forward (line 572) | def forward(ctx, output: torch.Tensor, mtp_loss: torch.Tensor): method backward (line 586) | def backward(ctx, grad_output: torch.Tensor): method set_loss_scale (line 602) | def set_loss_scale(scale: torch.Tensor): function process_mtp_loss (line 612) | def process_mtp_loss( class MultiTokenPredictionLayer (line 712) | class MultiTokenPredictionLayer(MegatronModule): method __init__ (line 732) | def __init__( method _get_embeddings (line 850) | def _get_embeddings( method _concat_embeddings (line 895) | def _concat_embeddings(self, hidden_states: torch.Tensor, decoder_inpu... method _proj_and_transformer_layer (line 918) | def _proj_and_transformer_layer( method _postprocess (line 987) | def _postprocess(self, hidden_states: torch.Tensor): method forward_single_position (line 1001) | def forward_single_position( method _checkpointed_forward (line 1048) | def _checkpointed_forward(self, forward_func, *args, **kwargs): method forward (line 1086) | def forward( method sharded_state_dict (line 1168) | def sharded_state_dict( class MultiTokenPredictionBlockSubmodules (line 1198) | class MultiTokenPredictionBlockSubmodules: function _get_mtp_block_submodules (line 1215) | def _get_mtp_block_submodules( class MultiTokenPredictionBlock (line 1243) | class MultiTokenPredictionBlock(MegatronModule): method __init__ (line 1266) | def __init__( method _build_layers (line 1309) | def _build_layers(self, pg_collection): method forward (line 1392) | def forward( method sharded_state_dict (line 1451) | def sharded_state_dict( FILE: megatron/core/transformer/pipeline_parallel_layer_layout.py class PipelineParallelLayerLayout (line 15) | class PipelineParallelLayerLayout: method __repr__ (line 18) | def __repr__(self) -> str: method __init__ (line 24) | def __init__(self, layout: str | list, pipeline_model_parallel_size: i... method validate_layer_layout (line 85) | def validate_layer_layout(self, num_layers: int, mtp_num_layers: int): method get_num_layers_to_build (line 152) | def get_num_layers_to_build( method get_layer_offset (line 170) | def get_layer_offset( method get_layer_id_list (line 194) | def get_layer_id_list( method pretty_repr (line 207) | def pretty_repr(self): method from_str (line 263) | def from_str(layout, pipeline_model_parallel_size): method get_num_stages_from_str (line 277) | def get_num_stages_from_str(layout: str): method parse_str_to_list (line 283) | def parse_str_to_list(layout_str: str): FILE: megatron/core/transformer/spec_utils.py class ModuleSpec (line 12) | class ModuleSpec: method __call__ (line 33) | def __call__(self, *args: Any, **kwargs: Any) -> Any: function import_module (line 43) | def import_module(module_path: Tuple[str]): function get_module (line 59) | def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwa... function build_module (line 73) | def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs): FILE: megatron/core/transformer/torch_norm.py class LayerNormInterface (line 11) | class LayerNormInterface(Protocol): method forward (line 14) | def forward(self, x: torch.Tensor, /) -> torch.Tensor: class LayerNormBuilder (line 19) | class LayerNormBuilder(Protocol): method __call__ (line 22) | def __call__( class WrappedTorchNorm (line 27) | class WrappedTorchNorm: method __new__ (line 33) | def __new__( class L2Norm (line 72) | class L2Norm(torch.nn.Module, LayerNormInterface): method __init__ (line 85) | def __init__(self, hidden_size: int, eps: float = 1e-6, **kwargs): method _norm (line 91) | def _norm(self, x: torch.Tensor) -> torch.Tensor: method forward (line 104) | def forward(self, x: torch.Tensor) -> torch.Tensor: FILE: megatron/core/transformer/transformer_block.py function get_num_layers_to_build (line 71) | def get_num_layers_to_build( class TransformerBlockSubmodules (line 202) | class TransformerBlockSubmodules: function _get_block_submodules (line 221) | def _get_block_submodules( class TransformerBlock (line 262) | class TransformerBlock(GraphableMegatronModule, MegatronModule): method __init__ (line 265) | def __init__( method _build_layers (line 325) | def _build_layers(self): method has_final_layernorm_in_this_stage (line 386) | def has_final_layernorm_in_this_stage(self): method _setup_fused_tp_communication (line 415) | def _setup_fused_tp_communication(self): method _get_layer (line 439) | def _get_layer(self, layer_number: int): method _checkpointed_forward (line 442) | def _checkpointed_forward( method set_input_tensor (line 604) | def set_input_tensor(self, input_tensor: Tensor): method _should_call_local_cudagraph (line 614) | def _should_call_local_cudagraph(self, *args, **kwargs): method __call__ (line 637) | def __call__(self, *args, **kwargs): method forward (line 647) | def forward( method sharded_state_dict (line 871) | def sharded_state_dict( FILE: megatron/core/transformer/transformer_config.py class TransformerConfig (line 40) | class TransformerConfig(ModelParallelConfig): method __post_init__ (line 1007) | def __post_init__(self): class MLATransformerConfig (line 2229) | class MLATransformerConfig(TransformerConfig): method __post_init__ (line 2294) | def __post_init__(self): FILE: megatron/core/transformer/transformer_layer.py function get_transformer_layer_offset (line 45) | def get_transformer_layer_offset( class TransformerLayerSubmodules (line 203) | class TransformerLayerSubmodules: class BaseTransformerLayer (line 246) | class BaseTransformerLayer(ABC): method __init__ (line 258) | def __init__(self): class TransformerLayer (line 262) | class TransformerLayer(GraphableMegatronModule, BaseTransformerLayer): method __init__ (line 269) | def __init__( method create_mcore_cudagraph_manager (line 493) | def create_mcore_cudagraph_manager(self, config): method _get_layer_offset (line 515) | def _get_layer_offset(config: TransformerConfig): method _forward_attention (line 528) | def _forward_attention( method forward (line 696) | def forward(self, *args, **kwargs): method _forward_pre_mlp_layernorm (line 711) | def _forward_pre_mlp_layernorm(self, hidden_states: Tensor): method _forward_mlp (line 728) | def _forward_mlp( method _forward_post_mlp (line 840) | def _forward_post_mlp( method sharded_state_dict (line 901) | def sharded_state_dict( method configure_fused_tp_inference (line 924) | def configure_fused_tp_inference( method _set_proj_next_layer_norm_weights (line 947) | def _set_proj_next_layer_norm_weights(self, weights: Tensor): method _set_fc2_next_layer_norm_weights (line 951) | def _set_fc2_next_layer_norm_weights(self, weights: Optional[Tensor]): method _set_proj_residual (line 958) | def _set_proj_residual(self, residual: Tensor): method _set_fc2_residual (line 962) | def _set_fc2_residual(self, residual: Tensor): method get_mlp_layer_norm_weights (line 966) | def get_mlp_layer_norm_weights(self) -> Tensor: method get_qkv_layer_norm_weights (line 975) | def get_qkv_layer_norm_weights(self) -> Tensor: method get_layer_static_inputs (line 984) | def get_layer_static_inputs(self, seq_length, micro_batch_size): method _get_submodules_under_cudagraphs (line 1006) | def _get_submodules_under_cudagraphs(self): method _te_cuda_graph_capture (line 1034) | def _te_cuda_graph_capture(self, *args, **kwargs): method _te_cuda_graph_replay (line 1071) | def _te_cuda_graph_replay(self, *args, **kwargs): method _get_te_cuda_graph_replay_args (line 1196) | def _get_te_cuda_graph_replay_args(self, *args, **kwargs): method _should_call_local_cudagraph (line 1251) | def _should_call_local_cudagraph(self, *args, **kwargs): method get_layer_norm_weights (line 1283) | def get_layer_norm_weights(self): class MoETransformerLayer (line 1292) | class MoETransformerLayer(TransformerLayer): method __init__ (line 1302) | def __init__(self, *args, **kwargs): method _should_call_local_cudagraph (line 1310) | def _should_call_local_cudagraph(self, *args, **kwargs): method transition_cudagraph_scope (line 1330) | def transition_cudagraph_scope(self, mode): method create_mcore_cudagraph_manager (line 1366) | def create_mcore_cudagraph_manager(self, config): method _forward_mlp_router (line 1386) | def _forward_mlp_router(self, hidden_states, padding_mask=None): method _forward_mlp_expert_compute (line 1427) | def _forward_mlp_expert_compute(self, hidden_states, probs): method _forward_mlp_postprocess (line 1446) | def _forward_mlp_postprocess(self, residual, output, shared_expert_out... method _forward_mlp (line 1466) | def _forward_mlp(self, hidden_states, inference_context=None, padding_... FILE: megatron/core/transformer/utils.py function get_linear_layer (line 23) | def get_linear_layer(rows, columns, init_method, perform_initialization=... function get_default_causal_mask (line 33) | def get_default_causal_mask(sq: int) -> torch.Tensor: function get_sliding_window_causal_mask (line 38) | def get_sliding_window_causal_mask(sq, skv, window_size): function attention_mask_func (line 49) | def attention_mask_func(attention_scores, attention_mask): function gelu_impl (line 55) | def gelu_impl(x): function openai_gelu (line 61) | def openai_gelu(x): function erf_gelu (line 69) | def erf_gelu(x): function make_sharded_tensors_for_checkpoint (line 75) | def make_sharded_tensors_for_checkpoint( function make_sharded_object_for_checkpoint (line 151) | def make_sharded_object_for_checkpoint( function _get_extra_state_offsets (line 178) | def _get_extra_state_offsets( function ensure_metadata_has_dp_cp_group (line 194) | def ensure_metadata_has_dp_cp_group(metadata: Optional[dict]) -> dict: function sharded_state_dict_default (line 209) | def sharded_state_dict_default( function _init_sequence_parallel_cache (line 261) | def _init_sequence_parallel_cache(model, exclude_modules): function set_model_to_sequence_parallel (line 310) | def set_model_to_sequence_parallel(model, set_to=False, exclude_modules=... function init_cuda_graph_cache (line 337) | def init_cuda_graph_cache(model): function toggle_cuda_graphs (line 394) | def toggle_cuda_graphs(model, set_to="none"): function transition_moe_cudagraphs (line 437) | def transition_moe_cudagraphs(model, scope: str): function is_layer_window_attention (line 453) | def is_layer_window_attention( FILE: megatron/core/typed_torch.py class _Module (line 16) | class _Module(Generic[P, R_co], Protocol): method forward (line 19) | def forward(self, *args: P.args, **kwargs: P.kwargs) -> R_co: function apply_module (line 24) | def apply_module(m: _Module[P, R_co], *, check_subclass: bool = True) ->... function not_none (line 40) | def not_none(value: T | None) -> T: function copy_signature (line 62) | def copy_signature( function copy_signature (line 73) | def copy_signature( function copy_signature (line 84) | def copy_signature( function copy_signature (line 95) | def copy_signature( function copy_signature (line 106) | def copy_signature( function copy_signature (line 119) | def copy_signature( function copy_signature (line 132) | def copy_signature( function copy_signature (line 145) | def copy_signature( function copy_signature (line 157) | def copy_signature( FILE: megatron/core/utils.py function null_decorator (line 87) | def null_decorator(*args, **kwargs): class ExperimentalNotEnabledError (line 101) | class ExperimentalNotEnabledError(Exception): function experimental_fn (line 105) | def experimental_fn(introduced_with_version: str): function experimental_cls (line 171) | def experimental_cls(introduced_with_version: str): function get_te_version (line 299) | def get_te_version(): function is_te_min_version (line 330) | def is_te_min_version(version, check_equality=True): function get_torch_version (line 342) | def get_torch_version(): function is_torch_min_version (line 349) | def is_torch_min_version(version, check_equality=True): function get_fa_version (line 360) | def get_fa_version(): function is_fa_min_version (line 381) | def is_fa_min_version(version, check_equality=True): function get_mamba_version (line 392) | def get_mamba_version(): function is_mamba_min_version (line 413) | def is_mamba_min_version(version, check_equality=True): function get_causal_conv1d_version (line 424) | def get_causal_conv1d_version(): function is_causal_conv1d_min_version (line 445) | def is_causal_conv1d_min_version(version, check_equality=True): function get_flashinfer_version (line 456) | def get_flashinfer_version(): function is_flashinfer_min_version (line 481) | def is_flashinfer_min_version(version, check_equality=True): function ensure_divisibility (line 494) | def ensure_divisibility(numerator, denominator): function divide (line 499) | def divide(numerator, denominator): function get_tensor_model_parallel_group_if_none (line 506) | def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, c... function get_pg_size (line 535) | def get_pg_size(group=None): function get_pg_rank (line 549) | def get_pg_rank(group=None): function get_pg_src_rank (line 563) | def get_pg_src_rank(group=None): function get_attr_wrapped_model (line 580) | def get_attr_wrapped_model(model, attr, allow_none=True, return_model_ob... function get_model_type (line 608) | def get_model_type(model): function get_model_xattn (line 613) | def get_model_xattn(model): function get_model_config (line 621) | def get_model_config(model): class GlobalMemoryBuffer (line 626) | class GlobalMemoryBuffer: method __init__ (line 631) | def __init__(self): method get_tensor (line 634) | def get_tensor(self, tensor_shape, dtype, name, mem_alloc_context: Opt... function _kernel_make_viewless_tensor (line 655) | def _kernel_make_viewless_tensor(inp, requires_grad): class WrappedTensor (line 669) | class WrappedTensor: method __init__ (line 676) | def __init__(self, tensor: torch.Tensor): method unwrap (line 679) | def unwrap(self): class MakeViewlessTensor (line 689) | class MakeViewlessTensor(torch.autograd.Function): method forward (line 700) | def forward(ctx, inp, requires_grad): method backward (line 705) | def backward(ctx, grad_output): function make_viewless_tensor (line 710) | def make_viewless_tensor(inp, requires_grad, keep_graph): function assert_viewless_tensor (line 731) | def assert_viewless_tensor(tensor, extra_msg=None): function safely_set_viewless_tensor_data (line 747) | def safely_set_viewless_tensor_data(tensor, new_data_tensor): function init_method_normal (line 761) | def init_method_normal(sigma): function scaled_init_method_normal (line 766) | def scaled_init_method_normal(sigma, num_layers, multiplier=2.0): function mup_scaled_init_method_normal (line 773) | def mup_scaled_init_method_normal(sigma, num_layers, width_mult, multipl... function log_on_each_pipeline_stage (line 793) | def log_on_each_pipeline_stage( function check_param_hashes_across_dp_replicas (line 824) | def check_param_hashes_across_dp_replicas( function make_tp_sharded_tensor_for_checkpoint (line 903) | def make_tp_sharded_tensor_for_checkpoint( function make_sharded_tensor_for_checkpoint (line 972) | def make_sharded_tensor_for_checkpoint(tensor, key, prepend_offsets=(), ... function get_full_tensor_if_necessary (line 1025) | def get_full_tensor_if_necessary(tensor): function to_local_if_dtensor (line 1041) | def to_local_if_dtensor(tensor: Union[torch.Tensor, "DTensor"]) -> torch... function get_data_parallel_group_if_dtensor (line 1047) | def get_data_parallel_group_if_dtensor( function prepare_input_tensors_for_wgrad_compute (line 1058) | def prepare_input_tensors_for_wgrad_compute(grad_output, all_gathered_in... function drain_embedding_wgrad_compute (line 1087) | def drain_embedding_wgrad_compute( function local_multi_tensor_applier (line 1171) | def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): function local_multi_tensor_l2_norm (line 1178) | def local_multi_tensor_l2_norm(chunk_size, noop_flag, tensor_lists, per_... function local_multi_tensor_scale (line 1190) | def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): class _ValueWithRank (line 1196) | class _ValueWithRank: method __init__ (line 1205) | def __init__(self, value: float, rank: int, unit: str = "") -> None: method __lt__ (line 1217) | def __lt__(self, other) -> bool: method __gt__ (line 1228) | def __gt__(self, other) -> bool: method __call__ (line 1239) | def __call__(self) -> Tuple[float, int, str]: method __str__ (line 1247) | def __str__(self) -> str: class _StragglerData (line 1258) | class _StragglerData: class StragglerDetector (line 1298) | class StragglerDetector: method __new__ (line 1341) | def __new__(cls: Type["StragglerDetector"]) -> "StragglerDetector": method __init__ (line 1356) | def __init__(self) -> None: method configure (line 1386) | def configure( method reset (line 1461) | def reset(self) -> None: method start_method (line 1485) | def start_method(self) -> None: method stop_method (line 1519) | def stop_method(self) -> None: method elapsed (line 1538) | def elapsed(self) -> Tuple[float, float, int, int, int, int]: method report (line 1591) | def report(self, total_flops: float = 0.0, log_interval: int = 0) -> b... method _check_toggle (line 1658) | def _check_toggle(self) -> None: method _handler (line 1688) | def _handler(self) -> None: method _controller (line 1719) | def _controller(self): method _min_max (line 1739) | def _min_max( method enabled (line 1860) | def enabled(self) -> bool: method configured (line 1871) | def configured(self) -> bool: method my_rank (line 1880) | def my_rank(self): method world_size (line 1889) | def world_size(self) -> int: method null_method (line 1897) | def null_method(self) -> None: method __enter__ (line 1901) | def __enter__(self) -> "StragglerDetector": method __call__ (line 1910) | def __call__(self, bdata: bool = False) -> "StragglerDetector": method __exit__ (line 1924) | def __exit__( function is_submodule (line 1954) | def is_submodule(module, parent_module, strict=True): function get_batch_on_this_cp_rank (line 1972) | def get_batch_on_this_cp_rank( function get_thd_batch_on_this_cp_rank (line 2019) | def get_thd_batch_on_this_cp_rank( function get_batch_on_this_hybrid_cp_rank (line 2064) | def get_batch_on_this_hybrid_cp_rank( function configure_nvtx_profiling (line 2121) | def configure_nvtx_profiling(enabled: bool) -> None: function _nvtx_range_get_func_path (line 2131) | def _nvtx_range_get_func_path(): function nvtx_range_push (line 2145) | def nvtx_range_push(msg=None, suffix=None) -> None: function nvtx_range_pop (line 2167) | def nvtx_range_pop(msg=None, suffix=None) -> None: function _nvtx_decorator_get_func_path (line 2197) | def _nvtx_decorator_get_func_path(func): function nvtx_decorator (line 2212) | def nvtx_decorator(message: Optional[str] = None, color: Optional[str] =... function unwrap_model (line 2242) | def unwrap_model(model, module_instances=None): function get_asyncio_loop (line 2271) | def get_asyncio_loop(loop: asyncio.AbstractEventLoop | None = None) -> a... function is_using_quantization_scales (line 2286) | def is_using_quantization_scales(config): function trace_async_exceptions (line 2294) | def trace_async_exceptions(func: Optional[Callable] = None, *, verbose: ... function deprecated (line 2370) | def deprecated( function internal_api (line 2435) | def internal_api(func: Callable) -> Callable: function experimental_api (line 2468) | def experimental_api(func: Callable) -> Callable: function deprecate_args (line 2501) | def deprecate_args( function deprecate_inference_params (line 2530) | def deprecate_inference_params(inference_context, inference_params): FILE: megatron/inference/utils.py function get_model_for_inference (line 39) | def get_model_for_inference() -> MegatronModule: function add_inference_args (line 86) | def add_inference_args(parser: ArgumentParser) -> ArgumentParser: function get_inference_config_from_model_and_args (line 279) | def get_inference_config_from_model_and_args(model: MegatronModule, args): function get_dynamic_inference_engine (line 369) | def get_dynamic_inference_engine(model: Optional[MegatronModule] = None)... FILE: megatron/legacy/fp16_deprecated/loss_scaler.py class LossScaler (line 5) | class LossScaler: method __init__ (line 6) | def __init__(self, scale=1): class DynamicLossScaler (line 9) | class DynamicLossScaler: method __init__ (line 10) | def __init__(self, FILE: megatron/legacy/fused_kernels/__init__.py function load (line 17) | def load(args): function _get_cuda_bare_metal_version (line 57) | def _get_cuda_bare_metal_version(cuda_dir): function _create_build_dir (line 70) | def _create_build_dir(buildpath): FILE: megatron/legacy/fused_kernels/tests/test_fused_kernels.py function test_load_fused_kernels (line 13) | def test_load_fused_kernels(): function test_fused_softmax (line 25) | def test_fused_softmax(): function test_fused_upper_triangle_mask_softmax (line 123) | def test_fused_upper_triangle_mask_softmax(): function test_layer_norm (line 223) | def test_layer_norm(): function attention_mask_func (line 282) | def attention_mask_func(attention_scores, attention_mask): function forward_torch_softmax (line 287) | def forward_torch_softmax(input, mask, scale): function test_masked_softmax_forward (line 294) | def test_masked_softmax_forward(): function test_masked_softmax_backward (line 309) | def test_masked_softmax_backward(): function test_allmasked_softmax_forward (line 330) | def test_allmasked_softmax_forward(): function test_allmasked_softmax_backward (line 346) | def test_allmasked_softmax_backward(): FILE: megatron/legacy/model/bert_model.py function bert_extended_attention_mask (line 20) | def bert_extended_attention_mask(attention_mask): function bert_position_ids (line 36) | def bert_position_ids(token_ids): class BertLMHead (line 46) | class BertLMHead(MegatronModule): method __init__ (line 55) | def __init__(self, mpu_vocab_size, config, parallel_output): method forward (line 74) | def forward(self, hidden_states, word_embeddings_weight): method load_state_dict (line 84) | def load_state_dict(self, state_dict, strict=True): function post_language_model_processing (line 96) | def post_language_model_processing(lm_output, pooled_output, class BertModel (line 127) | class BertModel(MegatronModule): method __init__ (line 130) | def __init__(self, method set_input_tensor (line 171) | def set_input_tensor(self, input_tensor): method forward (line 175) | def forward(self, bert_model_input, attention_mask, method state_dict_for_save_checkpoint (line 222) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 243) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/biencoder_model.py function get_model_provider (line 19) | def get_model_provider(only_query_model=False, only_context_model=False, function biencoder_model_provider (line 37) | def biencoder_model_provider(only_query_model=False, class BiEncoderModel (line 65) | class BiEncoderModel(MegatronModule): method __init__ (line 68) | def __init__(self, method set_input_tensor (line 107) | def set_input_tensor(self, input_tensor): method forward (line 114) | def forward(self, query_tokens, query_attention_mask, query_types, method embed_text (line 136) | def embed_text(model, tokens, attention_mask, token_types): method state_dict_for_save_checkpoint (line 143) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 163) | def load_state_dict(self, state_dict, strict=True): method init_state_dict_from_bert (line 180) | def init_state_dict_from_bert(self): class PretrainedBertModel (line 246) | class PretrainedBertModel(MegatronModule): method __init__ (line 250) | def __init__(self, num_tokentypes=2, method forward (line 280) | def forward(self, input_ids, attention_mask, tokentype_ids=None): method state_dict_for_save_checkpoint (line 304) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 320) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/classification.py class Classification (line 17) | class Classification(MegatronModule): method __init__ (line 19) | def __init__(self, method set_input_tensor (line 48) | def set_input_tensor(self, input_tensor): method forward (line 52) | def forward(self, model_input, attention_mask, tokentype_ids=None): method state_dict_for_save_checkpoint (line 76) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 89) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/enums.py class LayerType (line 5) | class LayerType(enum.Enum): class AttnType (line 9) | class AttnType(enum.Enum): class AttnMaskType (line 13) | class AttnMaskType(enum.Enum): FILE: megatron/legacy/model/fused_bias_gelu.py function bias_gelu (line 16) | def bias_gelu(bias, y): function bias_gelu_back (line 24) | def bias_gelu_back(g, bias, y): class GeLUFunction (line 31) | class GeLUFunction(torch.autograd.Function): method forward (line 34) | def forward(ctx, input, bias): method backward (line 39) | def backward(ctx, grad_output): FILE: megatron/legacy/model/fused_layer_norm.py class MixedFusedLayerNorm (line 31) | class MixedFusedLayerNorm(torch.nn.Module): method __init__ (line 33) | def __init__(self, normalized_shape, eps=1e-5, method reset_parameters (line 69) | def reset_parameters(self): method forward (line 78) | def forward(self, input): FILE: megatron/legacy/model/fused_softmax.py class ScaledUpperTriangMaskedSoftmax (line 9) | class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): method forward (line 18) | def forward(ctx, inputs, scale): method backward (line 33) | def backward(ctx, output_grads): class ScaledMaskedSoftmax (line 47) | class ScaledMaskedSoftmax(torch.autograd.Function): method forward (line 56) | def forward(ctx, inputs, mask, scale): method backward (line 69) | def backward(ctx, output_grads): class ScaledSoftmax (line 83) | class ScaledSoftmax(torch.autograd.Function): method forward (line 91) | def forward(ctx, inputs, scale): method backward (line 106) | def backward(ctx, output_grads): class FusedScaleMaskSoftmax (line 120) | class FusedScaleMaskSoftmax(nn.Module): method __init__ (line 134) | def __init__( method forward (line 161) | def forward(self, input, mask): method is_kernel_available (line 170) | def is_kernel_available(self, mask, b, np, sq, sk): method forward_fused_softmax (line 192) | def forward_fused_softmax(self, input, mask): method forward_torch_softmax (line 210) | def forward_torch_softmax(self, input, mask): method get_batch_per_block (line 228) | def get_batch_per_block(sq, sk, b, np): FILE: megatron/legacy/model/gpt_model.py function post_language_model_processing (line 18) | def post_language_model_processing(lm_output, labels, logit_weights, class GPTModel (line 45) | class GPTModel(MegatronModule): method __init__ (line 48) | def __init__(self, method set_input_tensor (line 74) | def set_input_tensor(self, input_tensor): method forward (line 78) | def forward(self, input_ids, position_ids, attention_mask, method state_dict_for_save_checkpoint (line 98) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 111) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/language_model.py function parallel_lm_logits (line 22) | def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, ... function get_language_model (line 52) | def get_language_model( class Pooler (line 91) | class Pooler(MegatronModule): method __init__ (line 103) | def __init__(self, hidden_size, init_method): method forward (line 109) | def forward(self, hidden_states, sequence_index=0): class Embedding (line 126) | class Embedding(MegatronModule): method __init__ (line 140) | def __init__( method zero_parameters (line 192) | def zero_parameters(self): method add_tokentype_embeddings (line 203) | def add_tokentype_embeddings(self, num_tokentypes): method forward (line 218) | def forward(self, input_ids, position_ids, tokentype_ids=None): method state_dict_for_save_checkpoint (line 255) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 273) | def load_state_dict(self, state_dict, strict=True): class TransformerLanguageModel (line 319) | class TransformerLanguageModel(MegatronModule): method __init__ (line 332) | def __init__( method set_input_tensor (line 441) | def set_input_tensor(self, input_tensor): method forward (line 471) | def forward( method state_dict_for_save_checkpoint (line 555) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 584) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/module.py function param_is_not_shared (line 18) | def param_is_not_shared(param): class MegatronModule (line 22) | class MegatronModule(torch.nn.Module): method __init__ (line 26) | def __init__(self, config=None, share_embeddings_and_output_weights=Tr... method state_dict_for_save_checkpoint (line 31) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method shared_embedding_or_output_weight (line 36) | def shared_embedding_or_output_weight(self): method initialize_word_embeddings (line 47) | def initialize_word_embeddings(self): function conversion_helper (line 127) | def conversion_helper(val, conversion): function fp32_to_float16 (line 138) | def fp32_to_float16(val, float16_convertor): function float16_to_fp32 (line 152) | def float16_to_fp32(val): FILE: megatron/legacy/model/multiple_choice.py class MultipleChoice (line 17) | class MultipleChoice(MegatronModule): method __init__ (line 19) | def __init__(self, method set_input_tensor (line 45) | def set_input_tensor(self, input_tensor): method forward (line 49) | def forward(self, model_input, attention_mask, tokentype_ids=None): method state_dict_for_save_checkpoint (line 87) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 100) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/realm_model.py function general_ict_model_provider (line 18) | def general_ict_model_provider(only_query_model=False, only_block_model=... class ICTBertModel (line 39) | class ICTBertModel(MegatronModule): method __init__ (line 41) | def __init__(self, method forward (line 67) | def forward(self, query_tokens, query_attention_mask, block_tokens, bl... method embed_query (line 73) | def embed_query(self, query_tokens, query_attention_mask): method embed_block (line 82) | def embed_block(self, block_tokens, block_attention_mask): method state_dict_for_save_checkpoint (line 91) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 106) | def load_state_dict(self, state_dict, strict=True): method init_state_dict_from_bert (line 118) | def init_state_dict_from_bert(self): class IREncoderBertModel (line 148) | class IREncoderBertModel(MegatronModule): method __init__ (line 150) | def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=Tr... method forward (line 170) | def forward(self, input_ids, attention_mask, tokentype_ids=None): method state_dict_for_save_checkpoint (line 185) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 198) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/rms_norm.py class RMSNorm (line 6) | class RMSNorm(torch.nn.Module): method __init__ (line 8) | def __init__(self, method _norm (line 27) | def _norm(self, x): method forward (line 30) | def forward(self, x): FILE: megatron/legacy/model/t5_model.py function t5_extended_attention_mask (line 19) | def t5_extended_attention_mask(attention_mask_list): function t5_position_ids (line 29) | def t5_position_ids(token_ids): class T5LMHead (line 39) | class T5LMHead(MegatronModule): method __init__ (line 47) | def __init__(self, mpu_vocab_size, parallel_output): method forward (line 56) | def forward(self, hidden_states, word_embeddings_weight): class T5Model (line 64) | class T5Model(MegatronModule): method __init__ (line 67) | def __init__(self, method set_input_tensor (line 112) | def set_input_tensor(self, input_tensor): method forward (line 116) | def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_m... method state_dict_for_save_checkpoint (line 165) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): method load_state_dict (line 184) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/transformer.py class DropPath (line 76) | class DropPath(MegatronModule): method __init__ (line 81) | def __init__(self, drop_prob=0.): method forward (line 85) | def forward(self, hidden_state): class ParallelMLP (line 98) | class ParallelMLP(MegatronModule): method __init__ (line 106) | def __init__(self, config, is_expert=False): method forward (line 161) | def forward(self, hidden_states): function sinkhorn (line 179) | def sinkhorn(cost, tol=0.0001): function get_router_linear_layer (line 195) | def get_router_linear_layer(config): class SwitchMLP (line 204) | class SwitchMLP(MegatronModule): method __init__ (line 208) | def __init__(self, config): method gather_indices (line 227) | def gather_indices(self, local_indices): method forward (line 245) | def forward(self, hidden_states): class CoreAttention (line 317) | class CoreAttention(MegatronModule): method __init__ (line 319) | def __init__(self, layer_number, config, method forward (line 363) | def forward(self, query_layer, key_layer, class FlashSelfAttention (line 452) | class FlashSelfAttention(torch.nn.Module): method __init__ (line 462) | def __init__(self, causal=False, softmax_scale=None, attention_dropout... method forward (line 472) | def forward(self, q, k, v): class ParallelAttention (line 514) | class ParallelAttention(MegatronModule): method __init__ (line 521) | def __init__(self, config, layer_number, method _checkpointed_attention_forward (line 625) | def _checkpointed_attention_forward(self, query_layer, key_layer, method _allocate_memory (line 648) | def _allocate_memory(self, inference_max_sequence_len, batch_size, num... method forward (line 657) | def forward(self, hidden_states, attention_mask, function bias_dropout_add (line 844) | def bias_dropout_add(x, bias, residual, prob, training): function get_bias_dropout_add (line 853) | def get_bias_dropout_add(training): function bias_dropout_add_fused_train (line 860) | def bias_dropout_add_fused_train(x: torch.Tensor, function bias_dropout_add_fused_inference (line 868) | def bias_dropout_add_fused_inference(x: torch.Tensor, class ParallelTransformerLayer (line 875) | class ParallelTransformerLayer(MegatronModule): method __init__ (line 882) | def __init__(self, config, method default_decoder_cross_attention (line 927) | def default_decoder_cross_attention(self, method forward (line 963) | def forward(self, hidden_states, attention_mask, class NoopTransformerLayer (line 1076) | class NoopTransformerLayer(MegatronModule): method __init__ (line 1092) | def __init__(self, layer_number): method forward (line 1096) | def forward(self, hidden_states, attention_mask, function _get_num_layers (line 1102) | def _get_num_layers(args, model_type, is_decoder=False): class ParallelTransformer (line 1127) | class ParallelTransformer(MegatronModule): method __init__ (line 1130) | def __init__(self, config, method _get_layer (line 1308) | def _get_layer(self, layer_number): method _checkpointed_forward (line 1311) | def _checkpointed_forward(self, hidden_states, attention_mask, method set_input_tensor (line 1390) | def set_input_tensor(self, input_tensor): method forward (line 1400) | def forward(self, hidden_states, attention_mask, method load_state_dict (line 1505) | def load_state_dict(self, state_dict, strict=True): FILE: megatron/legacy/model/utils.py function init_method_normal (line 13) | def init_method_normal(sigma): function scaled_init_method_normal (line 21) | def scaled_init_method_normal(sigma, num_layers): function attention_mask_func (line 31) | def attention_mask_func(attention_scores, attention_mask): function get_linear_layer (line 36) | def get_linear_layer(rows, columns, init_method): function gelu_impl (line 47) | def gelu_impl(x): function openai_gelu (line 52) | def openai_gelu(x): function erf_gelu (line 58) | def erf_gelu(x): function get_norm (line 62) | def get_norm(config): FILE: megatron/legacy/model/vision/classification.py class VitClassificationModel (line 13) | class VitClassificationModel(MegatronModule): method __init__ (line 16) | def __init__(self, config, num_classes, finetune=False, method set_input_tensor (line 44) | def set_input_tensor(self, input_tensor): method forward (line 48) | def forward(self, input): class MitClassificationModel (line 57) | class MitClassificationModel(MegatronModule): method __init__ (line 60) | def __init__(self, num_classes, method _init_weights (line 72) | def _init_weights(self, m): method set_input_tensor (line 78) | def set_input_tensor(self, input_tensor): method forward (line 82) | def forward(self, input): FILE: megatron/legacy/model/vision/dino.py class DINOLoss (line 23) | class DINOLoss(torch.nn.Module): method __init__ (line 24) | def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp, method forward (line 41) | def forward(self, student_output, teacher_output, iteration): method update_center (line 73) | def update_center(self, teacher_output): class DINOHead (line 82) | class DINOHead(torch.nn.Module): method __init__ (line 83) | def __init__(self, in_dim, out_dim, norm_last_layer=True, nlayers=3): method _init_weights (line 105) | def _init_weights(self, m): method forward (line 111) | def forward(self, x): class MultiCropWrapper (line 118) | class MultiCropWrapper(MegatronModule): method __init__ (line 128) | def __init__(self, backbone, head): method forward (line 135) | def forward(self, x): function cosine_scheduler (line 159) | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, function get_student_backbone_and_num_features (line 176) | def get_student_backbone_and_num_features(config, pre_process=True, post... function get_teacher_backbone_and_num_features (line 198) | def get_teacher_backbone_and_num_features(config, pre_process=True, post... class DINOPretrainModel (line 219) | class DINOPretrainModel(MegatronModule): method __init__ (line 220) | def __init__(self, config, pre_process=True, post_process=True): method set_input_tensor (line 266) | def set_input_tensor(self, tensor): method forward (line 269) | def forward(self, input): method cancel_gradients_last_layer (line 278) | def cancel_gradients_last_layer(self, iteration): method update_momentum (line 286) | def update_momentum(self, iteration): FILE: megatron/legacy/model/vision/esvit_swin_backbone.py class Mlp (line 25) | class Mlp(nn.Module): method __init__ (line 26) | def __init__(self, in_features, hidden_features=None, method forward (line 36) | def forward(self, x): function window_partition (line 45) | def window_partition(x, window_size): function window_reverse (line 59) | def window_reverse(windows, window_size, H, W): class WindowAttention (line 75) | class WindowAttention(nn.Module): method __init__ (line 88) | def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scal... method forward (line 122) | def forward(self, x, mask=None): method extra_repr (line 156) | def extra_repr(self) -> str: method flops (line 159) | def flops(self, N): method compute_macs (line 173) | def compute_macs(module, input, output): class SwinTransformerBlock (line 179) | class SwinTransformerBlock(nn.Module): method __init__ (line 197) | def __init__(self, dim, input_resolution, num_heads, window_size=7, sh... method create_attn_mask (line 229) | def create_attn_mask(self, H, W): method forward (line 255) | def forward(self, x): method extra_repr (line 313) | def extra_repr(self) -> str: method flops (line 317) | def flops(self): class PatchMerging (line 332) | class PatchMerging(nn.Module): method __init__ (line 340) | def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): method forward (line 347) | def forward(self, x): method extra_repr (line 377) | def extra_repr(self) -> str: method flops (line 380) | def flops(self): class BasicLayer (line 387) | class BasicLayer(nn.Module): method __init__ (line 405) | def __init__(self, dim, input_resolution, depth, num_heads, window_size, method forward (line 429) | def forward(self, x): method forward_with_features (line 436) | def forward_with_features(self, x): method forward_with_attention (line 445) | def forward_with_attention(self, x): method extra_repr (line 455) | def extra_repr(self) -> str: method flops (line 458) | def flops(self): class PatchEmbed (line 467) | class PatchEmbed(nn.Module): method __init__ (line 471) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=... method forward (line 490) | def forward(self, x): method flops (line 499) | def flops(self): class SwinTransformer (line 506) | class SwinTransformer(nn.Module): method __init__ (line 530) | def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes... method _init_weights (line 580) | def _init_weights(self, m): method no_weight_decay (line 590) | def no_weight_decay(self): method no_weight_decay_keywords (line 594) | def no_weight_decay_keywords(self): method forward (line 598) | def forward(self, x): method forward_feature_maps (line 614) | def forward_feature_maps(self, x): method forward_selfattention (line 630) | def forward_selfattention(self, x, n=1): method forward_last_selfattention (line 644) | def forward_last_selfattention(self, x): method forward_all_selfattention (line 653) | def forward_all_selfattention(self, x): method forward_return_n_last_blocks (line 663) | def forward_return_n_last_blocks(self, x, n=1, return_patch_avgpool=Fa... method flops (line 705) | def flops(self): method init_weights (line 716) | def init_weights(self, pretrained='', pretrained_layers=[], verbose=Tr... method freeze_pretrained_layers (line 783) | def freeze_pretrained_layers(self, frozen_layers=[]): function get_swin (line 810) | def get_swin(is_teacher=False): FILE: megatron/legacy/model/vision/inpainting.py class VitInpaintingModel (line 19) | class VitInpaintingModel(MegatronModule): method __init__ (line 21) | def __init__(self, config, pre_process=True, post_process=True): method set_input_tensor (line 48) | def set_input_tensor(self, input_tensor): method forward (line 51) | def forward(self, input): class MLP (line 70) | class MLP(torch.nn.Module): method __init__ (line 74) | def __init__(self, input_dim=2048, embed_dim=768): method forward (line 78) | def forward(self, x): class MitInpaintingModel (line 84) | class MitInpaintingModel(MegatronModule): method __init__ (line 87) | def __init__(self, pre_process=True, post_process=True): method set_input_tensor (line 115) | def set_input_tensor(self, input_tensor): method forward (line 119) | def forward(self, input): FILE: megatron/legacy/model/vision/knn_monitor.py function build_data_loader (line 12) | def build_data_loader(dataset, drop_last=True, shuffle=False): function compute_feature_bank (line 38) | def compute_feature_bank(model): function get_feature_bank (line 96) | def get_feature_bank(): function knn_predict (line 105) | def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, k... FILE: megatron/legacy/model/vision/mit_backbone.py class Mlp (line 13) | class Mlp(nn.Module): method __init__ (line 14) | def __init__(self, method _init_weights (line 31) | def _init_weights(self, m): method forward (line 46) | def forward(self, x, H, W): class Attention (line 56) | class Attention(nn.Module): method __init__ (line 57) | def __init__(self, method _init_weights (line 86) | def _init_weights(self, m): method forward (line 101) | def forward(self, x, H, W): class Block (line 125) | class Block(nn.Module): method __init__ (line 127) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc... method _init_weights (line 143) | def _init_weights(self, m): method forward (line 158) | def forward(self, x, H, W): class OverlapPatchEmbed (line 165) | class OverlapPatchEmbed(nn.Module): method __init__ (line 169) | def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, e... method _init_weights (line 180) | def _init_weights(self, m): method forward (line 195) | def forward(self, x): class MixVisionTransformer (line 204) | class MixVisionTransformer(nn.Module): method __init__ (line 205) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe... method _init_weights (line 260) | def _init_weights(self, m): method reset_drop_path (line 275) | def reset_drop_path(self, drop_path_rate): method freeze_patch_emb (line 293) | def freeze_patch_emb(self): method forward_features (line 296) | def forward_features(self, x): method forward (line 335) | def forward(self, x): class DWConv (line 344) | class DWConv(nn.Module): method __init__ (line 345) | def __init__(self, dim=768): method forward (line 349) | def forward(self, x, H, W): class mit_b0 (line 357) | class mit_b0(MixVisionTransformer): method __init__ (line 358) | def __init__(self, **kwargs): class mit_b1 (line 365) | class mit_b1(MixVisionTransformer): method __init__ (line 366) | def __init__(self, **kwargs): class mit_b2 (line 373) | class mit_b2(MixVisionTransformer): method __init__ (line 374) | def __init__(self, **kwargs): class mit_b3 (line 381) | class mit_b3(MixVisionTransformer): method __init__ (line 382) | def __init__(self, **kwargs): class mit_b3_avg (line 388) | class mit_b3_avg(MixVisionTransformer): method __init__ (line 389) | def __init__(self, drop_path_rate=0.1, **kwargs): class mit_b4 (line 395) | class mit_b4(MixVisionTransformer): method __init__ (line 396) | def __init__(self, **kwargs): class mit_b5 (line 402) | class mit_b5(MixVisionTransformer): method __init__ (line 403) | def __init__(self, **kwargs): class mit_b5_avg (line 409) | class mit_b5_avg(MixVisionTransformer): method __init__ (line 410) | def __init__(self, drop_path_rate=0.1, **kwargs): FILE: megatron/legacy/model/vision/swin_backbone.py class Mlp (line 19) | class Mlp(nn.Module): method __init__ (line 20) | def __init__(self, in_features, hidden_features=None, method forward (line 30) | def forward(self, x): function window_partition (line 39) | def window_partition(x, window_size): function window_reverse (line 54) | def window_reverse(windows, window_size, H, W): class WindowAttention (line 71) | class WindowAttention(nn.Module): method __init__ (line 85) | def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scal... method forward (line 119) | def forward(self, x, mask=None): method extra_repr (line 152) | def extra_repr(self) -> str: method flops (line 155) | def flops(self, N): class SwinTransformerBlock (line 169) | class SwinTransformerBlock(nn.Module): method __init__ (line 188) | def __init__(self, dim, input_resolution, num_heads, window_size=7, sh... method create_attn_mask (line 219) | def create_attn_mask(self, H, W): method forward (line 245) | def forward(self, x): method extra_repr (line 284) | def extra_repr(self) -> str: method flops (line 288) | def flops(self): class PatchMerging (line 303) | class PatchMerging(nn.Module): method __init__ (line 312) | def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): method forward (line 319) | def forward(self, x): method extra_repr (line 342) | def extra_repr(self) -> str: method flops (line 345) | def flops(self): class BasicLayer (line 352) | class BasicLayer(nn.Module): method __init__ (line 372) | def __init__(self, dim, input_resolution, depth, num_heads, window_size, method forward (line 400) | def forward(self, x): method extra_repr (line 411) | def extra_repr(self) -> str: method flops (line 414) | def flops(self): class PatchEmbed (line 423) | class PatchEmbed(nn.Module): method __init__ (line 434) | def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=9... method forward (line 453) | def forward(self, x): method flops (line 463) | def flops(self): class SwinTransformer (line 471) | class SwinTransformer(nn.Module): method __init__ (line 496) | def __init__(self, img_size=224, patch_size=4, in_chans=3, method _init_weights (line 552) | def _init_weights(self, m): method no_weight_decay (line 562) | def no_weight_decay(self): method no_weight_decay_keywords (line 566) | def no_weight_decay_keywords(self): method forward (line 569) | def forward(self, x): method flops (line 595) | def flops(self): function get_swin (line 605) | def get_swin(drop_path_rate=0.3, output_avg=False): FILE: megatron/legacy/model/vision/utils.py function resize (line 7) | def resize(input, FILE: megatron/legacy/model/vision/vit_backbone.py class VitMlpHead (line 21) | class VitMlpHead(MegatronModule): method __init__ (line 33) | def __init__(self, config, hidden_size, num_classes): method forward (line 41) | def forward(self, hidden_states): function isPerfectSquare (line 50) | def isPerfectSquare(x): function twod_interpolate_position_embeddings_hook (line 57) | def twod_interpolate_position_embeddings_hook( class VitBackbone (line 130) | class VitBackbone(MegatronModule): method __init__ (line 133) | def __init__(self, method set_input_tensor (line 208) | def set_input_tensor(self, input_tensor): method forward (line 212) | def forward(self, input): FILE: megatron/post_training/arguments.py function add_modelopt_args (line 4) | def add_modelopt_args(parser): FILE: megatron/post_training/checkpointing.py function has_modelopt_state (line 23) | def has_modelopt_state(checkpoint_path: str) -> bool: function get_sharded_load_dir (line 55) | def get_sharded_load_dir(load_dir: str) -> Tuple[Union[Path, None], str]: function load_modelopt_state (line 92) | def load_modelopt_state(model: nn.Module, load_dir: Optional[str] = None... function load_modelopt_checkpoint (line 129) | def load_modelopt_checkpoint( FILE: megatron/post_training/generate.py function simple_generate (line 15) | def simple_generate( function simple_speculative_generate (line 106) | def simple_speculative_generate( FILE: megatron/post_training/loss_func.py function _mask_loss (line 13) | def _mask_loss(output_tensor, loss_mask): function loss_func (line 39) | def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, mode... FILE: megatron/post_training/model_builder.py function count_parameters_in_layer (line 31) | def count_parameters_in_layer(model, layer_name): function _add_load_convert_hooks (line 40) | def _add_load_convert_hooks(model: MCoreGPTModel): function _load_teacher_model_config (line 48) | def _load_teacher_model_config(checkpoint_path: str) -> Namespace: function _load_teacher_model (line 117) | def _load_teacher_model(config, config_raw: Namespace, model_kwargs: Dic... function modelopt_gpt_mamba_builder (line 161) | def modelopt_gpt_mamba_builder( FILE: megatron/post_training/non_loss_data_func.py function report_draft_acceptance_length (line 11) | def report_draft_acceptance_length(model, osl: int = 64, draft_steps: in... FILE: megatron/post_training/utils.py function modelopt_version_higher_than (line 16) | def modelopt_version_higher_than(target_version: str): function modelopt_version_at_least (line 27) | def modelopt_version_at_least(target_version: str): function function_has_parameter (line 39) | def function_has_parameter(function, argument_name: str) -> bool: function get_current_memory_info (line 44) | def get_current_memory_info(): function report_current_memory_info (line 57) | def report_current_memory_info(): function get_mtbench_chat_data (line 63) | def get_mtbench_chat_data(): function to_empty_if_meta (line 81) | def to_empty_if_meta(module: torch.nn.Module, *, device: torch.device, r... function print_distributed_quant_summary (line 101) | def print_distributed_quant_summary(model, msg=""): FILE: megatron/rl/__init__.py function import_class (line 16) | def import_class(class_path: str) -> Type: class TypeLookupable (line 38) | class TypeLookupable(BaseModel, extra='allow'): method unwrap (line 43) | def unwrap(self) -> Self: method register_subclass (line 48) | def register_subclass(cls, register_type: Type[Self]) -> Type[Self]: class GenericGenerationArgs (line 60) | class GenericGenerationArgs(BaseModel): method add (line 69) | def add(self, generation_args: 'GenericGenerationArgs') -> 'GenericGen... class Request (line 75) | class Request(BaseModel): FILE: megatron/rl/agent/api.py class AgentBaseModel (line 23) | class AgentBaseModel(BaseModel, extra='allow'): class RolloutRequest (line 27) | class RolloutRequest(Request): class GroupedRolloutRequest (line 35) | class GroupedRolloutRequest(Request): class Rollout (line 47) | class Rollout(AgentBaseModel): class TokenRollout (line 60) | class TokenRollout(AgentBaseModel): class RolloutGroup (line 77) | class RolloutGroup(AgentBaseModel): method __iter__ (line 84) | def __iter__(self): method __len__ (line 87) | def __len__(self): method __getitem__ (line 90) | def __getitem__(self, idx): class ContrastiveRollout (line 97) | class ContrastiveRollout(AgentBaseModel): class Head2HeadRolloutRequest (line 104) | class Head2HeadRolloutRequest(Request): class EvaluationRequest (line 110) | class EvaluationRequest(Request): class EvaluationResult (line 121) | class EvaluationResult(AgentBaseModel): class RewardEvaluationResult (line 126) | class RewardEvaluationResult(EvaluationResult): class EvaluationResponse (line 134) | class EvaluationResponse(AgentBaseModel, TypeLookupable, Generic[T]): method metrics (line 138) | def metrics(self): class Agent (line 142) | class Agent(ABC, AgentBaseModel): class RolloutGenerator (line 146) | class RolloutGenerator(Agent, ABC): method rollout (line 150) | async def rollout(self, request: RolloutRequest) -> Rollout: ... method get_reward_rollouts (line 152) | async def get_reward_rollouts(self, request: RolloutRequest) -> list[R... class ContrastiveRolloutGenerator (line 162) | class ContrastiveRolloutGenerator(Agent, ABC): method get_contrastive_rollouts (line 166) | async def get_contrastive_rollouts( class TokenizedRolloutGenerator (line 171) | class TokenizedRolloutGenerator(Agent, ABC): method rollout (line 179) | async def rollout(self, request: RolloutRequest) -> TokenRollout: ... method get_reward_rollouts (line 181) | async def get_reward_rollouts(self, request: RolloutRequest) -> list[T... class GroupedRolloutGenerator (line 191) | class GroupedRolloutGenerator(Agent, ABC): method __init__ (line 197) | def __init__(self, *, parallel_generation_tasks: int | None = None, **... method group_rollout (line 203) | async def group_rollout(self, request: GroupedRolloutRequest) -> list[... method get_grouped_rollouts (line 205) | async def get_grouped_rollouts(self, request: GroupedRolloutRequest): class EvaluationAgent (line 302) | class EvaluationAgent(Agent, ABC): method run_evaluation (line 306) | async def run_evaluation(self, request: EvaluationRequest) -> Evaluati... FILE: megatron/rl/agent/huggingface_dataset_agent.py class HFDatasetAgent (line 7) | class HFDatasetAgent(BaseModel): method __init__ (line 22) | def __init__(self, **data): method load_hf_dataset (line 26) | def load_hf_dataset(self): FILE: megatron/rl/agent/pass_at_evaluation_agent.py function pass_at_k (line 14) | def pass_at_k(n_samples: int, n_correct: int, k: int) -> float: class PassAtEvaluationResult (line 26) | class PassAtEvaluationResult(RewardEvaluationResult): class PassAtEvaluationResponse (line 34) | class PassAtEvaluationResponse(EvaluationResponse[PassAtEvaluationResult]): method metrics (line 37) | def metrics(self): class PassAtEvaluationAgent (line 47) | class PassAtEvaluationAgent(EvaluationAgent, ABC): method __init__ (line 49) | def __init__(self, max_k=32, **kwargs): method _evaluation (line 54) | async def _evaluation( method evaluation (line 58) | async def evaluation( FILE: megatron/rl/agent/remote_agent.py class RemoteAgent (line 7) | class RemoteAgent(FastAPIEnvServer, RolloutGenerator, GroupedRolloutGene... FILE: megatron/rl/agent/reward_only_agent.py class RewardOnlyEvaluationResponse (line 30) | class RewardOnlyEvaluationResponse(EvaluationResponse[RewardEvaluationRe... method metrics (line 33) | def metrics(self): class RewardOnlyAgent (line 37) | class RewardOnlyAgent(RolloutGenerator, GroupedRolloutGenerator, PassAtE... method get_dataset (line 42) | def get_dataset(self, validation: bool = False): method get_reward (line 46) | async def get_reward(self, response: str, golden: Any) -> float: method get_prompt (line 50) | async def get_prompt(self, validation: bool) -> tuple[str, Any]: method evaluation_prompts (line 54) | async def evaluation_prompts( method _get_rank_subset (line 60) | def _get_rank_subset( method rollout_from_response (line 84) | async def rollout_from_response( method rollout (line 124) | async def rollout(self, request: RolloutRequest) -> Rollout: method group_rollout (line 136) | async def group_rollout(self, request: GroupedRolloutRequest) -> list[... method _evaluation (line 147) | async def _evaluation( method run_evaluation (line 168) | async def run_evaluation(self, request: EvaluationRequest): FILE: megatron/rl/agent/weighted_multi_task.py class AgentConfig (line 24) | class AgentConfig(AgentBaseModel): method __init__ (line 32) | def __init__(self, **data): class WeightedMultiTask (line 38) | class WeightedMultiTask( method __init__ (line 43) | def __init__(self, agent_configs: list[AgentConfig]): method from_config (line 69) | def from_config( method _distribute_counts (line 106) | def _distribute_counts(self, total_count: int, distribute_remainder: b... method group_rollout (line 156) | async def group_rollout(self, request: GroupedRolloutRequest) -> list[... method rollout (line 161) | async def rollout(self, request: RolloutRequest) -> Rollout: method get_reward_rollouts (line 166) | async def get_reward_rollouts(self, request: RolloutRequest) -> list[R... method get_grouped_rollouts (line 186) | async def get_grouped_rollouts(self, request: GroupedRolloutRequest): method get_contrastive_rollouts (line 246) | async def get_contrastive_rollouts(self, request: RolloutRequest) -> l... method run_evaluation (line 271) | async def run_evaluation(self, request: EvaluationRequest) -> list[Eva... FILE: megatron/rl/inference/api.py class LLMChatMessage (line 8) | class LLMChatMessage(BaseModel): class InferenceRequest (line 13) | class InferenceRequest(Request): class InferenceResponse (line 18) | class InferenceResponse(BaseModel): FILE: megatron/rl/inference/inference_interface.py class InferenceInterface (line 15) | class InferenceInterface(BaseModel): class Config (line 18) | class Config: method prepare_request (line 21) | def prepare_request( method base_generate (line 27) | async def base_generate(self, request: InferenceRequest) -> InferenceR... method agenerate (line 30) | async def agenerate( method generate (line 35) | def generate( class ReturnsRaw (line 45) | class ReturnsRaw(InferenceInterface): class ReturnsTokens (line 51) | class ReturnsTokens(InferenceInterface): class ReturnsLogProbs (line 57) | class ReturnsLogProbs(ReturnsTokens): FILE: megatron/rl/inference/megatron.py class MegatronLocal (line 36) | class MegatronLocal(InferenceServer, ReturnsTokens, ReturnsRaw): method base_generate (line 47) | async def base_generate(self, request: InferenceRequest) -> InferenceR... method launch (line 85) | async def launch(cls, model: GPTModel, **kwargs): method kill (line 147) | async def kill(self): method set_generation_epoch (line 168) | def set_generation_epoch(self, generation_epoch: int): method suspend (line 172) | async def suspend(self): method resume (line 181) | async def resume(self): FILE: megatron/rl/logging.py function log (line 18) | def log(message): FILE: megatron/rl/parallel_utils.py function build_inference_pg_collection (line 16) | def build_inference_pg_collection( FILE: megatron/rl/rl_utils.py function _torch_saver_swap_inference_model (line 112) | def _torch_saver_swap_inference_model(*, to_cpu: bool) -> None: function _maybe_prefetch_separate_inference_model_weights (line 145) | def _maybe_prefetch_separate_inference_model_weights(model_core, *, to_c... function verify_model_weights_swap (line 179) | def verify_model_weights_swap( class RolloutStats (line 273) | class RolloutStats: class RLRuntimeState (line 296) | class RLRuntimeState: method __init__ (line 299) | def __init__(self): method reset_iteration_counters (line 305) | def reset_iteration_counters(self, iteration): method increment_sequences (line 310) | def increment_sequences(self, count): function get_rl_runtime_state (line 320) | def get_rl_runtime_state(): function update_inference_logprobs_group_stats (line 325) | def update_inference_logprobs_group_stats( function align_unpacked_inference_logprobs (line 359) | def align_unpacked_inference_logprobs( function get_agent (line 424) | def get_agent(args, parallel_generation_tasks: int | None = None): function get_inference_interface (line 442) | def get_inference_interface(args, loop, model): function get_rollout_generator (line 458) | def get_rollout_generator(args, inference_interface, n_prompts, samples_... function get_environment_rollouts (line 480) | def get_environment_rollouts( function selective_log_softmax (line 596) | def selective_log_softmax(logits, index): function get_logprobs (line 637) | def get_logprobs(model, tokens, position_ids, no_grad=False, sequence_pa... function calculate_grpo_advantages (line 717) | def calculate_grpo_advantages(rewards: list[list[float]], num_turns: lis... function compute_group_stats (line 744) | def compute_group_stats( function prep_wandb_metrics (line 845) | def prep_wandb_metrics( function maybe_log_training_metrics (line 956) | def maybe_log_training_metrics( function prepare_trajectories (line 1037) | def prepare_trajectories( function logprobs_forward_step (line 1164) | def logprobs_forward_step(data_iterator, model, is_correction, packing_c... function compute_logprobs_batch (line 1195) | def compute_logprobs_batch( function prepare_data_for_update (line 1245) | def prepare_data_for_update( function get_grpo_data_iterator (line 1518) | def get_grpo_data_iterator( function evaluate_and_print_results_rl (line 1590) | def evaluate_and_print_results_rl( function calculate_grpo_loss (line 1703) | def calculate_grpo_loss( function megatron_rl_inference_mode (line 1798) | def megatron_rl_inference_mode( function rl_inference_interface_shutdown (line 1920) | def rl_inference_interface_shutdown(): function get_iteration_sequence_count (line 1943) | def get_iteration_sequence_count(args): function _pad_nonnull_with_zeros (line 1953) | def _pad_nonnull_with_zeros(data: list[Optional[torch.Tensor]], max_len:... FILE: megatron/rl/sequence_packing_utils.py class PackingInfo (line 25) | class PackingInfo: class PackingContext (line 43) | class PackingContext: function load_packed_data_by_index (line 74) | def load_packed_data_by_index(bin_idx: int, packing_context: PackingCont... function log_packing_efficiency (line 140) | def log_packing_efficiency(packing_context: PackingContext): function get_actual_sequence_lengths (line 282) | def get_actual_sequence_lengths(sequences: torch.Tensor, pad_token: int)... function create_empty_bins (line 311) | def create_empty_bins( function get_default_packed_seq_params (line 398) | def get_default_packed_seq_params(seq_length: int, max_sequences_per_bin... function create_packed_seq_params (line 434) | def create_packed_seq_params(packing_context: PackingContext): function create_packed_seq_params_for_bin (line 451) | def create_packed_seq_params_for_bin( function pack_inference_logprobs (line 511) | def pack_inference_logprobs( function compute_packed_inference_logprobs_stats (line 582) | def compute_packed_inference_logprobs_stats( class SequencePacker (line 623) | class SequencePacker: method __init__ (line 626) | def __init__(self, bin_size: int, pad_token: int, max_sequences_per_bi... method pack_sequences (line 631) | def pack_sequences( function distribute_packed_bins (line 800) | def distribute_packed_bins( function pack_all_trajectories (line 973) | def pack_all_trajectories(trajs, generation_masks, inference_logprobs, g... function update_microbatch_calculator (line 1067) | def update_microbatch_calculator( function get_microbatch_dataloader (line 1154) | def get_microbatch_dataloader(num_bins_this_rank, micro_batch_size): function get_sequence_packing_log_info (line 1159) | def get_sequence_packing_log_info(args): function get_sequence_packing_tensorboard_metrics (line 1166) | def get_sequence_packing_tensorboard_metrics(args): FILE: megatron/rl/server/agent/fastapi_env_server.py class FastAPIEnvServer (line 43) | class FastAPIEnvServer(EnvironmentServer): method launch (line 49) | async def launch(cls, env_cls: type[Agent], cls_args: dict, port: int,... method kill (line 101) | def kill(self): method get_contrastive_rollouts (line 104) | async def get_contrastive_rollouts(self, request: RolloutRequest) -> l... method group_rollout (line 119) | async def group_rollout(self, request: GroupedRolloutRequest): method get_grouped_rollouts (line 124) | async def get_grouped_rollouts( method rollout (line 141) | async def rollout(self, request: RolloutRequest) -> TokenRollout: method get_reward_rollouts (line 146) | async def get_reward_rollouts(self, request: RolloutRequest) -> list[T... method run_evaluation (line 159) | async def run_evaluation(self, request: EvaluationRequest) -> Evaluati... function run (line 173) | def run(agent_cls: type[Agent], cls_args: dict, port: int): FILE: megatron/rl/server/api.py class Server (line 11) | class Server(TypeLookupable): method launch (line 15) | async def launch(cls) -> Self: method suspend (line 18) | async def suspend(self): method resume (line 21) | async def resume(self): method kill (line 24) | async def kill(self): class InferenceServer (line 28) | class InferenceServer(Server, InferenceInterface): class EnvironmentServer (line 34) | class EnvironmentServer(Server): class RemoteRolloutRequest (line 40) | class RemoteRolloutRequest(RolloutRequest): class RemoteGroupedRolloutRequest (line 44) | class RemoteGroupedRolloutRequest(GroupedRolloutRequest): class RemoteEvaluationRequest (line 48) | class RemoteEvaluationRequest(EvaluationRequest): FILE: megatron/rl/server/inference/inference_interface_server.py class InferenceInterfaceClient (line 25) | class InferenceInterfaceClient(InferenceServer): method base_generate (line 30) | async def base_generate(self, request: InferenceRequest) -> InferenceR... class InferenceInterfaceServer (line 39) | class InferenceInterfaceServer(InferenceInterfaceClient, ReturnsRaw, Ret... method launch (line 47) | async def launch(cls, interface_cls: type[InferenceInterface], **kwarg... method kill (line 82) | async def kill(self): method suspend (line 89) | async def suspend(self): method resume (line 93) | async def resume(self): FILE: megatron/training/argument_utils.py class TypeInferenceError (line 17) | class TypeInferenceError(Exception): class ArgumentGroupFactory (line 21) | class ArgumentGroupFactory: method __init__ (line 66) | def __init__(self, src_cfg_class: type, exclude: Optional[list[str]] =... method _format_arg_name (line 71) | def _format_arg_name(self, config_attr_name: str, prefix: Optional[str... method _get_enum_kwargs (line 85) | def _get_enum_kwargs(self, config_type: enum.EnumMeta) -> dict[str, Any]: method _extract_type (line 96) | def _extract_type(self, config_type: type) -> dict[str, Any]: method _build_argparse_kwargs_from_field (line 136) | def _build_argparse_kwargs_from_field(self, attribute: Field) -> dict[... method build_group (line 190) | def build_group(self, parser: ArgumentParser, title: Optional[str] = N... method _get_field_docstrings (line 209) | def _get_field_docstrings(self, src_cfg_class: type) -> dict[str, str]: FILE: megatron/training/arguments.py function add_megatron_arguments (line 49) | def add_megatron_arguments(parser: argparse.ArgumentParser): function parse_args (line 88) | def parse_args(extra_args_provider=None, ignore_unknown_args=False): function validate_model_config_args_from_heterogeneous_config (line 126) | def validate_model_config_args_from_heterogeneous_config(args): function _eval_pattern (line 199) | def _eval_pattern(pattern): function no_rope_freq_type (line 209) | def no_rope_freq_type(x): function moe_freq_type (line 232) | def moe_freq_type(x): function la_freq_type (line 257) | def la_freq_type(x): function tuple_type (line 282) | def tuple_type(x): function validate_args (line 294) | def validate_args(args, defaults={}): function _print_args (line 1623) | def _print_args(title, args): function _check_arg_is_not_none (line 1637) | def _check_arg_is_not_none(args, arg): function core_transformer_config_from_args (line 1641) | def core_transformer_config_from_args(args, config_class=None): function _add_transformer_engine_args (line 1729) | def _add_transformer_engine_args(parser): function _add_inference_args (line 1743) | def _add_inference_args(parser): function _add_network_size_args (line 1922) | def _add_network_size_args(parser): function _add_straggler_detector_args (line 2087) | def _add_straggler_detector_args(parser): function _add_workload_inspector_server_args (line 2095) | def _add_workload_inspector_server_args(parser): function _add_inprocess_restart_args (line 2101) | def _add_inprocess_restart_args(parser): function _add_one_logger_args (line 2150) | def _add_one_logger_args(parser): function _add_ft_package_args (line 2179) | def _add_ft_package_args(parser): function _add_logging_args (line 2195) | def _add_logging_args(parser): function _add_regularization_args (line 2204) | def _add_regularization_args(parser): function _add_rl_args (line 2264) | def _add_rl_args(parser): function _add_training_args (line 2412) | def _add_training_args(parser): function _add_rerun_machine_args (line 2512) | def _add_rerun_machine_args(parser): function _add_initialization_args (line 2521) | def _add_initialization_args(parser): function _add_learning_rate_args (line 2533) | def _add_learning_rate_args(parser): function _add_checkpointing_args (line 2558) | def _add_checkpointing_args(parser): function _add_mixed_precision_args (line 2588) | def _add_mixed_precision_args(parser): function _add_distributed_args (line 2619) | def _add_distributed_args(parser): function _add_validation_args (line 2738) | def _add_validation_args(parser): function _add_tokenizer_args (line 2747) | def _add_tokenizer_args(parser): function _add_data_args (line 2806) | def _add_data_args(parser): function _add_autoresume_args (line 2930) | def _add_autoresume_args(parser): function _add_biencoder_args (line 2942) | def _add_biencoder_args(parser): function _add_vision_args (line 2999) | def _add_vision_args(parser): function _add_moe_args (line 3066) | def _add_moe_args(parser): function _add_mla_args (line 3097) | def _add_mla_args(parser): function _add_experimental_attention_variant_args (line 3127) | def _add_experimental_attention_variant_args(parser): function _add_heterogeneous_args (line 3141) | def _add_heterogeneous_args(parser): function _add_experimental_args (line 3191) | def _add_experimental_args(parser): function _add_msc_args (line 3254) | def _add_msc_args(parser): function _add_kitchen_quantization_arguments (line 3260) | def _add_kitchen_quantization_arguments(parser: argparse.ArgumentParser): function _add_sft_args (line 3290) | def _add_sft_args(parser): FILE: megatron/training/async_utils.py function init_persistent_async_worker (line 25) | def init_persistent_async_worker(rank: int, mp_mode: str = 'spawn'): function schedule_async_save (line 47) | def schedule_async_save(async_request: AsyncRequest): function maybe_finalize_async_save (line 56) | def maybe_finalize_async_save(blocking: bool = False, terminate=False): function is_empty_async_queue (line 84) | def is_empty_async_queue() -> bool: function reset_persistent_async_worker (line 93) | def reset_persistent_async_worker(): FILE: megatron/training/checkpointing.py function finalize_deletion_processes (line 78) | def finalize_deletion_processes(blocking=False): function set_checkpoint_version (line 101) | def set_checkpoint_version(value): function get_checkpoint_version (line 109) | def get_checkpoint_version(): function set_loaded_iteration (line 114) | def set_loaded_iteration(value): function get_loaded_iteration (line 124) | def get_loaded_iteration(): function check_checkpoint_args (line 130) | def check_checkpoint_args(checkpoint_args): function isfile (line 172) | def isfile(filename) -> bool: function ensure_directory_exists (line 180) | def ensure_directory_exists(filename, check_parent=True): function get_checkpoint_name (line 190) | def get_checkpoint_name(checkpoints_path, iteration, release=False, function get_load_checkpoint_path_by_args (line 232) | def get_load_checkpoint_path_by_args(args, load_arg="load"): function get_distributed_optimizer_checkpoint_name (line 249) | def get_distributed_optimizer_checkpoint_name(model_checkpoint_name): function find_checkpoint_rank_0 (line 254) | def find_checkpoint_rank_0(checkpoints_path, iteration, release=False): function get_checkpoint_tracker_filename (line 305) | def get_checkpoint_tracker_filename(checkpoints_path): function checkpoint_exists (line 312) | def checkpoint_exists(checkpoints_path): function read_metadata (line 319) | def read_metadata(tracker_filename): function get_rng_state (line 364) | def get_rng_state(ckpt_format: str, tp_group: torch.distributed.ProcessG... class CheckpointType (line 402) | class CheckpointType(Enum): function _build_sharded_state_dict_metadata (line 410) | def _build_sharded_state_dict_metadata(args: Namespace, dp_cp_group: Opt... function save_grads (line 446) | def save_grads(save_dir, state_dict, iteration, grad_label): function save_checkpoint (line 480) | def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, nu... function _async_delete_checkpoint_impl (line 826) | def _async_delete_checkpoint_impl(save_path, iteration_to_delete, log_pr... function cleanup_old_non_persistent_checkpoint (line 861) | def cleanup_old_non_persistent_checkpoint(save_dir, leave_ckpt_num=1, do... function maybe_save_dataloader_state (line 884) | def maybe_save_dataloader_state(train_iterator, iteration, dataloader_sa... function generate_state_dict (line 933) | def generate_state_dict( function preprocess_fsdp_dtensor_state_dict (line 1011) | def preprocess_fsdp_dtensor_state_dict(args, raw_state_dict, model): function _transpose_first_dim (line 1033) | def _transpose_first_dim(t, num_splits, num_splits_first, model): function fix_query_key_value_ordering (line 1072) | def fix_query_key_value_ordering(model, checkpoint_version): function _get_non_persistent_iteration (line 1103) | def _get_non_persistent_iteration(non_persistent_global_dir, args, check... function _load_non_persistent_base_checkpoint (line 1124) | def _load_non_persistent_base_checkpoint( function _load_global_dist_base_checkpoint (line 1159) | def _load_global_dist_base_checkpoint( function _get_checkpoint_format (line 1199) | def _get_checkpoint_format(checkpoint_name, args): function _load_base_checkpoint (line 1225) | def _load_base_checkpoint( function load_args_from_checkpoint (line 1396) | def load_args_from_checkpoint( function load_checkpoint (line 1561) | def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg=... function _to_dtensor (line 2013) | def _to_dtensor(wrapped_model, model_state_dict): function load_biencoder_checkpoint (line 2027) | def load_biencoder_checkpoint(model, only_query_model=False, FILE: megatron/training/config/common_config.py class RNGConfig (line 7) | class RNGConfig: class ProfilingConfig (line 25) | class ProfilingConfig: class DistributedInitConfig (line 71) | class DistributedInitConfig: FILE: megatron/training/config/resilience_config.py class RerunStateMachineConfig (line 6) | class RerunStateMachineConfig: class StragglerDetectionConfig (line 28) | class StragglerDetectionConfig: FILE: megatron/training/config/training_config.py class TrainingConfig (line 7) | class TrainingConfig: class ValidationConfig (line 91) | class ValidationConfig: class SchedulerConfig (line 120) | class SchedulerConfig: class LoggerConfig (line 199) | class LoggerConfig: class CheckpointConfig (line 326) | class CheckpointConfig: FILE: megatron/training/datasets/data_samplers.py function build_pretraining_data_loader (line 19) | def build_pretraining_data_loader(dataset, consumed_samples): class MegatronPretrainingSampler (line 112) | class MegatronPretrainingSampler: method __init__ (line 121) | def __init__( method __len__ (line 151) | def __len__(self): method get_start_end_idx (line 154) | def get_start_end_idx(self): method __iter__ (line 166) | def __iter__(self): class HybridCPMegatronPretrainingSampler (line 181) | class HybridCPMegatronPretrainingSampler(MegatronPretrainingSampler): method __init__ (line 189) | def __init__(self, total_samples, consumed_samples, micro_batch_size, ... method __len__ (line 196) | def __len__(self): method get_start_end_idx_global_batch (line 199) | def get_start_end_idx_global_batch(self): method __iter__ (line 204) | def __iter__(self): class RandomSeedDataset (line 225) | class RandomSeedDataset(Dataset): method __init__ (line 242) | def __init__(self, dataset, seed): method __len__ (line 247) | def __len__(self): method set_epoch (line 250) | def set_epoch(self, epoch): method __getitem__ (line 259) | def __getitem__(self, idx): class MegatronPretrainingRandomSampler (line 267) | class MegatronPretrainingRandomSampler: method __init__ (line 275) | def __init__( method __len__ (line 306) | def __len__(self): method __iter__ (line 309) | def __iter__(self): FILE: megatron/training/datasets/fim_dataset.py class GPTFIMDatasetConfig (line 16) | class GPTFIMDatasetConfig(GPTDatasetConfig): class GPTFIMDataset (line 38) | class GPTFIMDataset(GPTDataset): method __init__ (line 54) | def __init__( method _query_document_sample_shuffle_indices (line 104) | def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[np... method _fim_permute_sequence (line 184) | def _fim_permute_sequence(self, sequence, rate): method _fim_split_and_permute_sequence (line 198) | def _fim_split_and_permute_sequence(self, sequence): method _permute (line 233) | def _permute( FILE: megatron/training/datasets/sft_dataset.py class SFTLowLevelDataset (line 17) | class SFTLowLevelDataset: method __init__ (line 35) | def __init__(self, dataset_path: str) -> None: method __len__ (line 44) | def __len__(self) -> int: method __getitem__ (line 47) | def __getitem__(self, idx: int) -> list: class SFTDataset (line 51) | class SFTDataset(MegatronDataset): method __init__ (line 54) | def __init__( method numel_low_level_dataset (line 66) | def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: method build_low_level_dataset (line 70) | def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfi... method __len__ (line 73) | def __len__(self) -> int: method _split_conversations (line 76) | def _split_conversations(self, merged_conversations): method __getitem__ (line 91) | def __getitem__(self, idx: int) -> Dict[str, Any]: FILE: megatron/training/dgrad_logging.py function _get_linear_types (line 15) | def _get_linear_types(): class DataGradLogger (line 50) | class DataGradLogger: method __init__ (line 56) | def __init__(self, save_dir: str): method _make_hook (line 61) | def _make_hook(self, model_chunk_name: str, module_name: str): method save (line 74) | def save(self, iteration: int): method register_hooks (line 81) | def register_hooks(self, model: torch.nn.Module): method remove_hooks (line 94) | def remove_hooks(self): function enable_dgrad_logging (line 104) | def enable_dgrad_logging(model: torch.nn.Module, save_dir: str): function disable_dgrad_logging (line 112) | def disable_dgrad_logging(): function save_dgrads (line 119) | def save_dgrads(iteration: int): FILE: megatron/training/dist_signal_handler.py function get_world_size (line 6) | def get_world_size(): function get_device (line 14) | def get_device(local_rank=None): function all_gather_item (line 28) | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=... class DistributedSignalHandler (line 50) | class DistributedSignalHandler: method __init__ (line 51) | def __init__(self, sig: signal.Signals = signal.SIGTERM): method signals_received (line 54) | def signals_received(self): method __enter__ (line 60) | def __enter__(self): method __exit__ (line 72) | def __exit__(self, type, value, tb): method release (line 75) | def release(self): FILE: megatron/training/ft_integration.py function get_rank_monitor_client (line 67) | def get_rank_monitor_client() -> Optional[Any]: function setup (line 76) | def setup() -> None: function on_training_step_start (line 121) | def on_training_step_start() -> None: function on_training_step_end (line 136) | def on_training_step_end() -> None: function on_eval_step_start (line 146) | def on_eval_step_start() -> None: function on_eval_step_end (line 159) | def on_eval_step_end() -> None: function on_checkpointing_start (line 169) | def on_checkpointing_start() -> None: function on_checkpointing_end (line 176) | def on_checkpointing_end(is_async_finalization: bool) -> None: function on_checkpoint_loaded (line 193) | def on_checkpoint_loaded(is_local_chkpt: bool) -> None: function shutdown (line 207) | def shutdown() -> None: function _load_state_if_exists (line 219) | def _load_state_if_exists(): function _update_timeouts (line 228) | def _update_timeouts(selected_sections, calc_out_of_section): function _maybe_update_timeouts (line 244) | def _maybe_update_timeouts(is_closing_ft=False): function maybe_setup_simulated_fault (line 300) | def maybe_setup_simulated_fault() -> None: FILE: megatron/training/global_vars.py function get_args (line 27) | def get_args(): function get_tokenizer (line 33) | def get_tokenizer(): function get_tensorboard_writer (line 39) | def get_tensorboard_writer(): function get_wandb_writer (line 45) | def get_wandb_writer(): function get_one_logger (line 51) | def get_one_logger(): function get_adlr_autoresume (line 56) | def get_adlr_autoresume(): function get_timers (line 62) | def get_timers(): function get_energy_monitor (line 67) | def get_energy_monitor(): function get_signal_handler (line 72) | def get_signal_handler(): function _set_signal_handler (line 77) | def _set_signal_handler(exit_signal): function set_global_variables (line 85) | def set_global_variables(args, build_tokenizer=True): function unset_global_variables (line 120) | def unset_global_variables(): function set_args (line 151) | def set_args(args): function _build_tokenizer (line 156) | def _build_tokenizer(args): function rebuild_tokenizer (line 164) | def rebuild_tokenizer(args): function _set_tensorboard_writer (line 170) | def _set_tensorboard_writer(args): function _set_wandb_writer (line 190) | def _set_wandb_writer(args): function _set_one_logger (line 222) | def _set_one_logger(args): function _set_adlr_autoresume (line 246) | def _set_adlr_autoresume(args): function _set_timers (line 264) | def _set_timers(args): function _set_energy_monitor (line 270) | def _set_energy_monitor(args): function _ensure_var_is_initialized (line 277) | def _ensure_var_is_initialized(var, name): function _ensure_var_is_not_initialized (line 282) | def _ensure_var_is_not_initialized(var, name): function destroy_global_vars (line 286) | def destroy_global_vars(): FILE: megatron/training/initialize.py function initialize_megatron (line 41) | def initialize_megatron( function _compile_dependencies (line 179) | def _compile_dependencies(): function _initialize_tp_communicators (line 242) | def _initialize_tp_communicators(): function _initialize_distributed (line 315) | def _initialize_distributed(get_embedding_ranks, get_position_embedding_... function _init_autoresume (line 431) | def _init_autoresume(): function _set_random_seed (line 440) | def _set_random_seed( function write_args_to_tensorboard (line 465) | def write_args_to_tensorboard(): function set_jit_fusion_options (line 474) | def set_jit_fusion_options(): function _warmup_jit_function (line 498) | def _warmup_jit_function(): function setup_logging (line 561) | def setup_logging() -> None: FILE: megatron/training/inprocess_restart.py function destroy_state (line 25) | def destroy_state(): function inprocess_restart (line 30) | def inprocess_restart(train, args): function maybe_wrap_for_inprocess_restart (line 126) | def maybe_wrap_for_inprocess_restart(pretrain): function maybe_force_nccl_backend_init (line 148) | def maybe_force_nccl_backend_init(device_id): FILE: megatron/training/log_handler.py class CustomHandler (line 9) | class CustomHandler(StreamHandler): method __init__ (line 15) | def __init__(self): method filter (line 18) | def filter(self, record: LogRecord) -> bool: FILE: megatron/training/one_logger_utils.py function get_timestamp_in_ms (line 9) | def get_timestamp_in_ms(): function on_train_start (line 18) | def on_train_start(iteration, consumed_train_samples, train_samples, seq... function _produce_e2e_metrics (line 76) | def _produce_e2e_metrics(log_throughput=False, throughput=None): function track_e2e_metrics (line 209) | def track_e2e_metrics(log_throughput=False, throughput=None): function on_save_checkpoint_start (line 226) | def on_save_checkpoint_start(async_save): function on_pretrain_start (line 265) | def on_pretrain_start(): function track_config_flags (line 300) | def track_config_flags(train_iters, skip_train, do_train, do_valid, do_t... function on_save_checkpoint_success (line 322) | def on_save_checkpoint_success(productive_metrics, async_save): function on_save_checkpoint_end (line 366) | def on_save_checkpoint_end(save_checkpoint_duration, current_iteration, ... function track_app_tag (line 437) | def track_app_tag(batch_size, world_size, seq_length): function finish (line 456) | def finish(): FILE: megatron/training/theoretical_memory_usage.py function compute_weight_and_optimizer_memory (line 12) | def compute_weight_and_optimizer_memory(args, verbose=False): function compute_activation_memory (line 192) | def compute_activation_memory(args, num_microbatches, verbose=False): function compute_activation_memory_without_sp (line 266) | def compute_activation_memory_without_sp(args, num_microbatches, verbose... function report_theoretical_memory (line 340) | def report_theoretical_memory(args, num_microbatches=None, verbose=False): FILE: megatron/training/training.py function set_startup_timestamps (line 16) | def set_startup_timestamps(program_start=None, main_entry=None): function destroy_global_state (line 209) | def destroy_global_state(): function print_datetime (line 218) | def print_datetime(string, override_timestamp=None): function num_floating_point_operations (line 228) | def num_floating_point_operations(args, batch_size): function get_start_time_from_progress_log (line 630) | def get_start_time_from_progress_log(): function preprocess_common_state_dict (line 673) | def preprocess_common_state_dict(common_state_dict): function pretrain (line 718) | def pretrain( function update_train_iters (line 1178) | def update_train_iters(args): function get_model (line 1211) | def get_model(model_provider_func, model_type=ModelType.encoder_or_decod... function get_optimizer_param_scheduler (line 1404) | def get_optimizer_param_scheduler(optimizer): function get_megatron_optimizer_config (line 1460) | def get_megatron_optimizer_config(args: Any) -> OptimizerConfig: function setup_model_and_optimizer (line 1488) | def setup_model_and_optimizer( function dummy_train_step (line 1661) | def dummy_train_step(data_iterator): function train_step (line 1672) | def train_step(forward_step_func, data_iterator, model, optimizer, opt_p... function training_log (line 1842) | def training_log( function compute_throughputs_and_append_to_progress_log (line 2189) | def compute_throughputs_and_append_to_progress_log(iteration, num_floati... function enable_forward_pre_hook (line 2222) | def enable_forward_pre_hook(model_chunks): function disable_forward_pre_hook (line 2228) | def disable_forward_pre_hook(model_chunks, param_sync=True): function force_param_sync (line 2234) | def force_param_sync(model_chunks: list[DDP]) -> None: function save_checkpoint_and_time (line 2243) | def save_checkpoint_and_time( function post_training_step_callbacks (line 2325) | def post_training_step_callbacks( function checkpoint_and_decide_exit (line 2392) | def checkpoint_and_decide_exit( function train (line 2504) | def train( function evaluate (line 3163) | def evaluate( function evaluate_and_print_results (line 3328) | def evaluate_and_print_results( function cyclic_iter (line 3420) | def cyclic_iter(iter): function get_train_valid_test_num_samples (line 3426) | def get_train_valid_test_num_samples(): function build_train_valid_test_datasets (line 3460) | def build_train_valid_test_datasets(build_train_valid_test_datasets_prov... function build_train_valid_test_data_loaders (line 3471) | def build_train_valid_test_data_loaders(build_train_valid_test_datasets_... function build_train_valid_test_data_iterators (line 3555) | def build_train_valid_test_data_iterators(build_train_valid_test_dataset... function should_disable_forward_pre_hook (line 3629) | def should_disable_forward_pre_hook(args): FILE: megatron/training/utils.py function calc_params_l2_norm (line 48) | def calc_params_l2_norm(model, force_create_fp32_copy=False): function calc_dtensor_params_l2_norm (line 197) | def calc_dtensor_params_l2_norm(params): function average_losses_across_data_parallel_group (line 234) | def average_losses_across_data_parallel_group(losses): function reduce_max_stat_across_model_parallel_group (line 243) | def reduce_max_stat_across_model_parallel_group(stat: float) -> float | ... function logical_and_across_model_parallel_group (line 264) | def logical_and_across_model_parallel_group(input: bool) -> bool: function report_memory (line 279) | def report_memory(name): function print_params_min_max_norm (line 294) | def print_params_min_max_norm(optimizer, iteration): function check_adlr_autoresume_termination (line 313) | def check_adlr_autoresume_termination(iteration, model, optimizer, opt_p... function get_ltor_masks_and_position_ids (line 331) | def get_ltor_masks_and_position_ids(data, function print_rank_0 (line 394) | def print_rank_0(message, rank=None): function warn_rank_0 (line 404) | def warn_rank_0(message, rank=None): function is_rank0 (line 414) | def is_rank0(): function is_last_rank (line 419) | def is_last_rank(): function print_rank_last (line 425) | def print_rank_last(message): function is_hybrid_model (line 434) | def is_hybrid_model(args): function is_first_or_last_pipeline_stage (line 439) | def is_first_or_last_pipeline_stage(vp_stage): function get_device_arch_version (line 451) | def get_device_arch_version(): function append_to_progress_log (line 456) | def append_to_progress_log(string, barrier=True): function get_blend_and_blend_per_split (line 474) | def get_blend_and_blend_per_split(args): function get_batch_on_this_tp_rank (line 522) | def get_batch_on_this_tp_rank(data_iterator, mtp_on_this_rank: bool = Fa... function update_use_dist_ckpt (line 724) | def update_use_dist_ckpt(args): function to_empty_if_meta_device (line 728) | def to_empty_if_meta_device(module: torch.nn.Module, *, device: torch.de... function get_nvtx_range (line 756) | def get_nvtx_range(): FILE: megatron/training/wandb_utils.py function _get_wandb_artifact_tracker_filename (line 10) | def _get_wandb_artifact_tracker_filename(save_dir: str) -> Path: function _get_artifact_name_and_version (line 15) | def _get_artifact_name_and_version(save_dir: Path, checkpoint_path: Path... function on_save_checkpoint_success (line 19) | def on_save_checkpoint_success(checkpoint_path: str, tracker_filename: s... function on_load_checkpoint_success (line 44) | def on_load_checkpoint_success(checkpoint_path: str, load_dir: str) -> N... FILE: megatron/training/yaml_arguments.py function env_constructor (line 25) | def env_constructor(loader, node): function validate_yaml (line 41) | def validate_yaml(args, defaults={}): function _print_args (line 339) | def _print_args(title, args): function core_config_from_args (line 353) | def core_config_from_args(args, dataclass=TransformerConfig): function _check_arg_is_not_none (line 374) | def _check_arg_is_not_none(args, arg): function core_transformer_config_from_yaml (line 377) | def core_transformer_config_from_yaml(args, transfomer_key = "language_m... function load_yaml (line 416) | def load_yaml(yaml_path): FILE: model_provider.py function model_provider (line 24) | def model_provider( function count_parameters_in_layer (line 66) | def count_parameters_in_layer(model, layer_name): FILE: pretrain_bert.py function model_provider (line 29) | def model_provider(pre_process=True, post_process=True, vp_stage=None, c... function get_batch (line 72) | def get_batch(data_iterator): function loss_func (line 98) | def loss_func(loss_mask, sentence_order, output_tensor): function forward_step (line 123) | def forward_step(data_iterator, model): function train_valid_test_datasets_provider (line 144) | def train_valid_test_datasets_provider(train_val_test_num_samples, vp_st... FILE: pretrain_gpt.py function get_batch (line 65) | def get_batch(data_iterator, vp_stage: Optional[int] = None): function loss_func (line 170) | def loss_func( function forward_step (line 232) | def forward_step(data_iterator, model: GPTModel, return_schedule_plan: b... function is_dataset_built_on_rank (line 271) | def is_dataset_built_on_rank(vp_stage=None, is_packed_sequence=False): function core_gpt_dataset_config_from_args (line 284) | def core_gpt_dataset_config_from_args(args): function train_valid_test_datasets_provider (line 349) | def train_valid_test_datasets_provider(train_val_test_num_samples, vp_st... function get_embedding_ranks (line 384) | def get_embedding_ranks(pp_ranks: List[int]): FILE: pretrain_mamba.py function get_batch (line 74) | def get_batch(data_iterator, vp_stage=None): function loss_func (line 152) | def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, mode... function forward_step (line 210) | def forward_step(data_iterator, model: MambaModel): function is_dataset_built_on_rank (line 267) | def is_dataset_built_on_rank(vp_stage=None, is_packed_sequence=False): function core_gpt_dataset_config_from_args (line 276) | def core_gpt_dataset_config_from_args(args): function train_valid_test_datasets_provider (line 313) | def train_valid_test_datasets_provider(train_val_test_num_samples, vp_st... FILE: pretrain_t5.py function model_provider (line 67) | def model_provider( function get_batch (line 149) | def get_batch(data_iterator, use_local): function forward_step (line 179) | def forward_step(data_iterator, model: T5Model): function train_valid_test_datasets_provider (line 206) | def train_valid_test_datasets_provider(train_val_test_num_samples: int): function t5_embedding_ranks (line 255) | def t5_embedding_ranks(pp_ranks): function t5_position_embedding_ranks (line 269) | def t5_position_embedding_ranks(pp_ranks): FILE: pretrain_vlm.py function model_provider (line 45) | def model_provider( function train_valid_test_datasets_provider (line 213) | def train_valid_test_datasets_provider(train_val_test_num_samples): function _preprocess_data_for_llava (line 253) | def _preprocess_data_for_llava(data): function get_batch (line 289) | def get_batch(data_iterator): function forward_step (line 364) | def forward_step(data_iterator, model: LLaVAModel): function add_vlm_extra_args (line 389) | def add_vlm_extra_args(parser): function llava_embedding_ranks (line 417) | def llava_embedding_ranks(pp_ranks): function llava_position_embedding_ranks (line 431) | def llava_position_embedding_ranks(pp_ranks): FILE: scripts/check_api_backwards_compatibility.py function has_exempt_decorator (line 66) | def has_exempt_decorator(obj: Object) -> bool: function get_filtered_paths (line 87) | def get_filtered_paths(package: Object, package_name: str) -> set: function strip_ansi_codes (line 139) | def strip_ansi_codes(text): function get_object_path (line 158) | def get_object_path(change) -> str: function should_skip_change (line 224) | def should_skip_change(change, filtered_paths: set) -> bool: function main (line 275) | def main(): FILE: tasks/data_utils.py function clean_text (line 9) | def clean_text(text): function build_sample (line 20) | def build_sample(ids, types, paddings, label, unique_id): function build_tokens_types_paddings_from_text (line 35) | def build_tokens_types_paddings_from_text(text_a, text_b, function build_tokens_types_paddings_from_ids (line 49) | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq... FILE: tasks/eval_utils.py function accuracy_func_provider (line 19) | def accuracy_func_provider(single_dataset_provider): function calculate_correct_answers (line 65) | def calculate_correct_answers(name, model, dataloader, FILE: tasks/finetune_utils.py function process_batch (line 26) | def process_batch(batch): function cross_entropy_loss_func (line 40) | def cross_entropy_loss_func(labels, output_tensor): function _cross_entropy_forward_step (line 53) | def _cross_entropy_forward_step(batch, model): function build_data_loader (line 72) | def build_data_loader(dataset, micro_batch_size, num_workers, drop_last, function _build_infinite_size_dataloader (line 95) | def _build_infinite_size_dataloader(dataloader): function _build_train_valid_dataloaders (line 106) | def _build_train_valid_dataloaders(train_dataset, valid_dataset, function _train (line 145) | def _train(model, optimizer, opt_param_scheduler, forward_step, function finetune (line 239) | def finetune(train_valid_datasets_provider, model_provider, FILE: tests/functional_tests/python_test_utils/common.py class TypeOfTestResult (line 22) | class TypeOfTestResult(enum.Enum): class Test (line 27) | class Test(pydantic.BaseModel): class NotApproximateError (line 31) | class NotApproximateError(Exception): class NotDeterminsticError (line 35) | class NotDeterminsticError(Exception): class ApproximateTest (line 39) | class ApproximateTest(Test): method type_of_test_result (line 44) | def type_of_test_result(self) -> TypeOfTestResult: method error_message (line 47) | def error_message(self, metric_name: str) -> NotApproximateError: class DeterministicTest (line 51) | class DeterministicTest(Test): method rtol (line 53) | def rtol(self) -> float: method atol (line 57) | def atol(self) -> Union[int, float]: method type_of_test_result (line 61) | def type_of_test_result(self) -> TypeOfTestResult: method error_message (line 64) | def error_message(self, metric_name: str) -> NotDeterminsticError: class GoldenValueMetric (line 68) | class GoldenValueMetric(pydantic.BaseModel): method __repr__ (line 74) | def __repr__(self): class GoldenValues (line 78) | class GoldenValues(pydantic.RootModel): class MissingTensorboardLogsError (line 82) | class MissingTensorboardLogsError(Exception): class UndefinedMetricError (line 86) | class UndefinedMetricError(Exception): class SkipMetricError (line 90) | class SkipMetricError(Exception): function read_tb_logs_as_list (line 94) | def read_tb_logs_as_list( function read_golden_values_from_json (line 161) | def read_golden_values_from_json( function _filter_checks (line 172) | def _filter_checks( function pipeline (line 178) | def pipeline( FILE: tests/functional_tests/python_test_utils/compute_golden_statistics.py function find_result_json_files (line 46) | def find_result_json_files(results_dir: str, workspace_root: Optional[st... function _extract_result_path_from_log (line 92) | def _extract_result_path_from_log(out_file: Path, workspace_root: str) -... function _find_json_files_directly (line 151) | def _find_json_files_directly(results_dir: str) -> List[str]: function load_result_file (line 178) | def load_result_file(filepath: str) -> Optional[Dict[str, Any]]: function _detect_result_format (line 196) | def _detect_result_format(data: Dict[str, Any]) -> str: function _is_valid_numeric (line 221) | def _is_valid_numeric(value) -> bool: function _to_float (line 235) | def _to_float(value) -> Optional[float]: function _aggregate_training_results (line 251) | def _aggregate_training_results( function _aggregate_inference_results (line 296) | def _aggregate_inference_results( function aggregate_results (line 375) | def aggregate_results(result_files: List[str]) -> Dict[str, Dict[str, Li... function compute_statistics (line 413) | def compute_statistics(aggregated: Dict[str, Dict[str, List[float]]]) ->... function compute_recommended_tolerances (line 467) | def compute_recommended_tolerances( function format_summary (line 617) | def format_summary(stats: Dict[str, Any], tolerances: Dict[str, Dict[str... function main (line 659) | def main(): FILE: tests/functional_tests/python_test_utils/conftest.py function pytest_addoption (line 6) | def pytest_addoption(parser): function compare_approximate_results (line 32) | def compare_approximate_results(request) -> bool: function golden_values_path (line 38) | def golden_values_path(request): function golden_values (line 44) | def golden_values(request): function actual_values (line 50) | def actual_values(request): function actual_values_first_run (line 56) | def actual_values_first_run(request): function actual_values_second_run (line 64) | def actual_values_second_run(request): function scope (line 72) | def scope(request): function train_iters (line 78) | def train_iters(request): function tensorboard_logs (line 84) | def tensorboard_logs(request, train_iters): function test_values_path (line 92) | def test_values_path(request): function tensorboard_path (line 97) | def tensorboard_path(request): function model_config_path (line 103) | def model_config_path(request): FILE: tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py function collect_train_test_metrics (line 32) | def collect_train_test_metrics( FILE: tests/functional_tests/python_test_utils/test_grpo_training_loop.py function validate_with_tolerance (line 31) | def validate_with_tolerance( function test_grpo_training_loop (line 81) | def test_grpo_training_loop( FILE: tests/functional_tests/python_test_utils/test_inference_regular_pipeline.py function _median_as_float (line 24) | def _median_as_float(value): function _bytes_to_gib (line 39) | def _bytes_to_gib(num_bytes: float) -> float: function test_inference_pipeline (line 43) | def test_inference_pipeline( FILE: tests/functional_tests/python_test_utils/test_optimizer_grads_match.py function _as_iter (line 13) | def _as_iter(x: TensorLike): function _fro_norm (line 17) | def _fro_norm(x: TensorLike) -> torch.Tensor: function machine_epsilon_for_dtype (line 26) | def machine_epsilon_for_dtype(dtype: torch.dtype) -> float: function relative_grad_diff (line 41) | def relative_grad_diff(g_hat: TensorLike, g_ref: TensorLike, eps_den: fl... function expected_rel_bound (line 59) | def expected_rel_bound( function check_gradient (line 80) | def check_gradient( function _filter_optimizer_tensors (line 104) | def _filter_optimizer_tensors(plain_tensors: Dict[str, torch.Tensor]) ->... function assert_grads_close (line 111) | def assert_grads_close(left: torch.Tensor, right: torch.Tensor): function unshard_row_parallel_state (line 144) | def unshard_row_parallel_state(saved_state, out_features, in_features, tp): function _assert_optimizer_tensors_equal (line 154) | def _assert_optimizer_tensors_equal( function load_dist_checkpoint_pt (line 252) | def load_dist_checkpoint_pt( function test_optimizer_states_match (line 291) | def test_optimizer_states_match(checkpoint_dirs): function main (line 327) | def main(): FILE: tests/functional_tests/python_test_utils/test_pretraining_regular_pipeline.py function test_regular_pipeline (line 25) | def test_regular_pipeline( FILE: tests/functional_tests/python_test_utils/test_pretraining_resume_checkpoint_pipeline.py function test_resume_checkpoint_pipeline (line 14) | def test_resume_checkpoint_pipeline( FILE: tests/functional_tests/test_cases/common/ckpt_converter/__main__.py function is_model_parallel_rank_0 (line 39) | def is_model_parallel_rank_0(): function broadcast (line 46) | def broadcast(item): class TempSharedDir (line 56) | class TempSharedDir: method __enter__ (line 59) | def __enter__(self): method __exit__ (line 67) | def __exit__(self, exc_type, exc_value, exc_tb): class ModelParallelState (line 78) | class ModelParallelState(_ModelParallelState): method __new__ (line 81) | def __new__(cls, tp=1, pp=1, ep=1): method __str__ (line 84) | def __str__(self): class ModelMeta (line 88) | class ModelMeta: method __init__ (line 97) | def __init__(self, format: str, mp: ModelParallelState, transformer_im... method __str__ (line 112) | def __str__(self): class Pipeline (line 116) | class Pipeline: method __init__ (line 131) | def __init__(self, src: ModelMeta, dst: ModelMeta): method __str__ (line 138) | def __str__(self): method get_model_argv (line 141) | def get_model_argv(self): method get_converter_model_type (line 145) | def get_converter_model_type(self): method get_meta (line 149) | def get_meta(self, key): method init_args_and_model (line 154) | def init_args_and_model(self, key): method build_model (line 224) | def build_model(): method get_input_ids (line 234) | def get_input_ids(): method get_batch (line 256) | def get_batch(input_ids): method forward_step (line 316) | def forward_step(cls, orig_input_ids: T.Iterator, model: torch.nn.Modu... method forward_model (line 346) | def forward_model(cls, models, orig_input_ids): method rand_init_model_params (line 376) | def rand_init_model_params(self, key, models): method save_checkpoint (line 399) | def save_checkpoint(self): method load_checkpoint (line 422) | def load_checkpoint(self, orig_input_ids): method convert_checkpoint (line 444) | def convert_checkpoint(self): method run (line 488) | def run(self): class GPTPipeline (line 544) | class GPTPipeline(Pipeline): method __init__ (line 553) | def __init__(self, src: ModelMeta, dst: ModelMeta, num_moe_experts: T.... method __str__ (line 558) | def __str__(self): method get_model_argv (line 564) | def get_model_argv(self): method get_converter_model_type (line 590) | def get_converter_model_type(self): class LLaVAPipeline (line 594) | class LLaVAPipeline(Pipeline): method __init__ (line 595) | def __init__( method __str__ (line 603) | def __str__(self): method get_model_argv (line 610) | def get_model_argv(self): method get_test_image (line 650) | def get_test_image(): method get_input_ids (line 656) | def get_input_ids(): method forward_step (line 681) | def forward_step(cls, orig_input_ids: T.Iterator, model: torch.nn.Modu... method forward_model (line 711) | def forward_model(cls, models, orig_input_ids): method build_model (line 743) | def build_model(): method get_converter_model_type (line 753) | def get_converter_model_type(self): method init_args_and_model (line 756) | def init_args_and_model(self, key): function get_gpt_pipelines (line 828) | def get_gpt_pipelines(): function get_moe_pipelines (line 843) | def get_moe_pipelines(): function get_llava_pipelines (line 851) | def get_llava_pipelines(): function test_all_pipelines (line 874) | def test_all_pipelines(): FILE: tests/functional_tests/test_cases/common/moe_perf/__main__.py function _build_transformer_config (line 43) | def _build_transformer_config(case: MoEPerformanceCase) -> TransformerCo... function _resolve_moe_submodules (line 89) | def _resolve_moe_submodules(case: MoEPerformanceCase): function _load_baselines (line 95) | def _load_baselines() -> Dict[str, Dict[str, float]]: function _persist_baselines (line 102) | def _persist_baselines(data: Dict[str, Dict[str, float]]) -> None: function _serialize_metrics (line 109) | def _serialize_metrics(metrics: Dict[str, float]) -> Dict[str, float]: function _assert_within_baseline (line 120) | def _assert_within_baseline( function _benchmark_moe_layer (line 179) | def _benchmark_moe_layer(layer: MoELayer, case: MoEPerformanceCase): function _maybe_update_baseline (line 271) | def _maybe_update_baseline( function _prepare_moe_layer (line 292) | def _prepare_moe_layer(case: MoEPerformanceCase) -> MoELayer: function _check_env (line 301) | def _check_env(): function _check_dependencies (line 309) | def _check_dependencies(case: MoEPerformanceCase): function test_moe_layer_performance (line 325) | def test_moe_layer_performance(perf_case: MoEPerformanceCase, debug_mode... FILE: tests/functional_tests/test_cases/common/moe_perf/test_cases.py class MoEModelConfig (line 9) | class MoEModelConfig: class MoEPerformanceCase (line 29) | class MoEPerformanceCase: method input_dtype (line 62) | def input_dtype(self) -> torch.dtype: method is_current_platform (line 65) | def is_current_platform(self) -> bool: