SYMBOL INDEX (5871 symbols across 514 files)

FILE: .github/scripts/oncall_manager.py
  function get_headers (line 37) | def get_headers():
  function get_repo_info (line 52) | def get_repo_info():
  function get_team_members (line 61) | def get_team_members(org, team_slug):
  function get_user_email (line 85) | def get_user_email(username):
  function get_slack_client (line 150) | def get_slack_client():
  function get_slack_user_id (line 158) | def get_slack_user_id(slack_client, email):
  function get_slack_usergroup_id (line 176) | def get_slack_usergroup_id(slack_client, handle):
  function update_slack_usergroup (line 192) | def update_slack_usergroup(new_oncall_username, old_members_usernames):
  function load_schedule (line 238) | def load_schedule():
  function save_schedule (line 255) | def save_schedule(schedule):
  function update_active_oncall_team (line 260) | def update_active_oncall_team(org, new_oncall):
  function rotate_schedule (line 289) | def rotate_schedule(repo_owner, dry_run=False):
  function get_last_wednesday (line 339) | def get_last_wednesday():
  function ensure_schedule_filled (line 345) | def ensure_schedule_filled(schedule, repo_owner):
  function assign_reviewer (line 393) | def assign_reviewer(pr_number):
  function main (line 408) | def main():

FILE: .github/scripts/sync_team_usergroups.py
  function get_headers (line 45) | def get_headers():
  function get_org (line 61) | def get_org():
  function github_team_to_slack_usergroup (line 67) | def github_team_to_slack_usergroup(team_slug):
  function get_child_teams (line 100) | def get_child_teams(org, parent_team_slug):
  function get_team_members (line 139) | def get_team_members(org, team_slug):
  function get_user_email (line 167) | def get_user_email(username):
  function get_slack_client (line 233) | def get_slack_client():
  function get_slack_user_id (line 242) | def get_slack_user_id(slack_client, email):
  function fetch_all_usergroups (line 261) | def fetch_all_usergroups(slack_client):
  function get_slack_usergroup_id (line 291) | def get_slack_usergroup_id(slack_client, handle):
  function github_team_to_usergroup_name (line 301) | def github_team_to_usergroup_name(team_slug):
  function create_slack_usergroup (line 312) | def create_slack_usergroup(slack_client, handle, team_slug):
  function sync_team_to_usergroup (line 356) | def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False):
  function get_team_to_usergroup_mapping (line 447) | def get_team_to_usergroup_mapping(parent_team_slug):
  function sync_all_teams (line 464) | def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None):
  function main (line 520) | def main():

FILE: .gitlab/scripts/check_imports.py
  class ImportChecker (line 31) | class ImportChecker:
    method __init__ (line 34) | def __init__(self, package_name: str = "megatron.core", verbose: bool ...
    method should_skip_module (line 59) | def should_skip_module(self, module_name: str) -> bool:
    method discover_modules (line 66) | def discover_modules(self, package_path: str) -> List[str]:
    method import_module (line 102) | def import_module(self, module_name: str) -> Tuple[str, str]:
    method check_all_imports (line 123) | def check_all_imports(self):
  function main (line 200) | def main(package_name: str):

FILE: docs/add_copyright_header.py
  function main (line 15) | def main():

FILE: docs/autodoc2_docstrings_parser.py
  class NapoleonParser (line 20) | class NapoleonParser(MystParser):
    method parse (line 23) | def parse(self, input_string: str, document: nodes.document) -> None:

FILE: examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py
  function get_corpus_scores (line 24) | def get_corpus_scores(lines):
  function main (line 37) | def main():

FILE: examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py
  class PerspectiveApiScorer (line 24) | class PerspectiveApiScorer:
    method __init__ (line 31) | def __init__(self):
    method get_scores (line 44) | def get_scores(self, input_text: str, requested_attributes: Optional[L...
  function test (line 73) | def test():
  function split_lines (line 79) | def split_lines(lines, split):
  function get_score (line 88) | def get_score(line):
  function get_scores (line 118) | def get_scores(lines):
  function get_annotated_datasets (line 150) | def get_annotated_datasets(lines, threads=10):
  function main (line 160) | def main():

FILE: examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py
  function model_provider (line 28) | def model_provider(pre_process=True, post_process=True):
  function get_batch (line 41) | def get_batch(data_iterator):
  function loss_func (line 72) | def loss_func(loss_mask, output_tensor):
  function forward_step (line 83) | def forward_step(data_iterator, model):
  function train_valid_test_datasets_provider (line 100) | def train_valid_test_datasets_provider(train_val_test_num_samples):
  function add_validation_args (line 141) | def add_validation_args(parser):

FILE: examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py
  function model_provider (line 29) | def model_provider(pre_process=True, post_process=True) -> Union[GPTMode...
  function add_text_generate_args (line 93) | def add_text_generate_args(parser):
  function generate_samples_unconditional (line 119) | def generate_samples_unconditional(model):
  function generate_samples_conditional (line 156) | def generate_samples_conditional(model):
  function generate_and_write_samples_unconditional (line 209) | def generate_and_write_samples_unconditional(model):
  function generate_and_write_samples_conditional (line 218) | def generate_and_write_samples_conditional(model):
  function main (line 232) | def main():

FILE: examples/academic_paper_scripts/detxoify_lm/perspective_api.py
  class PerspectiveApiScorer (line 24) | class PerspectiveApiScorer:
    method __init__ (line 31) | def __init__(self):
    method get_scores (line 44) | def get_scores(self, input_text: str, requested_attributes: Optional[L...
  function test (line 73) | def test():
  function get_score (line 80) | def get_score(x):
  function main (line 92) | def main():

FILE: examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
  function initialize_distributed (line 17) | def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_...
  function model_provider (line 29) | def model_provider():
  function load_distributed_checkpoint (line 49) | def load_distributed_checkpoint(checkpoint_path, gpt_model):

FILE: examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
  function initialize_distributed (line 18) | def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_...
  function model_provider (line 30) | def model_provider():
  function load_distributed_checkpoint (line 50) | def load_distributed_checkpoint(checkpoint_path, gpt_model):

FILE: examples/gptoss/01_convert_from_hf.py
  function _parse_args (line 10) | def _parse_args():

FILE: examples/gptoss/03_convert_to_hf.py
  function _parse_args (line 10) | def _parse_args():

FILE: examples/inference/gpt/gpt_dynamic_inference.py
  function run_inference (line 58) | def run_inference(
  function main (line 279) | def main():

FILE: examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
  function suspend_resume_cycle (line 33) | async def suspend_resume_cycle(client, engine, args, futures):
  function main (line 49) | async def main(

FILE: examples/inference/gpt/gpt_static_inference.py
  function add_static_inference_args (line 37) | def add_static_inference_args(parser):
  function get_inference_engine (line 55) | def get_inference_engine(args: Namespace, model: MegatronModule) -> Stat...
  function generate (line 84) | async def generate(
  function main (line 119) | def main():

FILE: examples/inference/gpt/utils.py
  function get_default_sampling_params (line 23) | def get_default_sampling_params(termination_id: int = None):
  function get_curr_time (line 34) | def get_curr_time() -> float:
  class Request (line 42) | class Request:
    method __init__ (line 57) | def __init__(
    method __str__ (line 81) | def __str__(self) -> str:
  function get_time_offsets (line 91) | def get_time_offsets(
  function get_cli_requests (line 136) | def get_cli_requests(
  function get_synthetic_requests (line 153) | def get_synthetic_requests(
  function get_requests_from_file (line 188) | def get_requests_from_file(
  function build_requests (line 230) | def build_requests(
  function get_model_size_str (line 244) | def get_model_size_str(model):
  function build_dynamic_engine_setup_prefix (line 253) | def build_dynamic_engine_setup_prefix(
  function get_global_peak_memory_stats_bytes (line 315) | def get_global_peak_memory_stats_bytes() -> dict:

FILE: examples/inference/t5/simple_t5_batch_inference.py
  function add_text_generate_args (line 38) | def add_text_generate_args(parser):
  function get_inference_engine (line 70) | def get_inference_engine(args: Namespace, model: MegatronModule) -> Abst...
  function main (line 102) | def main():

FILE: examples/mimo/avlm_inference.py
  function init_distributed (line 23) | def init_distributed(tp_size: int = 1, pp_size: int = 1):
  function get_input_data (line 32) | def get_input_data(
  function main (line 129) | def main():
  function load_distributed_checkpoint (line 213) | def load_distributed_checkpoint(model: torch.nn.Module, ckpt_dir: str):

FILE: examples/mimo/configs/llava_avlm.py
  function get_llava_projection_config (line 25) | def get_llava_projection_config(
  function get_vicuna_language_layer_spec (line 46) | def get_vicuna_language_layer_spec() -> ModuleSpec:
  function get_llava_projection_layer_spec (line 50) | def get_llava_projection_layer_spec() -> ModuleSpec:

FILE: examples/mimo/configs/llava_vlm.py
  function get_vicuna_language_model_config (line 24) | def get_vicuna_language_model_config(
  function get_llava_projection_config (line 82) | def get_llava_projection_config(
  function get_vicuna_language_layer_spec (line 103) | def get_vicuna_language_layer_spec() -> ModuleSpec:
  function get_llava_projection_layer_spec (line 107) | def get_llava_projection_layer_spec() -> ModuleSpec:

FILE: examples/mimo/configs/mock.py
  function get_mock_language_model_config (line 28) | def get_mock_language_model_config(config: Optional[TransformerConfig] =...
  function get_mock_vision_model_config (line 47) | def get_mock_vision_model_config(config: Optional[TransformerConfig] = N...
  function get_mock_projection_config (line 76) | def get_mock_projection_config(hidden_size: int = 128) -> TransformerCon...
  function get_mock_language_layer_spec (line 97) | def get_mock_language_layer_spec():
  function get_mock_vision_layer_spec (line 107) | def get_mock_vision_layer_spec():
  function get_mock_projection_layer_spec (line 120) | def get_mock_projection_layer_spec():

FILE: examples/mimo/data/avlm_sample_loader.py
  function sample_loader (line 3) | def sample_loader(raw: dict) -> dict:
  function part_filter (line 85) | def part_filter(part: str) -> bool:

FILE: examples/mimo/data/energon_avlm_task_encoder.py
  class ConversationTemplateConfig (line 50) | class ConversationTemplateConfig:
  class LlavaConversationTemplateConfig (line 56) | class LlavaConversationTemplateConfig(ConversationTemplateConfig):
  class VisionAudioQASample (line 64) | class VisionAudioQASample(VQASample):
  class AVLMModelType (line 74) | class AVLMModelType(Enum):
  class AVLMTaskEncoder (line 78) | class AVLMTaskEncoder(
    method __init__ (line 86) | def __init__(
    method apply_prompt_template (line 100) | def apply_prompt_template(self, input_text: VisionAudioQASample):
    method _find_pattern_indices (line 157) | def _find_pattern_indices(
    method encode_sample (line 169) | def encode_sample(self, sample: VisionAudioQASample):
    method batch (line 289) | def batch(self, samples: List[Dict]) -> Dict:
    method encode_batch_avlm_clip_whisper_llava (line 314) | def encode_batch_avlm_clip_whisper_llava(self, batch_data: Dict) -> Dict:
    method encode_batch (line 351) | def encode_batch(self, batch_data: Dict) -> dict:
  function llava_avlm_dataloader_provider (line 358) | def llava_avlm_dataloader_provider(train_val_test_num_samples):
  class KeyProcessor (line 445) | class KeyProcessor(Protocol):
    method __call__ (line 448) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor:  # pra...
  class StackProcessor (line 452) | class StackProcessor:
    method __init__ (line 455) | def __init__(self, dim: int = 0):
    method __call__ (line 458) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor:
  class PaddingProcessor (line 462) | class PaddingProcessor:
    method __init__ (line 465) | def __init__(self, pad_value: int, batch_first: bool = True):
    method __call__ (line 469) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor:

FILE: examples/mimo/data/energon_vlm_task_encoder.py
  class ConversationTemplateConfig (line 44) | class ConversationTemplateConfig:
  class LlavaConversationTemplateConfig (line 51) | class LlavaConversationTemplateConfig(ConversationTemplateConfig):
  class ModelType (line 57) | class ModelType(Enum):
  function predict_seq_len_with_padding (line 61) | def predict_seq_len_with_padding(instance_tokens: torch.Tensor, pad_to_m...
  function group_samples (line 73) | def group_samples(samples: List[Dict[str, torch.Tensor]],
  class VLMTaskEncoder (line 110) | class VLMTaskEncoder(
    method __init__ (line 118) | def __init__(
    method apply_prompt_template (line 145) | def apply_prompt_template(self, input_text: VQASample):
    method _find_pattern_indices (line 202) | def _find_pattern_indices(
    method select_samples_to_pack (line 213) | def select_samples_to_pack(self, samples: List[Dict[str, torch.Tensor]...
    method pack_selected_samples (line 237) | def pack_selected_samples(self, samples: List[Dict[str, torch.Tensor]]...
    method encode_sample (line 335) | def encode_sample(self, sample: VQASample):
    method batch (line 385) | def batch(self, samples: List[Dict]) -> Dict:
    method encode_batch_vlm_clip_llava (line 437) | def encode_batch_vlm_clip_llava(self, batch_data: Dict) -> Dict:
    method encode_batch_vlm_clip_llava_video (line 469) | def encode_batch_vlm_clip_llava_video(self, batch_data: Dict) -> Dict:
    method encode_batch (line 494) | def encode_batch(self, batch_data: Dict) -> dict:
  function llava_vlm_dataloader_provider (line 502) | def llava_vlm_dataloader_provider(train_val_test_num_samples, max_seq_le...
  class KeyProcessor (line 573) | class KeyProcessor(Protocol):
    method __call__ (line 576) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...
  class StackProcessor (line 580) | class StackProcessor:
    method __init__ (line 583) | def __init__(self, dim: int = 0):
    method __call__ (line 586) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...
  class PaddingProcessor (line 594) | class PaddingProcessor:
    method __init__ (line 597) | def __init__(self, pad_value: int, batch_first: bool = True):
    method _pad_and_stack (line 601) | def _pad_and_stack(self, tensors: List[torch.Tensor], max_len: int, pa...
    method __call__ (line 616) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...
  class PackingKwargsProcessor (line 624) | class PackingKwargsProcessor:
    method __call__ (line 627) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...
  class GenericStackProcessor (line 633) | class GenericStackProcessor:
    method __init__ (line 635) | def __init__(self, dim: int = 0):
    method __call__ (line 638) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...

FILE: examples/mimo/data/mock.py
  function create_mock_image (line 15) | def create_mock_image(image_size: int = 336) -> torch.Tensor:
  function create_mock_caption (line 28) | def create_mock_caption() -> str:
  class MockVLMDataset (line 38) | class MockVLMDataset(Dataset):
    method __init__ (line 41) | def __init__(
    method __len__ (line 81) | def __len__(self) -> int:
    method __getitem__ (line 85) | def __getitem__(self, idx: int) -> Dict:
    method _mock_tokenize (line 134) | def _mock_tokenize(self) -> torch.Tensor:
  function get_mock_vlm_dataloader (line 165) | def get_mock_vlm_dataloader(
  function _collate_fn (line 211) | def _collate_fn(batch: List[Dict]) -> Dict[str, torch.Tensor]:
  function train_valid_test_datasets_provider (line 240) | def train_valid_test_datasets_provider(train_val_test_num_samples):

FILE: examples/mimo/data/prepare_video_llava_data.py
  function _extract_archives (line 11) | def _extract_archives(root: str):
  function convert_llava_video_to_wds (line 23) | def convert_llava_video_to_wds(dataset_root: str, shard_size: int = 8000):

FILE: examples/mimo/data/utils/calculate_audio_tokens.py
  function calculate_num_mel_frames (line 18) | def calculate_num_mel_frames(audio_length, sample_rate, window_stride, w...
  function calculate_num_audio_tokens (line 44) | def calculate_num_audio_tokens(audio_tensor, model_name):

FILE: examples/mimo/model_providers/hf_clip_encoder.py
  class HFCLIPEncoderWrapper (line 10) | class HFCLIPEncoderWrapper(torch.nn.Module):
    method __init__ (line 13) | def __init__(self, feature_layer_index=-2, is_video_input: bool = False):
    method forward (line 30) | def forward(self, pixel_values: torch.Tensor):

FILE: examples/mimo/model_providers/hf_whisper_encoder.py
  class HFWhisperEncoderWrapper (line 6) | class HFWhisperEncoderWrapper(torch.nn.Module):
    method __init__ (line 9) | def __init__(self, model_name: str):
    method forward (line 13) | def forward(self, input_features, seq_lengths=None):

FILE: examples/mimo/model_providers/llava_avlm.py
  function model_provider_llava_avlm (line 31) | def model_provider_llava_avlm(

FILE: examples/mimo/model_providers/llava_vlm.py
  function model_provider_llava_vlm (line 29) | def model_provider_llava_vlm(

FILE: examples/mimo/model_providers/mock.py
  function model_provider_mock_vlm_single_encoder (line 28) | def model_provider_mock_vlm_single_encoder(

FILE: examples/mimo/train.py
  function add_mimo_args (line 52) | def add_mimo_args(parser):
  function get_batch (line 86) | def get_batch(data_iterator: Iterator[Dict[str, Any]]):
  function loss_func (line 139) | def loss_func(loss_mask, output_tensor):
  function forward_step (line 176) | def forward_step(data_iterator, model):
  function train_valid_test_datasets_provider (line 193) | def train_valid_test_datasets_provider(*provider_args, **provider_kwargs):
  function model_provider (line 219) | def model_provider(

FILE: examples/mimo/utils/data_helpers.py
  function flatten (line 15) | def flatten(
  function regroup (line 32) | def regroup(flat: List[Tuple[Tuple[str, ...], torch.Tensor]]) -> Dict[st...
  function broadcast_nested_data_batch (line 43) | def broadcast_nested_data_batch(nested_dict: Dict[str, Any]) -> Dict[str...

FILE: examples/mimo/utils/logging.py
  function print_mimo_structure (line 9) | def print_mimo_structure(model):

FILE: examples/mimo/utils/model_helpers.py
  function load_submodule_ckpt (line 10) | def load_submodule_ckpt(module: torch.nn.Module, ckpt_dir: str):

FILE: examples/multimodal/combine_state_dicts.py
  function combine (line 15) | def combine(input_files, module_prefixes, output_files):

FILE: examples/multimodal/config.py
  function get_language_model_config (line 9) | def get_language_model_config(config):
  function get_vision_model_config (line 179) | def get_vision_model_config(config, apply_query_key_layer_scaling):
  function get_vision_projection_config (line 334) | def get_vision_projection_config(config, hidden_size):
  class EvaluationConfig (line 393) | class EvaluationConfig:

FILE: examples/multimodal/dataloader_provider.py
  function datasets_provider (line 27) | def datasets_provider(task_encoder,worker_config=None):
  function is_first_or_last_stage (line 71) | def is_first_or_last_stage(pp_size):
  function is_dataloader_rank (line 84) | def is_dataloader_rank():
  function train_valid_test_dataloaders_provider (line 95) | def train_valid_test_dataloaders_provider(train_val_test_num_samples, ta...
  class EnergonDataloader (line 152) | class EnergonDataloader:
    method __init__ (line 154) | def __init__(self, dataloader):
    method __next__ (line 158) | def __next__(self):
    method __iter__ (line 161) | def __iter__(self):
    method save_state (line 164) | def save_state(self):
  function cyclic_iter (line 168) | def cyclic_iter(iter):

FILE: examples/multimodal/dataset_helpers.py
  class ImageTaskSample (line 36) | class ImageTaskSample(Sample):
  class ImageTaskSamplePacked (line 50) | class ImageTaskSamplePacked(Sample):
  class ImageTaskBatchPacked (line 72) | class ImageTaskBatchPacked(Batch):
  function search_for_fit (line 95) | def search_for_fit(numbers: List[int], capacity: int) -> int:
  function greedy_knapsack (line 103) | def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: ...
  class TaskEncoder (line 145) | class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatc...
    method __init__ (line 148) | def __init__(
    method _get_total_seq_length (line 195) | def _get_total_seq_length(self, input_ids, num_tiles):
    method _truncate_for_packing (line 202) | def _truncate_for_packing(self, input_ids, target, num_tiles):
    method encode_sample (line 219) | def encode_sample(self, sample: Union[CaptioningSample, OCRSample, VQA...
    method encode_captioning (line 247) | def encode_captioning(self, sample: CaptioningSample):
    method encode_llava_pretrain (line 293) | def encode_llava_pretrain(self, sample: VQASample):
    method encode_sample_list (line 327) | def encode_sample_list(self, samples: SampleListSample):
    method encode_llava_sft (line 347) | def encode_llava_sft(self, sample: Union[SimilarityInterleavedSample, ...
    method target_has_trainable_tokens (line 533) | def target_has_trainable_tokens(self, input_ids, num_tiles, target):
    method replace_value_with_repetition (line 552) | def replace_value_with_repetition(self, arr, token_to_replace, num_rep...
    method encode_any_single_turn_vqa (line 581) | def encode_any_single_turn_vqa(self, sample):
    method combined_ocr_encoder (line 663) | def combined_ocr_encoder(self, sample, task_type):
    method encode_pdf_prompt (line 703) | def encode_pdf_prompt(self, sample: OCRSample) -> ImageTaskSample:
    method encode_ocr_ref_prompt (line 724) | def encode_ocr_ref_prompt(self, sample: OCRSample) -> ImageTaskSample:
    method bbox_coord_to_label (line 758) | def bbox_coord_to_label(self, text, bbox):
    method encode_ocr_prompt (line 772) | def encode_ocr_prompt(self, sample: OCRSample) -> ImageTaskSample:
    method batch (line 791) | def batch(self, samples: List[Union[ImageTaskSample, ImageTaskSamplePa...
    method encode_batch (line 864) | def encode_batch(self, batch: ImageTaskBatchPacked) -> dict:
    method select_samples_to_pack (line 869) | def select_samples_to_pack(self, samples: List[ImageTaskSample]) -> Li...
    method pack_selected_samples (line 882) | def pack_selected_samples(self, samples: List[ImageTaskSample]) -> Lis...
  function print_error_handler (line 945) | def print_error_handler(exc: Exception, key: Optional[str]):
  function format_multichoice_question (line 953) | def format_multichoice_question(question, multichoice_options):
  function format_multichoice_answer (line 964) | def format_multichoice_answer(idx):

FILE: examples/multimodal/energon_util.py
  class SampleListSample (line 10) | class SampleListSample(Sample):
  class OfflineTargetAspectRatioSample (line 21) | class OfflineTargetAspectRatioSample(Sample):

FILE: examples/multimodal/evaluation/evaluate_ai2d.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function ai2d_eval (line 38) | def ai2d_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_chartqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function chartqa_eval (line 35) | def chartqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_coco.py
  function convert_to_coco_format (line 10) | def convert_to_coco_format(input_path):
  function coco_captioning_eval (line 40) | def coco_captioning_eval(input_path, groundtruth_file):

FILE: examples/multimodal/evaluation/evaluate_infovqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function infovqa_eval (line 35) | def infovqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_mathvista.py
  function merge_input_files (line 10) | def merge_input_files(input_path):
  function extra_processing (line 36) | def extra_processing(text):
  function extract_answer (line 60) | def extract_answer(text):
  function compute_mathvista_accuracy (line 74) | def compute_mathvista_accuracy(result_file):
  function mathvista_eval (line 108) | def mathvista_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_mmmu.py
  function get_input_output_paths (line 21) | def get_input_output_paths(input_path, task):
  function extract_answer (line 38) | def extract_answer(text):
  function convert_to_mmmu_format (line 55) | def convert_to_mmmu_format(input_path):
  function mmmu_eval (line 87) | def mmmu_eval(input_path, groundtruth_path):
  function main (line 113) | def main():

FILE: examples/multimodal/evaluation/evaluate_ocrbench.py
  function merge_input_files (line 7) | def merge_input_files(input_path):
  function compute_ocrbench_score (line 33) | def compute_ocrbench_score(result_file):
  function ocrbench_eval (line 123) | def ocrbench_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_ocrbench_v2.py
  function convert_to_ocrbench_v2_format (line 10) | def convert_to_ocrbench_v2_format(input_path, groundtruth_path):
  function ocrbench_v2_eval (line 37) | def ocrbench_v2_eval(input_path, groundtruth_path, output_path):
  function main (line 71) | def main():

FILE: examples/multimodal/evaluation/evaluate_rd_tablebench.py
  function convert_to_rdtablebench_format (line 22) | def convert_to_rdtablebench_format(input_path):
  function rdtablebench_eval (line 42) | def rdtablebench_eval(input_path):
  function main (line 67) | def main():

FILE: examples/multimodal/evaluation/evaluate_realworldqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function realworldqa_eval (line 32) | def realworldqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_spdocvqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function spdocvqa_eval (line 35) | def spdocvqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_textvqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function textvqa_eval (line 38) | def textvqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_video_motionbench.py
  function merge_input_files (line 9) | def merge_input_files(input_path):
  function motionbench_eval (line 33) | def motionbench_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_video_mvbench.py
  function merge_input_files (line 7) | def merge_input_files(input_path):
  function check_ans (line 36) | def check_ans(pred, gt):
  function create_result_dict (line 53) | def create_result_dict(result_list):
  function combine_all_res (line 83) | def combine_all_res(acc_dict):
  function mvbench_eval (line 98) | def mvbench_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_video_phys_game_bench.py
  function merge_input_files (line 7) | def merge_input_files(input_path):
  function check_ans (line 35) | def check_ans(pred, gt):
  function compute_all_acc (line 52) | def compute_all_acc(result_list):
  function phys_game_bench_eval (line 83) | def phys_game_bench_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_vqav2.py
  function levenshtein_distance (line 11) | def levenshtein_distance(s1: str, s2: str) -> int:
  function normalized_levenshtein_distance (line 29) | def normalized_levenshtein_distance(s1: str, s2: str) -> float:
  function similarity_function (line 34) | def similarity_function(prediction: str, gold_label: str, threshold: flo...
  function anls_score (line 38) | def anls_score(
  function merge_input_files (line 58) | def merge_input_files(input_path):
  function is_number (line 85) | def is_number(n: str):
  function compute_vqa_accuracy (line 94) | def compute_vqa_accuracy(result_file, task):
  function vqav2_eval (line 148) | def vqav2_eval(input_path):

FILE: examples/multimodal/evaluation/evaluation_datasets.py
  function _get_partition_bounds (line 17) | def _get_partition_bounds(
  class VQADataset (line 28) | class VQADataset(torch.utils.data.Dataset):
    method __init__ (line 31) | def __init__(
    method __len__ (line 69) | def __len__(self):
    method __getitem__ (line 72) | def __getitem__(self, idx):
  class CaptioningDataset (line 110) | class CaptioningDataset(torch.utils.data.Dataset):
    method __init__ (line 113) | def __init__(
    method __len__ (line 150) | def __len__(self):
    method __getitem__ (line 153) | def __getitem__(self, idx):
  class MMMUDataset (line 179) | class MMMUDataset(torch.utils.data.Dataset):
    method __init__ (line 182) | def __init__(
    method __len__ (line 255) | def __len__(self):
    method process_image_tag (line 258) | def process_image_tag(self, q):
    method __getitem__ (line 307) | def __getitem__(self, idx):
  class VideoMMEDataset (line 452) | class VideoMMEDataset(torch.utils.data.Dataset):
    method __init__ (line 455) | def __init__(
    method __len__ (line 500) | def __len__(self):
    method __getitem__ (line 503) | def __getitem__(self, idx):
  class OCRBenchDataset (line 553) | class OCRBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 556) | def __init__(
    method __len__ (line 587) | def __len__(self):
    method __getitem__ (line 590) | def __getitem__(self, idx):
  class MathVistaDataset (line 621) | class MathVistaDataset(torch.utils.data.Dataset):
    method __init__ (line 624) | def __init__(
    method __len__ (line 665) | def __len__(self):
    method __getitem__ (line 668) | def __getitem__(self, idx):
  class AI2DDataset (line 719) | class AI2DDataset(torch.utils.data.Dataset):
    method __init__ (line 722) | def __init__(
    method __len__ (line 756) | def __len__(self):
    method __getitem__ (line 759) | def __getitem__(self, idx):
  class RDTableBenchDataset (line 787) | class RDTableBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 788) | def __init__(
    method __len__ (line 828) | def __len__(self):
    method __getitem__ (line 831) | def __getitem__(self, idx):
  class RealworldQADataset (line 865) | class RealworldQADataset(torch.utils.data.Dataset):
    method __init__ (line 866) | def __init__(
    method __len__ (line 899) | def __len__(self):
    method __getitem__ (line 902) | def __getitem__(self, idx):
  class MotionBenchDataset (line 952) | class MotionBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 953) | def __init__(
    method __len__ (line 1007) | def __len__(self):
    method __getitem__ (line 1010) | def __getitem__(self, idx):
  class PhysGameBenchDataset (line 1057) | class PhysGameBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 1058) | def __init__(
    method __len__ (line 1104) | def __len__(self):
    method _qa_template (line 1107) | def _qa_template(self, data):
    method __getitem__ (line 1116) | def __getitem__(self, idx):
  class MVBenchDataset (line 1167) | class MVBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 1168) | def __init__(
    method __len__ (line 1245) | def __len__(self):
    method get_index (line 1248) | def get_index(self, bound, fps, max_frame, first_idx=0):
    method qa_template (line 1262) | def qa_template(self, data):
    method read_frame (line 1276) | def read_frame(self, video_path, bound=None, fps=2):
    method read_video_ours (line 1285) | def read_video_ours(self, video_path, bound=None):
    method __getitem__ (line 1299) | def __getitem__(self, idx):
  class ExampleInferenceDataset (line 1342) | class ExampleInferenceDataset(torch.utils.data.Dataset):
    method __init__ (line 1343) | def __init__(
    method __len__ (line 1372) | def __len__(self):
    method __getitem__ (line 1375) | def __getitem__(self, idx):
  function get_evaluation_dataset (line 1408) | def get_evaluation_dataset(

FILE: examples/multimodal/evaluation/mmmu_utils.py
  function load_yaml (line 58) | def load_yaml(file_path):
  function parse_img_path (line 68) | def parse_img_path(text):
  function process_single_sample (line 73) | def process_single_sample(data):
  function construct_prompt (line 98) | def construct_prompt(sample, config):
  function parse_multi_choice_response (line 151) | def parse_multi_choice_response(response, all_choices, index2ans):
  function check_is_number (line 206) | def check_is_number(string):
  function normalize_str (line 218) | def normalize_str(string):
  function extract_numbers (line 243) | def extract_numbers(string):
  function parse_open_response (line 266) | def parse_open_response(response):
  function eval_multi_choice (line 321) | def eval_multi_choice(gold_i, pred_i):
  function eval_open (line 338) | def eval_open(gold_i, pred_i):
  function evaluate (line 367) | def evaluate(samples):
  function calculate_ins_level_acc (line 393) | def calculate_ins_level_acc(results: Dict):
  function mmmu_main_eval (line 405) | def mmmu_main_eval(output_dict, task_cfg):

FILE: examples/multimodal/image_processing.py
  function find_closest_aspect_ratio (line 31) | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height...
  function find_closest_area_weighted_aspect_ratio (line 47) | def find_closest_area_weighted_aspect_ratio(aspect_ratio, target_ratios,...
  class ImageTransform (line 65) | class ImageTransform:
    method __init__ (line 68) | def __init__(self, input_size, vision_model_type):
    method __call__ (line 72) | def __call__(self, img, img_h, img_w, use_tiling=False, max_num_tiles=...
  function dynamic_preprocess (line 88) | def dynamic_preprocess(
  function _build_transform (line 131) | def _build_transform(input_size, vision_model_type):

FILE: examples/multimodal/layer_scaling.py
  function _bias_dropout_add_func_layer_scaling (line 10) | def _bias_dropout_add_func_layer_scaling(ls, x_with_bias, residual, prob...
  function bias_dropout_add_unfused_layer_scaling (line 24) | def bias_dropout_add_unfused_layer_scaling(ls, training):
  function get_bias_dropout_add_layer_scaling (line 33) | def get_bias_dropout_add_layer_scaling(ls, training, fused):
  class LayerScalingTransformerLayer (line 40) | class LayerScalingTransformerLayer(TransformerLayer):
    method __init__ (line 42) | def __init__(self, *args, **kwargs):

FILE: examples/multimodal/layer_specs.py
  function get_layer_spec (line 54) | def get_layer_spec(is_vit, normalization) -> ModuleSpec:
  function get_layer_spec_te (line 98) | def get_layer_spec_te(is_vit=False, padding=False) -> ModuleSpec:
  function get_mamba_layer_spec_te (line 128) | def get_mamba_layer_spec_te(padding=False) -> ModuleSpec:
  function get_mlp_module_spec (line 187) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
  function get_norm_mlp_module_spec_te (line 198) | def get_norm_mlp_module_spec_te() -> ModuleSpec:

FILE: examples/multimodal/model.py
  function model_provider (line 18) | def model_provider(
  function _get_tile_tags (line 238) | def _get_tile_tags(args, tokenizer):

FILE: examples/multimodal/model_converter/clip_converter.py
  function convert (line 10) | def convert(download_root, output_path, tensor_parallel_size, use_te):

FILE: examples/multimodal/model_converter/internvit_converter.py
  function convert (line 8) | def convert(model_name, output_path, tensor_parallel_size, use_te):

FILE: examples/multimodal/model_converter/radio_converter.py
  function convert_radio_h (line 7) | def convert_radio_h(output_path, tensor_parallel_size, use_te, version):
  function convert_radio_g (line 127) | def convert_radio_g(output_path, tensor_parallel_size, use_te, version):
  function convert (line 279) | def convert(output_path, tensor_parallel_size, use_te, model_type, versi...

FILE: examples/multimodal/model_converter/siglip_converter.py
  function convert (line 8) | def convert(output_path, tensor_parallel_size, use_te):

FILE: examples/multimodal/model_converter/vision_model_tester.py
  function run_mcore_vision (line 24) | def run_mcore_vision(model_path):
  function run_hf_vision (line 74) | def run_hf_vision(model_name):
  function main (line 89) | def main(mcore_model, hf_model):

FILE: examples/multimodal/multimodal_args.py
  function add_multimodal_extra_args (line 5) | def add_multimodal_extra_args(parser):

FILE: examples/multimodal/nvlm/internvit.py
  class InternViTRMSNorm (line 61) | class InternViTRMSNorm(MegatronModule):
    method __init__ (line 63) | def __init__(
    method _norm (line 91) | def _norm(self, x, var):
    method forward (line 97) | def forward(self, x: torch.Tensor) -> torch.Tensor:
    method _gather_var (line 115) | def _gather_var(self, input_, max_dim):
    method sharded_state_dict (line 150) | def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}):
  function get_mlp_module_spec (line 163) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
  class InternViTSelfAttention (line 175) | class InternViTSelfAttention(SelfAttention):
    method __init__ (line 177) | def __init__(
  class InternViTTEDotProductAttention (line 214) | class InternViTTEDotProductAttention(TEDotProductAttention):
    method forward (line 218) | def forward(self, *args, **kwargs):
  function get_internvit_layer_spec (line 237) | def get_internvit_layer_spec(use_te) -> ModuleSpec:
  function get_internvit300M_layer_spec (line 263) | def get_internvit300M_layer_spec(use_te) -> ModuleSpec:

FILE: examples/multimodal/nvlm/pp_checkpoint_converter.py
  function split (line 14) | def split(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_l...
  function combine (line 82) | def combine(input_dir, base_output_dir, input_pp, output_pp, num_tp, num...

FILE: examples/multimodal/radio/radio_g.py
  function get_mlp_module_spec (line 54) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
  function get_norm_mlp_module_spec_te (line 65) | def get_norm_mlp_module_spec_te() -> ModuleSpec:
  function get_radio_g_layer_spec (line 75) | def get_radio_g_layer_spec(normalization) -> ModuleSpec:
  function get_radio_g_layer_spec_te (line 115) | def get_radio_g_layer_spec_te() -> ModuleSpec:

FILE: examples/multimodal/run_text_generation.py
  function is_first_rank (line 46) | def is_first_rank():
  function add_text_generation_args (line 54) | def add_text_generation_args(parser):
  function get_evaluation_dataloader (line 107) | def get_evaluation_dataloader(
  function generate_samples (line 156) | def generate_samples(model, config: EvaluationConfig, print_output):
  function get_evaluation_configs (line 365) | def get_evaluation_configs(config_path=None) -> Dict[str, EvaluationConf...
  function get_output_path (line 424) | def get_output_path(config, dp_rank):
  function generate_and_write_samples (line 439) | def generate_and_write_samples(model, config, print_output=True):
  class VLMForwardStep (line 457) | class VLMForwardStep(ForwardStep):
    method __init__ (line 460) | def __init__(
    method _forward (line 482) | def _forward(self, tokens, position_ids, attention_mask):
    method __call__ (line 493) | def __call__(self, tokens, position_ids, attention_mask):
  function get_conversation (line 536) | def get_conversation(task, question, metadata=None):
  function get_prompt_and_generated (line 648) | def get_prompt_and_generated(prompt_and_generation, prompt_format):
  function run_eval (line 690) | def run_eval(config, iteration=None):
  function run_evaluation_loop (line 804) | def run_evaluation_loop(model, configs, output_dir_override=None, iterat...
  function eval_tasks (line 843) | def eval_tasks():

FILE: examples/multimodal/train.py
  function get_batch (line 33) | def get_batch(data_iterator, image_token_index, img_seq_len):
  function get_ltor_masks_and_position_ids (line 152) | def get_ltor_masks_and_position_ids(input_ids, target, pad_token):
  function get_mask_start_and_end_idx (line 168) | def get_mask_start_and_end_idx(arr):
  function scaled_loss_func (line 193) | def scaled_loss_func(loss_mask, output_tensor):
  function loss_func (line 241) | def loss_func(loss_mask, output_tensor):
  function forward_step (line 254) | def forward_step(data_iterator, model: LLaVAModel):
  function llava_embedding_ranks (line 300) | def llava_embedding_ranks(pp_ranks):
  function llava_position_embedding_ranks (line 313) | def llava_position_embedding_ranks(pp_ranks):
  function run_online_eval (line 326) | def run_online_eval(model):
  function write_eval_to_tensorboard (line 353) | def write_eval_to_tensorboard(data, iteration, writer, walltime=None):
  function write_online_eval_to_tensorboard (line 363) | def write_online_eval_to_tensorboard(data, iteration, writer, walltime=N...

FILE: examples/post_training/modelopt/convert_model.py
  function add_convert_args (line 39) | def add_convert_args(parser):
  function get_model (line 73) | def get_model(model_provider_func, model_type=ModelType.encoder_or_decod...
  function check_arguments (line 91) | def check_arguments():

FILE: examples/post_training/modelopt/export.py
  function add_modelopt_export_args (line 27) | def add_modelopt_export_args(parser):

FILE: examples/post_training/modelopt/finetune.py
  function add_finetune_args (line 37) | def add_finetune_args(parser):
  function get_eos_id (line 46) | def get_eos_id():
  class OfflineDataset (line 66) | class OfflineDataset(torch.utils.data.Dataset):
    method __init__ (line 67) | def __init__(self, data_dir: str, num_samples):
    method __len__ (line 77) | def __len__(self):
    method __getitem__ (line 80) | def __getitem__(self, idx):
  class SFTDataset (line 86) | class SFTDataset(torch.utils.data.Dataset):
    method _wildcard_get (line 112) | def _wildcard_get(cls, directory: Dict[str, Any], name: str, default_v...
    method __init__ (line 120) | def __init__(
    method __len__ (line 195) | def __len__(self):
    method __getitem__ (line 198) | def __getitem__(self, idx):
    method _process_and_pack_example (line 232) | def _process_and_pack_example(self):
    method _process_example (line 260) | def _process_example(self, example: Dict[str, Any]):
    method _to_conversation (line 305) | def _to_conversation(cls, question, response):
    method _sharegpt_to_openai_conversations (line 311) | def _sharegpt_to_openai_conversations(cls, data):
    method _special_to_openai_conversations (line 330) | def _special_to_openai_conversations(cls, data):
  function train_valid_test_sft_datasets_provider (line 335) | def train_valid_test_sft_datasets_provider(train_val_test_num_samples):
  function get_batch (line 377) | def get_batch(data_iterator):
  function non_loss_data_func (line 444) | def non_loss_data_func(model: GPTModel):
  function forward_step (line 455) | def forward_step(data_iterator, model: GPTModel):

FILE: examples/post_training/modelopt/generate.py
  function add_generate_args (line 28) | def add_generate_args(parser):
  function check_arguments (line 41) | def check_arguments():
  function mtbench_to_oai_chat (line 53) | def mtbench_to_oai_chat(example):
  function get_conversations (line 62) | def get_conversations(example):

FILE: examples/post_training/modelopt/mmlu.py
  function add_mmlu_args (line 32) | def add_mmlu_args(parser):
  function get_all_subjects (line 45) | def get_all_subjects():
  function format_example (line 108) | def format_example(example, include_answer: bool = True):
  function generate_prompt (line 120) | def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_...

FILE: examples/post_training/modelopt/offline_feature_extract.py
  function add_extract_args (line 23) | def add_extract_args(parser):
  function extract_feature (line 32) | def extract_feature(dataset, model, output_dir, idx_start, idx_end):

FILE: examples/post_training/modelopt/prune.py
  function add_prune_args (line 43) | def add_prune_args(parser):
  function check_arguments (line 125) | def check_arguments(args):
  function get_calib_dataloader (line 132) | def get_calib_dataloader(calib_size=1024, max_sequence_length=512):
  function get_params (line 142) | def get_params(model):
  function _custom_prompt_forward_loop_func (line 187) | def _custom_prompt_forward_loop_func(model):
  function _hf_dataset_forword_loop_func (line 202) | def _hf_dataset_forword_loop_func(model):

FILE: examples/post_training/modelopt/quantize.py
  function add_text_generate_ptq_args (line 76) | def add_text_generate_ptq_args(parser):
  function check_arguments (line 145) | def check_arguments():
  function _is_first_layers (line 157) | def _is_first_layers(name: str, num_layers: int = 1, num_layers_to_disab...
  function _is_last_layers (line 167) | def _is_last_layers(name: str, num_layers: int = 1, num_layers_to_disabl...
  function get_first_layers_disabled_config (line 177) | def get_first_layers_disabled_config(config, num_layers: int = 1, num_la...
  function get_last_layers_disabled_config (line 195) | def get_last_layers_disabled_config(config, num_layers: int = 1, num_lay...
  function get_modelopt_torch_quantization_config (line 213) | def get_modelopt_torch_quantization_config():
  function get_calib_dataloader (line 270) | def get_calib_dataloader(
  function _custom_prompt_forward_loop_func (line 382) | def _custom_prompt_forward_loop_func(model):
  function _dataset_forward_loop_func (line 397) | def _dataset_forward_loop_func(model):

FILE: examples/post_training/modelopt/validate.py
  function add_ar_validation_args (line 27) | def add_ar_validation_args(parser):
  function check_arguments (line 59) | def check_arguments():
  function get_current_memory_info (line 71) | def get_current_memory_info():
  function report_current_memory_info (line 82) | def report_current_memory_info():

FILE: examples/rl/benchmark_refit.py
  function add_benchmark_args (line 24) | def add_benchmark_args(parser):
  function model_provider (line 51) | def model_provider(pre_process=True, post_process=True, parallel_output=...
  function create_refit_service (line 67) | def create_refit_service(method):
  function print_config_summary (line 79) | def print_config_summary(args, src_config, dst_config, world_size, mode):
  function run_benchmark (line 94) | def run_benchmark(src_model, dst_model, refit_service, num_warmup, num_i...
  function print_results (line 129) | def print_results(timings):
  function benchmark_collocated (line 145) | def benchmark_collocated():
  function benchmark_non_collocated (line 220) | def benchmark_non_collocated():
  function main (line 316) | def main():

FILE: examples/rl/environments/countdown/countdown.py
  function extract_solution (line 6) | def extract_solution(solution_str: str, remove_prompt: bool = False):
  function validate_equation (line 28) | def validate_equation(equation_str, available_numbers):
  function evaluate_equation (line 44) | def evaluate_equation(equation_str):
  function compute_score (line 59) | def compute_score(solution_str, ground_truth, method='strict', format_sc...

FILE: examples/rl/environments/countdown/countdown_agent.py
  class CountdownAgent (line 12) | class CountdownAgent(RewardOnlyAgent, HFDatasetAgent):
    method make_prefix (line 15) | def make_prefix(self, target, nums) -> str:
    method get_dataset (line 20) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 29) | async def evaluation_prompts(
    method get_prompt (line 38) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 43) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/aime_agent.py
  class AIMEAgent (line 15) | class AIMEAgent(MathAgent):
    method get_dataset (line 18) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 22) | async def evaluation_prompts(
    method get_prompt (line 34) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 44) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/bigmath_agent.py
  class BigMathAgent (line 16) | class BigMathAgent(MathAgent):
    method get_dataset (line 19) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 22) | async def evaluation_prompts(
    method get_prompt (line 31) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 37) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/dapo_agent.py
  class DAPOAgent (line 15) | class DAPOAgent(MathAgent):
    method reformat_datum (line 18) | def reformat_datum(self, datum: dict) -> dict:
    method get_dataset (line 30) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 33) | async def evaluation_prompts(
    method get_prompt (line 43) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 50) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/gsm8k_agent.py
  class GSM8KAgent (line 25) | class GSM8KAgent(MathAgent):
    method __init__ (line 26) | def __init__(self,
    method reformat_datum (line 41) | def reformat_datum(self, datum: dict) -> dict:
    method get_dataset (line 48) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 51) | async def evaluation_prompts(
    method get_prompt (line 60) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 67) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/math_agent.py
  class MathAgent (line 23) | class MathAgent(RewardOnlyAgent):
    method __init__ (line 24) | def __init__(self,
    method compute_score (line 49) | def compute_score(self, response: str, golden: dict, golden_key: str =...
    method make_prefix (line 120) | def make_prefix(self, problem_key: str = "problem", **kwargs) -> str:

FILE: examples/rl/environments/math/openmath_agent.py
  class OpenMathInstructAgent (line 16) | class OpenMathInstructAgent(MathAgent):
    method get_dataset (line 19) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 22) | async def evaluation_prompts(
    method get_prompt (line 31) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 37) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/run_simple_mcore_train_loop.py
  function initialize_distributed (line 32) | def initialize_distributed(
  function model_provider (line 60) | def model_provider() -> GPTModel:
  function get_train_data_iterator (line 85) | def get_train_data_iterator() -> Iterator:
  function forward_step_func (line 123) | def forward_step_func(
  function save_distributed_checkpoint (line 163) | def save_distributed_checkpoint(
  function load_distributed_checkpoint (line 183) | def load_distributed_checkpoint(

FILE: gpt_builders.py
  function gpt_builder (line 28) | def gpt_builder(args, pre_process, post_process, vp_stage=None, config=N...
  function _get_transformer_layer_spec (line 116) | def _get_transformer_layer_spec(use_te, config):

FILE: mamba_builders.py
  function mamba_builder (line 12) | def mamba_builder(args, pre_process, post_process, vp_stage=None, config...

FILE: megatron/core/_rank_utils.py
  function safe_get_rank (line 12) | def safe_get_rank() -> int:
  function log_single_rank (line 31) | def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, *...

FILE: megatron/core/activations.py
  function squared_relu (line 9) | def squared_relu(x: torch.Tensor) -> torch.Tensor:
  function quick_gelu (line 15) | def quick_gelu(x: torch.Tensor) -> torch.Tensor:
  function fast_gelu (line 21) | def fast_gelu(x: torch.Tensor) -> torch.Tensor:

FILE: megatron/core/config.py
  function set_experimental_flag (line 6) | def set_experimental_flag(flag: bool):
  function is_experimental_enabled (line 12) | def is_experimental_enabled():

FILE: megatron/core/config_logger.py
  function get_config_logger_path (line 25) | def get_config_logger_path(config):
  function has_config_logger_enabled (line 30) | def has_config_logger_enabled(config):
  function get_path_count (line 40) | def get_path_count(path):
  function get_path_with_count (line 52) | def get_path_with_count(path):
  class JSONEncoderWithMcoreTypes (line 59) | class JSONEncoderWithMcoreTypes(json.JSONEncoder):
    method default (line 64) | def default(self, o):
  function log_config_to_disk (line 97) | def log_config_to_disk(config, dict_data, prefix='', rank_str=''):

FILE: megatron/core/datasets/bert_dataset.py
  class BERTMaskedWordPieceDatasetConfig (line 17) | class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
    method __post_init__ (line 23) | def __post_init__(self) -> None:
  class BERTMaskedWordPieceDataset (line 30) | class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
    method __init__ (line 44) | def __init__(
    method _key_config_attributes (line 64) | def _key_config_attributes() -> List[str]:
    method __getitem__ (line 74) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
    method _get_token_mask (line 173) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState...

FILE: megatron/core/datasets/blended_dataset.py
  class BlendedDataset (line 24) | class BlendedDataset(torch.utils.data.Dataset):
    method __init__ (line 41) | def __init__(
    method __len__ (line 88) | def __len__(self) -> int:
    method __getitem__ (line 97) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
    method _build_indices (line 110) | def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:

FILE: megatron/core/datasets/blended_megatron_dataset_builder.py
  class BlendedMegatronDatasetBuilder (line 29) | class BlendedMegatronDatasetBuilder(object):
    method __init__ (line 45) | def __init__(
    method build (line 77) | def build(self) -> List[Optional[TopLevelDataset]]:
    method _build_blended_dataset_splits (line 136) | def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDatas...
    method _build_megatron_datasets_parallel (line 331) | def _build_megatron_datasets_parallel(
    method _build_megatron_dataset_splits (line 416) | def _build_megatron_dataset_splits(
    method build_generic_dataset (line 491) | def build_generic_dataset(
  function _get_size_per_split_per_dataset (line 553) | def _get_size_per_split_per_dataset(

FILE: megatron/core/datasets/blended_megatron_dataset_config.py
  class BlendedMegatronDatasetConfig (line 16) | class BlendedMegatronDatasetConfig:
    method __post_init__ (line 99) | def __post_init__(self) -> None:
  function parse_and_normalize_split (line 155) | def parse_and_normalize_split(split: str) -> List[float]:
  function convert_split_vector_to_split_matrix (line 175) | def convert_split_vector_to_split_matrix(

FILE: megatron/core/datasets/data_schedule.py
  class HybridCPDataLoaderWrapper (line 12) | class HybridCPDataLoaderWrapper:
    method __init__ (line 28) | def __init__(
    method __iter__ (line 51) | def __iter__(self):
    method get_global_seqlens (line 55) | def get_global_seqlens(self, subsample_seqlens: torch.Tensor) -> List[...
    method get_global_id_seqlens (line 105) | def get_global_id_seqlens(self, num_local_subsamples, offsets, seqlens...
    method _gid_to_src_rank (line 126) | def _gid_to_src_rank(self, gid: int, offsets: List[int]) -> int:
    method reroute_samples_to_hdp_ranks (line 136) | def reroute_samples_to_hdp_ranks(
    method unpack_batch (line 245) | def unpack_batch(self, batch):
    method __next__ (line 267) | def __next__(self) -> Any:

FILE: megatron/core/datasets/gpt_dataset.py
  class GPTDatasetConfig (line 25) | class GPTDatasetConfig(BlendedMegatronDatasetConfig):
    method __post_init__ (line 79) | def __post_init__(self) -> None:
  class GPTDataset (line 101) | class GPTDataset(MegatronDataset):
    method __init__ (line 119) | def __init__(
    method numel_low_level_dataset (line 148) | def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
    method build_low_level_dataset (line 163) | def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfi...
    method __len__ (line 196) | def __len__(self) -> int:
    method __getitem__ (line 225) | def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]:
    method _query_document_sample_shuffle_indices (line 298) | def _query_document_sample_shuffle_indices(
    method _build_document_sample_shuffle_indices (line 381) | def _build_document_sample_shuffle_indices(
    method _get_num_tokens_per_epoch (line 609) | def _get_num_tokens_per_epoch(self) -> int:
    method _get_num_epochs (line 617) | def _get_num_epochs(self, num_tokens_per_epoch: int) -> int:
  function _build_document_index (line 640) | def _build_document_index(
  function _build_shuffle_index (line 674) | def _build_shuffle_index(
  function _get_ltor_masks_and_position_ids (line 706) | def _get_ltor_masks_and_position_ids(
  class MockGPTLowLevelDataset (line 783) | class MockGPTLowLevelDataset:
    method __init__ (line 803) | def __init__(self, tokenizer: MegatronTokenizerBase) -> None:
    method __len__ (line 811) | def __len__(self) -> int:
    method __getitem__ (line 814) | def __getitem__(self, idx: int) -> numpy.number:
    method get (line 821) | def get(self, idx: int, offset: int = 0, length: Optional[int] = None)...
  class MockGPTDataset (line 839) | class MockGPTDataset(GPTDataset):
    method __init__ (line 857) | def __init__(
    method numel_low_level_dataset (line 878) | def numel_low_level_dataset(low_level_dataset: MockGPTLowLevelDataset)...
    method build_low_level_dataset (line 890) | def build_low_level_dataset(  # type: ignore[override]

FILE: megatron/core/datasets/helpers.cpp
  function build_exhaustive_blending_indices (line 22) | void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_ind...
  function build_blending_indices (line 77) | void build_blending_indices(py::array_t<int16_t> &dataset_index,
  function build_sample_idx (line 145) | py::array_t<T> build_sample_idx(
  function get_target_sample_len (line 251) | inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
  function build_mapping_impl (line 269) | py::array build_mapping_impl(const py::array_t<int64_t> &docs_,
  function build_mapping (line 529) | py::array build_mapping(const py::array_t<int64_t> &docs_,
  function build_blocks_mapping_impl (line 567) | py::array build_blocks_mapping_impl(const py::array_t<int64_t> &docs_,
  function build_blocks_mapping (line 808) | py::array build_blocks_mapping(const py::array_t<int64_t> &docs_,
  function PYBIND11_MODULE (line 841) | PYBIND11_MODULE(helpers_cpp, m)

FILE: megatron/core/datasets/helpers.py
  function build_sample_idx (line 12) | def build_sample_idx(

FILE: megatron/core/datasets/indexed_dataset.py
  class DType (line 50) | class DType(Enum):
    method code_from_dtype (line 63) | def code_from_dtype(cls, value: Type[numpy.number]) -> int:
    method dtype_from_code (line 75) | def dtype_from_code(cls, value: int) -> Type[numpy.number]:
    method size (line 87) | def size(key: Union[int, Type[numpy.number]]) -> int:
    method optimal_dtype (line 107) | def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]:
  class _IndexWriter (line 122) | class _IndexWriter(object):
    method __init__ (line 131) | def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None:
    method __enter__ (line 135) | def __enter__(self) -> "_IndexWriter":
    method __exit__ (line 154) | def __exit__(
    method write (line 175) | def write(
    method _sequence_pointers (line 213) | def _sequence_pointers(
  class _IndexReader (line 233) | class _IndexReader(object):
    method __init__ (line 246) | def __init__(
    method __del__ (line 336) | def __del__(self) -> None:
    method __len__ (line 342) | def __len__(self) -> int:
    method __getitem__ (line 351) | def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Opt...
  class _BinReader (line 368) | class _BinReader(ABC):
    method read (line 372) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
  class _MMapBinReader (line 389) | class _MMapBinReader(_BinReader):
    method __init__ (line 396) | def __init__(self, bin_path: str) -> None:
    method read (line 405) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
    method __del__ (line 421) | def __del__(self) -> None:
  class _FileBinReader (line 431) | class _FileBinReader(_BinReader):
    method __init__ (line 438) | def __init__(
    method read (line 447) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
  class _S3BinReader (line 500) | class _S3BinReader(_BinReader):
    method __init__ (line 513) | def __init__(self, bin_path: str, object_storage_config: ObjectStorage...
    method _extract_from_cache (line 523) | def _extract_from_cache(self, offset: int, size: int) -> bytes:
    method read (line 532) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
    method __del__ (line 581) | def __del__(self) -> None:
  class _MultiStorageClientBinReader (line 586) | class _MultiStorageClientBinReader(_BinReader):
    method __init__ (line 595) | def __init__(self, bin_path: str, object_storage_config: ObjectStorage...
    method read (line 599) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
  class IndexedDataset (line 611) | class IndexedDataset(torch.utils.data.Dataset):
    method __init__ (line 634) | def __init__(
    method initialize (line 678) | def initialize(
    method __getstate__ (line 736) | def __getstate__(self) -> Tuple[str, bool, bool, Optional[ObjectStorag...
    method __setstate__ (line 752) | def __setstate__(self, state: Tuple[str, bool, bool, Optional[ObjectSt...
    method __del__ (line 777) | def __del__(self) -> None:
    method __len__ (line 782) | def __len__(self) -> int:
    method __getitem__ (line 790) | def __getitem__(
    method get (line 843) | def get(
    method sequence_lengths (line 872) | def sequence_lengths(self) -> numpy.ndarray:
    method document_indices (line 881) | def document_indices(self) -> numpy.ndarray:
    method get_document_indices (line 889) | def get_document_indices(self) -> numpy.ndarray:
    method set_document_indices (line 899) | def set_document_indices(self, document_indices: numpy.ndarray) -> None:
    method sequence_modes (line 910) | def sequence_modes(self) -> numpy.ndarray:
    method exists (line 920) | def exists(path_prefix: str) -> bool:
  class IndexedDatasetBuilder (line 937) | class IndexedDatasetBuilder(object):
    method __init__ (line 948) | def __init__(
    method add_item (line 965) | def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None:
    method add_document (line 979) | def add_document(
    method end_document (line 999) | def end_document(self) -> None:
    method add_index (line 1003) | def add_index(self, path_prefix: str) -> None:
    method finalize (line 1029) | def finalize(self, idx_path: str) -> None:
  function get_idx_path (line 1040) | def get_idx_path(path_prefix: str) -> str:
  function get_bin_path (line 1052) | def get_bin_path(path_prefix: str) -> str:

FILE: megatron/core/datasets/masked_dataset.py
  class MaskedWordPieceDatasetConfig (line 23) | class MaskedWordPieceDatasetConfig(BlendedMegatronDatasetConfig):
    method __post_init__ (line 49) | def __post_init__(self) -> None:
  class MaskedWordPieceDataset (line 76) | class MaskedWordPieceDataset(MegatronDataset):
    method __init__ (line 102) | def __init__(
    method numel_low_level_dataset (line 116) | def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
    method build_low_level_dataset (line 128) | def build_low_level_dataset(
    method _key_config_attributes (line 144) | def _key_config_attributes() -> List[str]:
    method __len__ (line 160) | def __len__(self) -> int:
    method _build_sample_index (line 163) | def _build_sample_index(
    method _create_masked_lm_predictions (line 247) | def _create_masked_lm_predictions(
    method _get_token_mask (line 440) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState...

FILE: megatron/core/datasets/megatron_dataset.py
  class MegatronDataset (line 23) | class MegatronDataset(ABC, torch.utils.data.Dataset):
    method __init__ (line 41) | def __init__(
    method numel_low_level_dataset (line 117) | def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int:
    method build_low_level_dataset (line 134) | def build_low_level_dataset(
    method _key_config_attributes (line 155) | def _key_config_attributes() -> List[str]:
    method __len__ (line 167) | def __len__(self) -> int:
    method __getitem__ (line 176) | def __getitem__(self, idx: int) -> Dict[str, Union[torch.Tensor, numpy...

FILE: megatron/core/datasets/multimodal_dataset.py
  class MultimodalDatasetConfig (line 12) | class MultimodalDatasetConfig(GPTDatasetConfig):
    method __post_init__ (line 28) | def __post_init__(self) -> None:
  class MockMultimodalDataset (line 35) | class MockMultimodalDataset(MockGPTDataset):
    method __getitem__ (line 42) | def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:

FILE: megatron/core/datasets/object_storage_utils.py
  class ObjectStorageConfig (line 21) | class ObjectStorageConfig:
  class S3Client (line 46) | class S3Client(Protocol):
    method download_file (line 49) | def download_file(self, Bucket: str, Key: str, Filename: str) -> None:
    method upload_file (line 53) | def upload_file(self, Filename: str, Bucket: str, Key: str) -> None:
    method head_object (line 57) | def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]:
    method get_object (line 61) | def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, A...
    method close (line 65) | def close(self) -> None:
  function _remove_s3_prefix (line 70) | def _remove_s3_prefix(path: str) -> str:
  function _is_s3_path (line 82) | def _is_s3_path(path: str) -> bool:
  function _remove_msc_prefix (line 94) | def _remove_msc_prefix(path: str) -> str:
  function _is_msc_path (line 107) | def _is_msc_path(path: str) -> bool:
  function _s3_download_file (line 119) | def _s3_download_file(client: S3Client, s3_path: str, local_path: str) -...
  function _s3_object_exists (line 135) | def _s3_object_exists(client: S3Client, path: str) -> bool:
  function is_object_storage_path (line 158) | def is_object_storage_path(path: str) -> bool:
  function get_index_cache_path (line 170) | def get_index_cache_path(idx_path: str, object_storage_config: ObjectSto...
  function parse_s3_path (line 195) | def parse_s3_path(path: str) -> Tuple[str, str]:
  function get_object_storage_access (line 215) | def get_object_storage_access(path: str) -> str:
  function dataset_exists (line 220) | def dataset_exists(path_prefix: str, idx_path: str, bin_path: str) -> bool:
  function cache_index_file (line 243) | def cache_index_file(remote_path: str, local_path: str) -> None:

FILE: megatron/core/datasets/t5_dataset.py
  class T5MaskedWordPieceDatasetConfig (line 22) | class T5MaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
    method __post_init__ (line 36) | def __post_init__(self) -> None:
  class T5MaskedWordPieceDataset (line 48) | class T5MaskedWordPieceDataset(MaskedWordPieceDataset):
    method __init__ (line 67) | def __init__(
    method _key_config_attributes (line 85) | def _key_config_attributes() -> List[str]:
    method _build_b1ss_attention_mask (line 96) | def _build_b1ss_attention_mask(
    method config_attention_mask (line 128) | def config_attention_mask(
    method __getitem__ (line 225) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
    method _get_token_mask (line 329) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState...

FILE: megatron/core/datasets/utils.py
  class Split (line 14) | class Split(Enum):
  function compile_helpers (line 20) | def compile_helpers():
  function normalize (line 33) | def normalize(weights: List[float]) -> List[float]:
  function get_blend_from_list (line 49) | def get_blend_from_list(

FILE: megatron/core/dist_checkpointing/core.py
  class CheckpointingException (line 15) | class CheckpointingException(Exception):
  class CheckpointingConfig (line 22) | class CheckpointingConfig:
  function check_is_distributed_checkpoint (line 38) | def check_is_distributed_checkpoint(checkpoint_dir):
  function maybe_load_config (line 50) | def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConf...
  function save_config (line 76) | def save_config(config: CheckpointingConfig, checkpoint_dir: str):

FILE: megatron/core/dist_checkpointing/dict_utils.py
  function extract_matching_values (line 18) | def extract_matching_values(
  function diff (line 69) | def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
  function inspect_types (line 138) | def inspect_types(x: Any, prefix: Tuple = (), indent: int = 4):
  function nested_values (line 166) | def nested_values(x: Union[dict, list]):
  function nested_items_iter (line 176) | def nested_items_iter(x: Union[dict, list]):
  function dict_map (line 186) | def dict_map(f: Callable, d: dict):
  function dict_map_with_key (line 192) | def dict_map_with_key(f: Callable, d: dict):
  function dict_list_map_inplace (line 198) | def dict_list_map_inplace(f: Callable[[U], V], x: Union[Dict, List, U]):
  function dict_list_map_outplace (line 210) | def dict_list_map_outplace(f: Callable[[U], V], x: Union[Dict, List, U])...
  function merge (line 220) | def merge(x1: Union[dict, list], x2: Union[dict, list], key: Tuple[Union...
  function map_reduce (line 244) | def map_reduce(

FILE: megatron/core/dist_checkpointing/exchange_utils.py
  function is_float8tensor (line 32) | def is_float8tensor(tensor: torch.Tensor) -> bool:
  class ShardDistribution (line 40) | class ShardDistribution(NamedTuple):
  function _shard_size (line 63) | def _shard_size(sh_ten: ShardedTensor):
  function _get_empty_tensor_for_exchange (line 69) | def _get_empty_tensor_for_exchange(
  function distribute_shards_to_ranks (line 118) | def distribute_shards_to_ranks(
  function determine_main_replica_uniform_distribution (line 174) | def determine_main_replica_uniform_distribution(
  function exchange_loaded_tensors_gather_rounds (line 257) | def exchange_loaded_tensors_gather_rounds(
  function exchange_loaded_tensors_gather_object (line 375) | def exchange_loaded_tensors_gather_object(
  function exchange_loaded_objects_gather_object (line 421) | def exchange_loaded_objects_gather_object(
  function exchange_loaded_tensors_broadcast (line 454) | def exchange_loaded_tensors_broadcast(
  function exchange_by_distribution (line 538) | def exchange_by_distribution(

FILE: megatron/core/dist_checkpointing/mapping.py
  class ShardedBase (line 34) | class ShardedBase(ABC):
    method validate_metadata_integrity (line 42) | def validate_metadata_integrity(self):
    method without_data (line 46) | def without_data(self) -> "ShardedBase":
  class ShardedTensor (line 52) | class ShardedTensor(ShardedBase):
    method __post_init__ (line 93) | def __post_init__(self):
    method validate_metadata_integrity (line 96) | def validate_metadata_integrity(self) -> None:
    method has_regular_grid (line 137) | def has_regular_grid(self):
    method global_slice (line 141) | def global_slice(self) -> Tuple[Union[int, slice], ...]:
    method local_chunk_offset_in_global (line 159) | def local_chunk_offset_in_global(self) -> Tuple[int, ...]:
    method max_allowed_chunks (line 172) | def max_allowed_chunks(self) -> Tuple[int, ...]:
    method without_data (line 186) | def without_data(self):
    method from_rank_offsets (line 190) | def from_rank_offsets(
    method init_data (line 247) | def init_data(self, device: Union[str, torch.device], init_fn=torch.em...
    method narrow (line 262) | def narrow(self, dim: int, start: int, length: int) -> List["ShardedTe...
  function is_main_replica (line 322) | def is_main_replica(replica_id: ReplicaId):
  class LocalNonpersistentObject (line 342) | class LocalNonpersistentObject:
    method __init__ (line 351) | def __init__(self, obj):
    method unwrap (line 354) | def unwrap(self):
  class ShardedObject (line 360) | class ShardedObject(ShardedBase):
    method __post_init__ (line 384) | def __post_init__(self):
    method validate_metadata_integrity (line 387) | def validate_metadata_integrity(self):
    method without_data (line 393) | def without_data(self):
    method unique_key (line 397) | def unique_key(self):
    method __str__ (line 405) | def __str__(self):
    method empty_from_unique_key (line 409) | def empty_from_unique_key(cls, unique_key, replica_id: ReplicaId = 0) ...
  class ShardedTensorFactory (line 438) | class ShardedTensorFactory(ShardedBase):
    method build (line 471) | def build(self):
    method validate_metadata_integrity (line 475) | def validate_metadata_integrity(self):
    method without_data (line 479) | def without_data(self):
  function apply_factories (line 483) | def apply_factories(sharded_state_dict: ShardedStateDict):
  function apply_factory_merges (line 502) | def apply_factory_merges(

FILE: megatron/core/dist_checkpointing/optimizer.py
  function get_optim_param_to_id_map (line 35) | def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Param...
  function get_param_id_to_sharded_param_map (line 45) | def get_param_id_to_sharded_param_map(
  function make_sharded_optimizer_tensor (line 83) | def make_sharded_optimizer_tensor(
  function optim_state_to_sharding_state (line 111) | def optim_state_to_sharding_state(

FILE: megatron/core/dist_checkpointing/serialization.py
  function load (line 61) | def load(
  function load_common_state_dict (line 174) | def load_common_state_dict(checkpoint_dir: Union[str, Path]) -> StateDict:
  function load_tensors_metadata (line 196) | def load_tensors_metadata(
  function load_sharded_metadata (line 227) | def load_sharded_metadata(
  function load_plain_tensors (line 270) | def load_plain_tensors(checkpoint_dir: str) -> StateDict:
  function load_content_metadata (line 287) | def load_content_metadata(
  function remove_sharded_tensors (line 308) | def remove_sharded_tensors(checkpoint_dir: str, key_prefix: str):
  function save (line 314) | def save(
  function get_default_save_sharded_strategy (line 442) | def get_default_save_sharded_strategy(
  function get_default_save_common_strategy (line 449) | def get_default_save_common_strategy(
  function get_default_load_sharded_strategy (line 456) | def get_default_load_sharded_strategy(

FILE: megatron/core/dist_checkpointing/state_dict_utils.py
  function save_preprocess (line 20) | def save_preprocess(
  function load_preprocess (line 62) | def load_preprocess(sharded_state_dict: ShardedStateDict):
  function filter_out_empty_flatten_tensor (line 96) | def filter_out_empty_flatten_tensor(sharded_state_dict: Union[dict, list]):

FILE: megatron/core/dist_checkpointing/strategies/async_utils.py
  function _set_process_qos (line 28) | def _set_process_qos(cpu_priority: int, io_priority: Optional[int]) -> N...
  function _disable_gc (line 85) | def _disable_gc():
  class AsyncRequest (line 97) | class AsyncRequest(NamedTuple):
    method add_finalize_fn (line 123) | def add_finalize_fn(self, fn: Callable) -> None:
    method execute_sync (line 137) | def execute_sync(self) -> None:
    method freeze (line 163) | def freeze(self) -> 'AsyncRequest':
  class AsyncCaller (line 173) | class AsyncCaller(ABC):
    method schedule_async_call (line 180) | def schedule_async_call(self, async_req: AsyncRequest) -> None:
    method is_current_async_call_done (line 193) | def is_current_async_call_done(self, blocking: bool, no_dist: bool) ->...
    method sync_all_async_calls (line 213) | def sync_all_async_calls(self, is_alive: int) -> bool:
    method close (line 228) | def close(self, abort=False):
    method __del__ (line 232) | def __del__(self):
  class TemporalAsyncCaller (line 236) | class TemporalAsyncCaller(AsyncCaller):
    method __init__ (line 242) | def __init__(self):
    method schedule_async_call (line 247) | def schedule_async_call(self, async_req: AsyncRequest) -> None:
    method is_current_async_call_done (line 283) | def is_current_async_call_done(self, blocking: bool = False, no_dist: ...
    method close (line 315) | def close(self, abort=False):
    method __del__ (line 343) | def __del__(self):
  class PersistentAsyncCaller (line 347) | class PersistentAsyncCaller(AsyncCaller):
    method __init__ (line 358) | def __init__(self):
    method _get_process (line 365) | def _get_process(
    method schedule_async_call (line 395) | def schedule_async_call(self, async_req: AsyncRequest) -> None:
    method is_current_async_call_done (line 435) | def is_current_async_call_done(self, blocking: bool = False, no_dist: ...
    method close (line 488) | def close(self, abort=False):
    method __del__ (line 517) | def __del__(self):
    method async_loop (line 522) | def async_loop(
  class _ActiveAsyncRequest (line 599) | class _ActiveAsyncRequest(NamedTuple):
  class AsyncCallsQueue (line 614) | class AsyncCallsQueue:
    method __init__ (line 623) | def __init__(self, persistent: bool = False):
    method _get_async_caller (line 628) | def _get_async_caller(self):
    method warmup_persistent_caller (line 636) | def warmup_persistent_caller(
    method schedule_async_request (line 646) | def schedule_async_request(self, async_request: AsyncRequest) -> int:
    method maybe_finalize_async_calls (line 670) | def maybe_finalize_async_calls(self, blocking=False, no_dist=False) ->...
    method get_num_unfinalized_calls (line 707) | def get_num_unfinalized_calls(self):
    method close (line 711) | def close(self, abort=False):

FILE: megatron/core/dist_checkpointing/strategies/base.py
  class StrategyAction (line 15) | class StrategyAction(Enum):
  function get_default_strategy (line 29) | def get_default_strategy(action: StrategyAction, backend: str, version: ...
  function register_default_strategy (line 50) | def register_default_strategy(
  class LoadStrategyBase (line 67) | class LoadStrategyBase(ABC):
    method check_backend_compatibility (line 72) | def check_backend_compatibility(self, loaded_backend):
    method check_version_compatibility (line 77) | def check_version_compatibility(self, loaded_version):
    method can_handle_sharded_objects (line 82) | def can_handle_sharded_objects(self):
  class SaveStrategyBase (line 87) | class SaveStrategyBase(ABC):
    method __init__ (line 91) | def __init__(self, backend: str, version: int):
    method can_handle_sharded_objects (line 96) | def can_handle_sharded_objects(self):
    method __str__ (line 100) | def __str__(self):
  class LoadCommonStrategy (line 104) | class LoadCommonStrategy(LoadStrategyBase):
    method load_common (line 108) | def load_common(self, checkpoint_dir: Union[str, Path]):
    method load_sharded_objects (line 113) | def load_sharded_objects(
    method load_sharded_metadata (line 119) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]) -> S...
  class LoadShardedStrategy (line 126) | class LoadShardedStrategy(LoadStrategyBase):
    method load (line 130) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U...
    method load_tensors_metadata (line 135) | def load_tensors_metadata(self, checkpoint_dir: Union[str, Path]):
    method load_sharded_metadata (line 149) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]):
    method remove_sharded_tensors (line 164) | def remove_sharded_tensors(self, checkpoint_dir: Union[str, Path], key...
  class SaveCommonStrategy (line 169) | class SaveCommonStrategy(SaveStrategyBase):
    method save_common (line 173) | def save_common(self, common_state_dict: StateDict, checkpoint_dir: Un...
    method save_sharded_objects (line 177) | def save_sharded_objects(
  class SaveShardedStrategy (line 184) | class SaveShardedStrategy(SaveStrategyBase):
    method save (line 188) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U...
  class AsyncSaveShardedStrategy (line 193) | class AsyncSaveShardedStrategy(SaveShardedStrategy):
    method async_save (line 197) | def async_save(
    method save (line 212) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U...

FILE: megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py
  class CachedMetadataFileSystemReader (line 11) | class CachedMetadataFileSystemReader(FileSystemReader):
    method __init__ (line 24) | def __init__(self, path: Union[str, os.PathLike], cache_metadata: bool...
    method read_metadata (line 34) | def read_metadata(self) -> Metadata:
    method clear_metadata_cache (line 49) | def clear_metadata_cache(cls):

FILE: megatron/core/dist_checkpointing/strategies/checkpointable.py
  class CheckpointableShardedTensor (line 15) | class CheckpointableShardedTensor(torch.Tensor):
    method __new__ (line 21) | def __new__(cls, data: torch.Tensor, sh_ten: ShardedTensor):
    method __init__ (line 24) | def __init__(self, data: torch.Tensor, sh_ten: ShardedTensor):
    method __create_write_items__ (line 28) | def __create_write_items__(
    method __create_chunk_list__ (line 59) | def __create_chunk_list__(self) -> list[ChunkStorageMetadata]:
    method __get_tensor_shard__ (line 71) | def __get_tensor_shard__(self, index: MetadataIndex) -> torch.Tensor:
    method from_sh_ten (line 83) | def from_sh_ten(cls, sh_ten: ShardedTensor) -> 'CheckpointableShardedT...
    method __torch_dispatch__ (line 96) | def __torch_dispatch__(cls, func, types, args, kwargs=None):
    method __repr__ (line 103) | def __repr__(self):
  class LocalShardsContainer (line 107) | class LocalShardsContainer(torch.Tensor):
    method __new__ (line 117) | def __new__(cls, local_shards: list[torch.Tensor]) -> "LocalShardsCont...
    method __init__ (line 122) | def __init__(self, local_shards: list[torch.Tensor]):
    method __torch_dispatch__ (line 129) | def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
    method __create_write_items__ (line 136) | def __create_write_items__(
    method __create_chunk_list__ (line 155) | def __create_chunk_list__(self) -> list[ChunkStorageMetadata]:
    method __get_tensor_shard__ (line 165) | def __get_tensor_shard__(self, index: MetadataIndex) -> torch.Tensor:
    method __repr__ (line 195) | def __repr__(self):

FILE: megatron/core/dist_checkpointing/strategies/common.py
  function register_default_common_strategies (line 29) | def register_default_common_strategies():
  class TorchCommonSaveStrategy (line 37) | class TorchCommonSaveStrategy(SaveCommonStrategy):
    method save_common (line 40) | def save_common(self, common_state_dict: StateDict, checkpoint_dir: Un...
    method save_sharded_objects (line 50) | def save_sharded_objects(
    method can_handle_sharded_objects (line 66) | def can_handle_sharded_objects(self):
  class TorchCommonLoadStrategy (line 71) | class TorchCommonLoadStrategy(LoadCommonStrategy):
    method load_common (line 74) | def load_common(self, checkpoint_dir: Union[str, Path]):
    method load_sharded_objects (line 100) | def load_sharded_objects(
    method load_sharded_metadata (line 153) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]) -> S...
    method can_handle_sharded_objects (line 185) | def can_handle_sharded_objects(self):
    method check_backend_compatibility (line 189) | def check_backend_compatibility(self, loaded_version):
    method check_version_compatibility (line 192) | def check_version_compatibility(self, loaded_version):

FILE: megatron/core/dist_checkpointing/strategies/filesystem_async.py
  function get_write_results_queue (line 53) | def get_write_results_queue(mp_mode: str = 'spawn') -> mp.Queue:
  class FileSystemWriterAsync (line 69) | class FileSystemWriterAsync(FileSystemWriter):
    method __init__ (line 90) | def __init__(
    method prepare_write_data (line 114) | def prepare_write_data(self, plan: SavePlan, planner: SavePlanner) -> ...
    method get_save_function_and_args (line 201) | def get_save_function_and_args(self) -> Tuple[Optional[Callable], Opti...
    method preload_tensors (line 222) | def preload_tensors(write_buckets: List[WriteBucket], non_blocking=Tru...
    method write_preloaded_data_multithread (line 248) | def write_preloaded_data_multithread(
    method write_preloaded_data (line 359) | def write_preloaded_data(
    method write_data (line 438) | def write_data(self, plan: SavePlan, planner: SavePlanner) -> Future[L...
    method retrieve_write_results (line 442) | def retrieve_write_results(self) -> Union[List[WriteResult], WRAPPED_E...
    method prepare_decentralized_global_plan (line 478) | def prepare_decentralized_global_plan(self, local_plan: SavePlan) -> S...
    method finish (line 493) | def finish(self, metadata: Metadata, results: List[List[WriteResult]])...
    method prepare_local_plan (line 518) | def prepare_local_plan(self, plan: SavePlan) -> SavePlan:
    method checkpoint_id (line 532) | def checkpoint_id(self) -> Union[str, os.PathLike]:
    method validate_checkpoint_id (line 539) | def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]...
  function _split_by_size_and_type (line 554) | def _split_by_size_and_type(bins: int, items: List[WriteItem]) -> List[L...
  function _split_by_separation_hint (line 600) | def _split_by_separation_hint(
  function _item_size (line 631) | def _item_size(item: WriteItem) -> int:
  function _process_memory (line 653) | def _process_memory() -> int:

FILE: megatron/core/dist_checkpointing/strategies/fully_parallel.py
  class FullyParallelSaveStrategyWrapper (line 48) | class FullyParallelSaveStrategyWrapper(AsyncSaveShardedStrategy):
    method __init__ (line 73) | def __init__(
    method async_save (line 88) | def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_...
    method save (line 96) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P...
    method apply_saving_parallelization (line 100) | def apply_saving_parallelization(self, sharded_state_dict: ShardedStat...
    method can_handle_sharded_objects (line 137) | def can_handle_sharded_objects(self):
  class FullyParallelLoadStrategyWrapper (line 141) | class FullyParallelLoadStrategyWrapper(LoadShardedStrategy):
    method __init__ (line 167) | def __init__(
    method load (line 188) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P...
    method _defer_loading_sharded_objects (line 293) | def _defer_loading_sharded_objects(
    method _defer_loading_sharded_tensors (line 304) | def _defer_loading_sharded_tensors(
    method fill_in_deferred_sharded_objects (line 317) | def fill_in_deferred_sharded_objects(
    method fill_in_deferred_sharded_tensors (line 336) | def fill_in_deferred_sharded_tensors(
    method apply_loading_parallelization (line 354) | def apply_loading_parallelization(
    method can_handle_sharded_objects (line 392) | def can_handle_sharded_objects(self):
    method load_tensors_metadata (line 395) | def load_tensors_metadata(self, checkpoint_dir: Path):
    method load_sharded_metadata (line 398) | def load_sharded_metadata(self, checkpoint_dir: Path):
    method check_backend_compatibility (line 401) | def check_backend_compatibility(self, loaded_version):
    method check_version_compatibility (line 404) | def check_version_compatibility(self, loaded_version):
  function distribute_main_replicas_with_precomputed_distribution (line 408) | def distribute_main_replicas_with_precomputed_distribution(
  function _defer_loading_sharded_items (line 465) | def _defer_loading_sharded_items(
  function _fill_in_deferred_sharded_items (line 502) | def _fill_in_deferred_sharded_items(

FILE: megatron/core/dist_checkpointing/strategies/state_dict_saver.py
  function _compare_dataclasses (line 27) | def _compare_dataclasses(obj1, obj2):
  function save_state_dict_async_plan (line 41) | def save_state_dict_async_plan(
  function verify_global_md_reuse (line 171) | def verify_global_md_reuse(
  function save_state_dict_async_finalize (line 213) | def save_state_dict_async_finalize(

FILE: megatron/core/dist_checkpointing/strategies/torch.py
  class MCoreMetadata (line 87) | class MCoreMetadata:
  class MCoreSavePlan (line 94) | class MCoreSavePlan:
  function register_default_torch_strategies (line 100) | def register_default_torch_strategies():
  function flatten_state_dict (line 113) | def flatten_state_dict(
  function sharded_tensor_to_torch_sharded_tensor (line 141) | def sharded_tensor_to_torch_sharded_tensor(
  function mcore_to_pyt_state_dict (line 248) | def mcore_to_pyt_state_dict(
  function _unwrap_pyt_sharded_tensor (line 338) | def _unwrap_pyt_sharded_tensor(
  function _replace_state_dict_keys_with_sharded_keys (line 363) | def _replace_state_dict_keys_with_sharded_keys(
  function _replace_sharded_keys_with_state_dict_keys (line 380) | def _replace_sharded_keys_with_state_dict_keys(
  function _restore_dict_types (line 395) | def _restore_dict_types(x: Union[dict, list, Any], keys_template: Union[...
  class MCoreSavePlanner (line 410) | class MCoreSavePlanner(DefaultSavePlanner):
    method __init__ (line 421) | def __init__(
    method create_local_plan (line 442) | def create_local_plan(self) -> SavePlan:
    method create_decentralized_global_plan (line 462) | def create_decentralized_global_plan(self, local_plan: SavePlan) -> Sa...
    method transform_object (line 479) | def transform_object(self, write_item: WriteItem, object: Any):
  class MCoreLoadPlanner (line 484) | class MCoreLoadPlanner(DefaultLoadPlanner):
    method __init__ (line 491) | def __init__(
    method _validate_global_shapes (line 503) | def _validate_global_shapes(self, metadata, sharded_tensors):
    method _temporarily_bypass_shape_validation (line 521) | def _temporarily_bypass_shape_validation(self):
    method create_local_plan (line 545) | def create_local_plan(self) -> LoadPlan:
    method resolve_tensor (line 554) | def resolve_tensor(self, read_item: ReadItem):
    method commit_tensor (line 576) | def commit_tensor(self, read_item: ReadItem, tensor: torch.Tensor) -> ...
  class TorchDistSaveShardedStrategy (line 589) | class TorchDistSaveShardedStrategy(AsyncSaveShardedStrategy):
    method __init__ (line 597) | def __init__(
    method async_save (line 643) | def async_save(
    method _get_save_and_finalize_callbacks (line 745) | def _get_save_and_finalize_callbacks(self, writer, save_state_dict_ret...
    method can_handle_sharded_objects (line 754) | def can_handle_sharded_objects(self):
  function _get_filesystem_reader (line 758) | def _get_filesystem_reader(
  class TorchDistLoadShardedStrategy (line 771) | class TorchDistLoadShardedStrategy(LoadShardedStrategy):
    method __init__ (line 774) | def __init__(self, cache_metadata: bool = False):
    method load (line 779) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P...
    method load_tensors_metadata (line 835) | def load_tensors_metadata(self, checkpoint_dir: Path, metadata: Metada...
    method load_sharded_metadata (line 853) | def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateD...
    method remove_sharded_tensors (line 868) | def remove_sharded_tensors(self, checkpoint_dir: str, key_prefix: str):
    method can_handle_sharded_objects (line 952) | def can_handle_sharded_objects(self):
    method check_backend_compatibility (line 955) | def check_backend_compatibility(self, loaded_version):
    method check_version_compatibility (line 958) | def check_version_compatibility(self, loaded_version):

FILE: megatron/core/dist_checkpointing/tensor_aware_state_dict.py
  class MCoreTensorAwareStateDict (line 48) | class MCoreTensorAwareStateDict(TensorAwareStateDict):
    method _validate_params (line 61) | def _validate_params(algo):
    method _get_distribution (line 68) | def _get_distribution(
    method _remove_redundant_data (line 86) | def _remove_redundant_data(
    method from_state_dict (line 101) | def from_state_dict(
    method is_hollow (line 155) | def is_hollow(self):
    method _sharded_tensors (line 162) | def _sharded_tensors(self):
    method tensors (line 179) | def tensors(self) -> Iterator[torch.Tensor]:
    method common_state_dict (line 187) | def common_state_dict(self) -> Dict:
    method pop_tensors (line 193) | def pop_tensors(self) -> List[torch.Tensor]:
    method insert_tensors (line 213) | def insert_tensors(self, tensor_data: Iterable[torch.Tensor]):
    method init_tensors (line 230) | def init_tensors(self):
    method copy_tensors_to_cpu (line 245) | def copy_tensors_to_cpu(self, non_blocking=False):
    method restore_tensor_device (line 264) | def restore_tensor_device(self, non_blocking=True):
    method _insert_sharded_data (line 276) | def _insert_sharded_data(
    method to_state_dict (line 325) | def to_state_dict(

FILE: megatron/core/dist_checkpointing/utils.py
  function zip_strict (line 25) | def zip_strict(*args):
  function _sharded_tensor_shard_id (line 37) | def _sharded_tensor_shard_id(sharded_tensor: ShardedTensor) -> _ShardId:
  function _sharded_object_id (line 55) | def _sharded_object_id(sharded_object: ShardedObject) -> _ShardId:
  function extract_sharded_tensors (line 68) | def extract_sharded_tensors(
  function extract_sharded_tensors_and_factories (line 86) | def extract_sharded_tensors_and_factories(
  function extract_sharded_tensors_or_nonpersistent (line 107) | def extract_sharded_tensors_or_nonpersistent(
  function extract_sharded_base (line 129) | def extract_sharded_base(
  function extract_nonpersistent (line 145) | def extract_nonpersistent(
  function add_prefix_for_sharding (line 165) | def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix...
  function replace_prefix_for_sharding (line 184) | def replace_prefix_for_sharding(
  function apply_prefix_mapping (line 210) | def apply_prefix_mapping(sharded_state_dict: ShardedStateDict, prefix_ma...
  function force_all_tensors_to_non_fp8 (line 236) | def force_all_tensors_to_non_fp8(sharded_state_dict: ShardedStateDict):
  function logger_stack (line 255) | def logger_stack(name: Optional[str] = None, current_logger: Optional[lo...
  function debug_time (line 293) | def debug_time(
  function debug_msg (line 318) | def debug_msg(msg: str):
  function _clean_metadata_for_serialization (line 335) | def _clean_metadata_for_serialization(metadata: dict) -> dict:

FILE: megatron/core/dist_checkpointing/validation.py
  class StrictHandling (line 44) | class StrictHandling(Enum):
    method requires_explicit_ckpt_mismatch_check (line 86) | def requires_explicit_ckpt_mismatch_check(val: "StrictHandling") -> bool:
    method requires_global_app_metadata (line 91) | def requires_global_app_metadata(val: "StrictHandling") -> bool:
    method requires_returning_mismatch_keys (line 101) | def requires_returning_mismatch_keys(val: "StrictHandling") -> bool:
  function parse_strict_flag (line 106) | def parse_strict_flag(strict: Union[str, StrictHandling]) -> StrictHandl...
  function validate_integrity_and_strict_load (line 124) | def validate_integrity_and_strict_load(
  function verify_checkpoint_and_load_strategy (line 202) | def verify_checkpoint_and_load_strategy(
  function adjust_non_strict_load (line 268) | def adjust_non_strict_load(
  function _determine_missing_and_unexpected_keys (line 289) | def _determine_missing_and_unexpected_keys(
  function maybe_report_missing_and_unexpected_keys (line 337) | def maybe_report_missing_and_unexpected_keys(
  function _validate_common_state_dict (line 381) | def _validate_common_state_dict(common_state_dict: CommonStateDict) -> N...
  function validate_sharding_integrity (line 415) | def validate_sharding_integrity(
  function _validate_sharding_for_key (line 458) | def _validate_sharding_for_key(
  function _compute_shards_access (line 500) | def _compute_shards_access(rank_sharding):
  function _validate_objects_for_key (line 510) | def _validate_objects_for_key(sharded_objects: List[ShardedObject]) -> L...
  function determine_global_metadata (line 530) | def determine_global_metadata(
  function validate_sharded_objects_handling (line 547) | def validate_sharded_objects_handling(

FILE: megatron/core/distributed/data_parallel_base.py
  class _BaseDataParallel (line 11) | class _BaseDataParallel(MegatronModule):
    method __init__ (line 14) | def __init__(self, config: TransformerConfig, module: torch.nn.Module):
    method forward (line 18) | def forward(self, *inputs, **kwargs):
    method no_sync (line 25) | def no_sync(self):
    method start_grad_sync (line 34) | def start_grad_sync(self, *unused):
    method scale_gradients (line 45) | def scale_gradients(self, scaling_factor: float) -> None:
    method finish_grad_sync (line 49) | def finish_grad_sync(self):
    method zero_grad_buffer (line 60) | def zero_grad_buffer(self):
    method broadcast_params (line 67) | def broadcast_params(self):
    method state_dict (line 73) | def state_dict(self, prefix='', keep_vars=False, destination=None):
    method state_dict_for_save_checkpoint (line 84) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 90) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/core/distributed/distributed_data_parallel.py
  class DistributedDataParallel (line 22) | class DistributedDataParallel(_BaseDataParallel):
    method __init__ (line 41) | def __init__(
    method enable_forward_pre_hook (line 359) | def enable_forward_pre_hook(self):
    method disable_forward_pre_hook (line 371) | def disable_forward_pre_hook(self, param_sync: bool = True):
    method _make_forward_pre_hook (line 388) | def _make_forward_pre_hook(self):
    method _make_backward_post_hook (line 424) | def _make_backward_post_hook(self, param: torch.nn.Parameter):
    method no_sync (line 455) | def no_sync(self):
    method start_param_sync (line 467) | def start_param_sync(self, *unused, force_sync: bool = False, force_di...
    method start_grad_sync (line 525) | def start_grad_sync(self, *unused):
    method finish_grad_sync (line 537) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
    method free_overlap_buffers (line 549) | def free_overlap_buffers(self):
    method scale_gradients (line 554) | def scale_gradients(self, scaling_factor: float):
    method zero_grad_buffer (line 559) | def zero_grad_buffer(self):
    method broadcast_params (line 575) | def broadcast_params(self):
    method offload_grad_buffers (line 592) | def offload_grad_buffers(self, synchronize: bool = True, empty_cache: ...
    method restore_grad_buffers (line 613) | def restore_grad_buffers(self, synchronize: bool = True) -> None:

FILE: megatron/core/distributed/distributed_data_parallel_config.py
  class DistributedDataParallelConfig (line 10) | class DistributedDataParallelConfig:
    method __post_init__ (line 194) | def __post_init__(self):

FILE: megatron/core/distributed/finalize_model_grads.py
  function _get_main_grad_attr (line 34) | def _get_main_grad_attr(param: torch.nn.Parameter):
  function _unshard_if_dtensor (line 40) | def _unshard_if_dtensor(tensor: Union[torch.Tensor, "DTensor"]) -> torch...
  function _reshard_if_dtensor (line 60) | def _reshard_if_dtensor(
  function _allreduce_conditional_embedding_grads (line 89) | def _allreduce_conditional_embedding_grads(
  function _get_shared_word_embedding_weight (line 132) | def _get_shared_word_embedding_weight(
  function _get_position_embedding_weight (line 151) | def _get_position_embedding_weight(model_module: torch.nn.Module) -> tor...
  function _allreduce_word_embedding_grads (line 164) | def _allreduce_word_embedding_grads(
  function _allreduce_embedding_grad (line 204) | def _allreduce_embedding_grad(
  function _allreduce_position_embedding_grads (line 262) | def _allreduce_position_embedding_grads(
  function reset_model_temporary_tensors (line 278) | def reset_model_temporary_tensors(config: TransformerConfig, model: List...
  function _update_router_expert_bias (line 293) | def _update_router_expert_bias(model: List[torch.nn.Module], config: Tra...
  function _allreduce_non_tensor_model_parallel_grads (line 322) | def _allreduce_non_tensor_model_parallel_grads(
  function finalize_model_grads (line 400) | def finalize_model_grads(

FILE: megatron/core/distributed/fsdp/mcore_fsdp_adapter.py
  class FullyShardedDataParallel (line 62) | class FullyShardedDataParallel(_BaseDataParallel):
    method __init__ (line 67) | def __init__(
    method load_state_dict (line 163) | def load_state_dict(self, state_dict, strict=True):
    method _fix_tensor_parallel_attributes (line 185) | def _fix_tensor_parallel_attributes(self, module):
    method _init_dist_index (line 223) | def _init_dist_index(self, pg_collection):
    method stop_communication (line 344) | def stop_communication(self):
    method sync_rng_states_across_tp_group (line 351) | def sync_rng_states_across_tp_group(self):
  function _get_hsdp_tp_mesh (line 366) | def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group, ep_siz...
  function _get_dp_tp_mesh (line 435) | def _get_dp_tp_mesh(dp_cp_group, tp_group, ep_size=1):
  function _check_mesh_ranks_and_group_ranks_are_consistent (line 477) | def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_r...
  function _get_rng_state_dict (line 491) | def _get_rng_state_dict():
  function _load_rng_state_dict (line 502) | def _load_rng_state_dict(rng_state_dict):

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py
  class DistributedDataParallelConfig (line 10) | class DistributedDataParallelConfig:
    method __post_init__ (line 148) | def __post_init__(self):

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py
  class ShardingStrategy (line 43) | class ShardingStrategy(IntEnum):
  function experimental_api (line 63) | def experimental_api(func: Callable) -> Callable:
  function fully_shard_model (line 75) | def fully_shard_model(
  function fully_shard_optimizer (line 408) | def fully_shard_optimizer(
  function fully_shard (line 614) | def fully_shard(

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py
  class TrainingState (line 61) | class TrainingState(Enum):
  class MegatronFSDP (line 76) | class MegatronFSDP(torch.nn.Module):
    method __init__ (line 171) | def __init__(
    method _check_module_parameter_types (line 319) | def _check_module_parameter_types(self):
    method _init_fsdp_param_and_grad_buffer (line 333) | def _init_fsdp_param_and_grad_buffer(self):
    method _import_class_from_path (line 415) | def _import_class_from_path(self, class_path: str):
    method all_gather_and_wait_parameters_ready (line 422) | def all_gather_and_wait_parameters_ready(
    method _register_fsdp_hooks (line 477) | def _register_fsdp_hooks(self, root_module):
    method no_sync (line 1059) | def no_sync(self):
    method sync (line 1073) | def sync(self):
    method set_model_auto_sync (line 1084) | def set_model_auto_sync(self, sync_model: bool = True):
    method get_distributed_index (line 1127) | def get_distributed_index(self) -> FSDPDistributedIndex:
    method mixed_precision_context (line 1135) | def mixed_precision_context(self, mixed_precision_policy: MixedPrecisi...
    method reset_mixed_precision_policy (line 1147) | def reset_mixed_precision_policy(self, mixed_precision_policy: MixedPr...
    method start_param_sync (line 1167) | def start_param_sync(self, *unused, force_sync: bool = False, force_di...
    method start_grad_sync (line 1205) | def start_grad_sync(self, *unused):
    method synchronize_param_gather (line 1222) | def synchronize_param_gather(self):
    method synchronize_gradient_reduce (line 1229) | def synchronize_gradient_reduce(self):
    method attach_grad_to_optimizer_state (line 1242) | def attach_grad_to_optimizer_state(self):
    method finish_grad_sync (line 1249) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
    method _replace_param_with_distributed_if_needed (line 1281) | def _replace_param_with_distributed_if_needed(self):
    method _replace_param_with_raw_if_needed (line 1300) | def _replace_param_with_raw_if_needed(self):
    method _reestablish_shared_weights (line 1314) | def _reestablish_shared_weights(self, old_params, new_params):
    method scale_gradients (line 1348) | def scale_gradients(self, scaling_factor: float):
    method zero_grad_buffer (line 1352) | def zero_grad_buffer(self):
    method install_optimized_model_weights (line 1362) | def install_optimized_model_weights(self):
    method broadcast_params (line 1369) | def broadcast_params(self):
    method forward (line 1385) | def forward(self, *inputs, **kwargs):
  class RegisterFSDPBackwardFunction (line 1396) | class RegisterFSDPBackwardFunction(torch.autograd.Function):
    method forward (line 1404) | def forward(ctx, post_backward, *inputs: torch.Tensor):
    method backward (line 1412) | def backward(ctx, *grads: torch.Tensor):
  function _replace_module_parameter (line 1420) | def _replace_module_parameter(module, name, new_param):

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py
  function local_multi_tensor_applier (line 121) | def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args):
  function local_multi_tensor_scale (line 125) | def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale):
  function _multi_tensor_copy_this_to_that (line 133) | def _multi_tensor_copy_this_to_that(
  function is_te_min_version (line 162) | def is_te_min_version(vers, check_equality=True):
  function is_float8tensor (line 173) | def is_float8tensor(tensor: torch.Tensor) -> bool:
  function is_blockwise_float8tensor (line 178) | def is_blockwise_float8tensor(tensor: torch.Tensor) -> bool:
  function fp8_need_transpose_data (line 183) | def fp8_need_transpose_data(tensor: torch.Tensor) -> bool:
  function fp8_need_transpose_data_for_meta_device_init (line 188) | def fp8_need_transpose_data_for_meta_device_init(module: TransformerEngi...
  function fp8_discard_transpose_cache (line 193) | def fp8_discard_transpose_cache(tensor: torch.Tensor) -> None:
  function fp8_create_transpose_cache (line 204) | def fp8_create_transpose_cache(tensors: List[torch.Tensor]) -> None:
  function _fp8_create_transpose_cache_fallback (line 212) | def _fp8_create_transpose_cache_fallback(tensors: List[torch.Tensor]) ->...
  function fp8_set_raw_data (line 223) | def fp8_set_raw_data(tensor: torch.Tensor, data: torch.Tensor, set_trans...
  function fp8_get_raw_data (line 244) | def fp8_get_raw_data(tensor: torch.Tensor, get_transpose: bool = False) ...
  function fp8_dequantize (line 257) | def fp8_dequantize(tensor: torch.Tensor) -> torch.Tensor:
  function fp8_quantize (line 266) | def fp8_quantize(
  function _fp8_quantize_fallback (line 288) | def _fp8_quantize_fallback(
  function get_quantized_model_init_context_cls (line 353) | def get_quantized_model_init_context_cls():
  class MixedPrecisionPolicy (line 366) | class MixedPrecisionPolicy:

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
  function _p_assert (line 107) | def _p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> ...
  function _alloc_storage (line 118) | def _alloc_storage(tensor: torch.Tensor, size: torch.Size) -> None:
  function _free_storage (line 138) | def _free_storage(tensor: torch.Tensor):
  class MultiGroupUBRAllocator (line 170) | class MultiGroupUBRAllocator:
    method __init__ (line 200) | def __init__(self, pool, groups):  # torch.cuda.MemPool  # torch.distr...
    method __enter__ (line 206) | def __enter__(self):
    method __exit__ (line 218) | def __exit__(self, *args):
  class BucketingPolicy (line 232) | class BucketingPolicy:
  function _pad (line 252) | def _pad(number_to_be_padded: int, divisor: int) -> int:
  function build_data_parallel_buffer_index (line 256) | def build_data_parallel_buffer_index(
  function _get_dp_buffer_shard_bucket_index (line 384) | def _get_dp_buffer_shard_bucket_index(
  class Bucket (line 444) | class Bucket:
  class TemporaryBucketAllocator (line 461) | class TemporaryBucketAllocator:
    method __init__ (line 497) | def __init__(self):
    method allocate (line 500) | def allocate(
    method free (line 515) | def free(self, bucket_id: int):
  class StorageResizeBasedBucketAllocator (line 524) | class StorageResizeBasedBucketAllocator(TemporaryBucketAllocator):
    method __init__ (line 530) | def __init__(self):
    method allocate (line 533) | def allocate(
    method free (line 550) | def free(self, bucket_id: int):
  class RotaryBucketAllocator (line 558) | class RotaryBucketAllocator(TemporaryBucketAllocator):
    method __init__ (line 589) | def __init__(self, name: str):
    method allocate (line 596) | def allocate(
    method _get_gbuf_name (line 630) | def _get_gbuf_name(self, buffer_id: int):
    method free (line 633) | def free(self, bucket_id: int):
  class FixedPoolAllocator (line 642) | class FixedPoolAllocator(TemporaryBucketAllocator):
    method __init__ (line 652) | def __init__(
    method _is_two_bucket_group_equal (line 729) | def _is_two_bucket_group_equal(self, group_a, group_b):
    method allocate (line 743) | def allocate(
    method _get_gbuf_name (line 806) | def _get_gbuf_name(self, buf_group_id: int, bucket_index: int):
    method free (line 809) | def free(self, bucket_id: int):
  class DataParallelBuffer (line 828) | class DataParallelBuffer:
    method __init__ (line 846) | def __init__(
    method init_data (line 932) | def init_data(self, data: torch.Tensor):
    method fetch_bucket (line 942) | def fetch_bucket(
    method allocate_bucket_storage (line 987) | def allocate_bucket_storage(
    method free_bucket_storage (line 1043) | def free_bucket_storage(self):
    method reset_param_main_grad (line 1049) | def reset_param_main_grad(self):
    method _get_item_slice_in_shard (line 1058) | def _get_item_slice_in_shard(self, item_id: int) -> Tuple[int, int]:
    method locate_item_in_global_item (line 1106) | def locate_item_in_global_item(self, item_id: int) -> Tuple[int, int]:
    method _get_item_local_shard_index (line 1127) | def _get_item_local_shard_index(self, item_id: int) -> Tuple[int, int]:
    method _get_item_local_index (line 1171) | def _get_item_local_index(self, item_id: int) -> Tuple[int, int]:
    method set_item (line 1188) | def set_item(self, item_id: int, item_data: torch.Tensor) -> None:
    method get_item (line 1223) | def get_item(self, item_id: int, only_shard: bool = False) -> torch.Te...
    method get_item_from_bucket (line 1257) | def get_item_from_bucket(self, bucket: Bucket, item_id: int):
    method get_shard_from_bucket (line 1268) | def get_shard_from_bucket(self, bucket: Bucket):
    method get_shard_from_local_buffer (line 1278) | def get_shard_from_local_buffer(self) -> torch.Tensor:
  class ParameterGroup (line 1290) | class ParameterGroup:
  function _get_parameter_groups (line 1349) | def _get_parameter_groups(
  class ParamAndGradBuffer (line 1583) | class ParamAndGradBuffer:
    method __init__ (line 1625) | def __init__(
    method get_mem_alloc_context (line 1770) | def get_mem_alloc_context(self, groups=None, symmetric=True):
    method manual_buffer_registration (line 1834) | def manual_buffer_registration(self):
    method _log_parameter_groups (line 1869) | def _log_parameter_groups(self):
    method _init_each_parameter_group_buffers (line 1918) | def _init_each_parameter_group_buffers(self, meta_device_init_fp8_para...
    method _reset_parameters (line 2615) | def _reset_parameters(self, old_params, new_params):
    method scale_gradients (line 2659) | def scale_gradients(self, scaling_factor: float) -> None:
    method zero_grad (line 2667) | def zero_grad(self):
    method _init_distributed_params (line 2685) | def _init_distributed_params(self):
    method _init_optimizer_named_parameters (line 2756) | def _init_optimizer_named_parameters(self) -> List[Tuple[str, torch.nn...
    method update_main_grads (line 2813) | def update_main_grads(self):
    method num_buckets (line 2879) | def num_buckets(self):
    method copy_main_weights_to_model_weights (line 2884) | def copy_main_weights_to_model_weights(self):
    method copy_model_weights_to_main_weights (line 3055) | def copy_model_weights_to_main_weights(self):
    method all_gather_parameters (line 3073) | def all_gather_parameters(self, async_op: bool = True):
    method reduce_scatter_gradients (line 3104) | def reduce_scatter_gradients(self, async_op: bool = True):
    method all_reduce_gradients (line 3140) | def all_reduce_gradients(self, async_op: bool = False):
  class BucketStatus (line 3176) | class BucketStatus(Enum):
  class GradReducePipeline (line 3191) | class GradReducePipeline:
    method __init__ (line 3196) | def __init__(
    method num_buckets (line 3227) | def num_buckets(self):
    method reset (line 3231) | def reset(self):
    method reduce_gradients (line 3251) | def reduce_gradients(
    method wait_for_previous_grad_reduce (line 3295) | def wait_for_previous_grad_reduce(
    method _enforce_double_buffer_limit (line 3327) | def _enforce_double_buffer_limit(self, add_buckets):
    method get_ready_bucket_group_for_reduction (line 3349) | def get_ready_bucket_group_for_reduction(self, bucket_id: int) -> Opti...
    method get_fsdp_buffer (line 3375) | def get_fsdp_buffer(self, bucket_id: int) -> DataParallelBuffer:
    method _bucket_group_gradient_reduce (line 3382) | def _bucket_group_gradient_reduce(
  class PrefetchOrder (line 3634) | class PrefetchOrder(Enum):
  class AllGatherPipeline (line 3647) | class AllGatherPipeline:
    method __init__ (line 3652) | def __init__(
    method get_bucket_key (line 3696) | def get_bucket_key(self, bucket_id, bwd):
    method num_buckets (line 3704) | def num_buckets(self):
    method reset (line 3708) | def reset(self):
    method all_gather_params (line 3737) | def all_gather_params(
    method wait_bucket_ready (line 3922) | def wait_bucket_ready(self, bucket_id, bwd, empty_ok=False):
    method release_bucket (line 3942) | def release_bucket(self, bucket_id, bwd, lazy: bool = False):
    method recycle_unused_buckets (line 3995) | def recycle_unused_buckets(self):
    method get_fsdp_buffer (line 4003) | def get_fsdp_buffer(self, bucket_id: int, bwd=False) -> DataParallelBu...
    method async_bucket_gather (line 4020) | def async_bucket_gather(self, bucket_id, bwd) -> None:
  function gradient_reduce_preprocessing (line 4067) | def gradient_reduce_preprocessing(grad_data, scaling_factor, ddp_config):
  function _check_nan_in_grad (line 4092) | def _check_nan_in_grad(grad: torch.Tensor):
  function check_gpu_memory (line 4104) | def check_gpu_memory(threshold=0.9):
  class ResetParametersContext (line 4134) | class ResetParametersContext:
    method __init__ (line 4139) | def __init__(self, init_param_with_fp8=False, with_cuda_rng_tracker=Fa...
    method __enter__ (line 4143) | def __enter__(self):
    method __exit__ (line 4177) | def __exit__(self, *exc_details):
  function override_sharded_param_methods_with_safety_checks (line 4181) | def override_sharded_param_methods_with_safety_checks(params, all_gather...
  function _dtype_size (line 4221) | def _dtype_size(dtype: torch.dtype) -> int:
  function to_local_if_dtensor (line 4252) | def to_local_if_dtensor(tensor):
  function _get_fsdp_tensor_spec (line 4265) | def _get_fsdp_tensor_spec(
  function make_fsdp_dtensor (line 4341) | def make_fsdp_dtensor(

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
  function gather_and_compute_chunk_metadata (line 31) | def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageM...
  function update_uneven_dtensor_chunk_metadata (line 98) | def update_uneven_dtensor_chunk_metadata(dtensor: DTensor) -> dict:
  function validate_uneven_dtensor (line 141) | def validate_uneven_dtensor(dtensor: DTensor) -> None:
  function filter_unflattened_state_dict (line 208) | def filter_unflattened_state_dict(state_dict, key_chain=[], visit_condit...
  function get_unflattened_state_dict (line 227) | def get_unflattened_state_dict(state_dict, key_chain=[]):
  function preprocess_state_dict_for_uneven_dtensor (line 240) | def preprocess_state_dict_for_uneven_dtensor(state_dict: dict) -> dict:
  function gather_uneven_dtensor_to_full_tensor (line 258) | def gather_uneven_dtensor_to_full_tensor(
  function _assemble_full_tensor_from_uneven_chunks (line 333) | def _assemble_full_tensor_from_uneven_chunks(
  function _intersection (line 402) | def _intersection(s1, s2):
  function _offset_slice (line 411) | def _offset_slice(s, offset):
  function split_dtensor (line 415) | def split_dtensor(

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py
  function get_te_version (line 59) | def get_te_version():
  function is_te_min_version (line 76) | def is_te_min_version(vers, check_equality=True):
  function is_submodule (line 88) | def is_submodule(module, parent_module, strict=True):
  function get_mesh_names (line 101) | def get_mesh_names(
  function contains_submesh (line 147) | def contains_submesh(
  function _get_cuda_rng_state (line 162) | def _get_cuda_rng_state(
  function _set_cuda_rng_state (line 193) | def _set_cuda_rng_state(new_state: torch.Tensor, device: int = -1, graph...
  function initialize_rng_tracker (line 235) | def initialize_rng_tracker(
  function get_cuda_rng_tracker (line 427) | def get_cuda_rng_tracker(
  function safe_get_rank (line 437) | def safe_get_rank() -> int:
  function log_single_rank (line 457) | def log_single_rank(logger_: logging.Logger, level: int, msg: str, *args...
  class FSDPDistributedIndex (line 465) | class FSDPDistributedIndex:
    method __init__ (line 474) | def __init__(
    method get_submesh (line 627) | def get_submesh(
    method get_dp_group (line 671) | def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup:
    method get_fsdp_group (line 681) | def get_fsdp_group(
    method get_outer_fsdp_group (line 691) | def get_outer_fsdp_group(self, is_expert_parallel: bool = False) -> Pr...
    method get_root_mesh (line 699) | def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh:
    method get_logical_hybrid_fsdp_rank (line 708) | def get_logical_hybrid_fsdp_rank(self, is_expert_parallel: bool = False):
  class GlobalMemoryBuffer (line 752) | class GlobalMemoryBuffer:
    method __init__ (line 757) | def __init__(self):
    method get_tensor (line 760) | def get_tensor(self, tensor_shape, dtype, name, mem_alloc_context: Opt...
  function get_global_memory_buffer (line 781) | def get_global_memory_buffer():
  function create_updated_function_signature (line 789) | def create_updated_function_signature(original_function, **extended_kwar...
  function is_mcore_tensor_model_parallel (line 813) | def is_mcore_tensor_model_parallel(param: torch.Tensor) -> bool:
  function is_mcore_tensor_parallel_duplicated (line 820) | def is_mcore_tensor_parallel_duplicated(param: torch.Tensor) -> bool:
  function get_mcore_tensor_parallel_partition_dim (line 827) | def get_mcore_tensor_parallel_partition_dim(param: torch.Tensor) -> Opti...

FILE: megatron/core/distributed/param_and_grad_buffer.py
  class BufferType (line 48) | class BufferType(Enum):
  function shard_buffer (line 57) | def shard_buffer(buffer: torch.Tensor, data_parallel_world_size: int):
  class _ParamAndGradBucket (line 69) | class _ParamAndGradBucket:
    method __init__ (line 87) | def __init__(
    method set_layerwise_params_list (line 122) | def set_layerwise_params_list(self, layerwise_params_list: List[List[t...
  class _LayerwiseAllGatherHandle (line 136) | class _LayerwiseAllGatherHandle:
    method __init__ (line 143) | def __init__(self, handles):
    method wait (line 146) | def wait(self):
  class _ParamAndGradBucketGroup (line 153) | class _ParamAndGradBucketGroup:
    method __init__ (line 168) | def __init__(
    method reset (line 242) | def reset(self):
    method check_grads (line 254) | def check_grads(self, check_for_nan_or_inf, check_for_large):
    method start_param_sync (line 292) | def start_param_sync(self, force_sync: bool = False):
    method finish_param_sync (line 427) | def finish_param_sync(self, skip_next_bucket_dispatch: bool = False):
    method start_grad_sync (line 515) | def start_grad_sync(self, force_all_reduce: Optional[bool] = False):
    method finish_grad_sync (line 658) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
    method free_overlap_buffers (line 690) | def free_overlap_buffers(self):
    method register_grad_ready (line 705) | def register_grad_ready(
  class _ParamAndGradBuffer (line 730) | class _ParamAndGradBuffer:
    method __init__ (line 752) | def __init__(
    method scale_gradients (line 1060) | def scale_gradients(self, scaling_factor: float) -> None:
    method _get (line 1064) | def _get(self, shape: torch.Size, start_index: int, buffer_type: Buffe...
    method _new_bucket (line 1081) | def _new_bucket(
    method reset (line 1125) | def reset(self):
    method offload_to_cpu (line 1131) | def offload_to_cpu(self, move_params: bool = True, move_grads: bool = ...
    method reload_from_cpu (line 1146) | def reload_from_cpu(self, move_params: bool = True, move_grads: bool =...
  function partition_buckets (line 1164) | def partition_buckets(

FILE: megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py
  class _ReduceScatterWithFP32AccumulationWorkHandle (line 9) | class _ReduceScatterWithFP32AccumulationWorkHandle:
    method __init__ (line 13) | def __init__(
    method wait (line 26) | def wait(self):
  function reduce_scatter_with_fp32_accumulation (line 42) | def reduce_scatter_with_fp32_accumulation(

FILE: megatron/core/distributed/torch_fully_sharded_data_parallel.py
  class TorchFullyShardedDataParallel (line 28) | class TorchFullyShardedDataParallel(_BaseDataParallel):
    method __init__ (line 55) | def __init__(
    method load_state_dict (line 150) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/core/distributed/torch_fully_sharded_data_parallel_config.py
  class TorchFullyShardedDataParallelConfig (line 10) | class TorchFullyShardedDataParallelConfig(DistributedDataParallelConfig):

FILE: megatron/core/energy_monitor.py
  class EnergyMonitor (line 22) | class EnergyMonitor:
    method __init__ (line 30) | def __init__(self) -> None:
    method setup (line 37) | def setup(self) -> None:
    method shutdown (line 43) | def shutdown(self) -> None:
    method pause (line 48) | def pause(self) -> None:
    method resume (line 54) | def resume(self) -> None:
    method _get_energy (line 59) | def _get_energy(self) -> int:
    method lap (line 66) | def lap(self) -> float:
    method get_total (line 83) | def get_total(self) -> float:

FILE: megatron/core/enums.py
  class ModelType (line 6) | class ModelType(enum.Enum):
  class Fp8Recipe (line 12) | class Fp8Recipe(str, enum.Enum):
  class Fp4Recipe (line 22) | class Fp4Recipe(str, enum.Enum):

FILE: megatron/core/export/export_config.py
  class ExportConfig (line 9) | class ExportConfig:
    method __post_init__ (line 23) | def __post_init__(self):

FILE: megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py
  class TRTLLMEngineBuilder (line 19) | class TRTLLMEngineBuilder:
    method build_and_save_engine (line 23) | def build_and_save_engine(

FILE: megatron/core/export/trtllm/trtllm_helper.py
  class TRTLLMHelper (line 39) | class TRTLLMHelper:
    method __init__ (line 42) | def __init__(
    method _get_trtllm_config (line 110) | def _get_trtllm_config(
    method _load_scaling_factors (line 210) | def _load_scaling_factors(self, model_state_dict: dict) -> dict:
    method get_trtllm_pretrained_config_and_model_weights (line 264) | def get_trtllm_pretrained_config_and_model_weights(
    method _add_scales_to_converter (line 352) | def _add_scales_to_converter(
    method _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting (line 377) | def _get_trtllm_pretrained_config_and_model_weights_in_distributed_set...
    method _get_trtllm_pretrained_config_and_model_weights_list_on_single_device (line 451) | def _get_trtllm_pretrained_config_and_model_weights_list_on_single_dev...
    method build_and_save_engine (line 532) | def build_and_save_engine(

FILE: megatron/core/export/trtllm/trtllm_layers.py
  class TRTLLMLayers (line 8) | class TRTLLMLayers(Enum):
    method return_layer_name_and_number (line 56) | def return_layer_name_and_number(layer_name: str) -> Tuple[str, int]:
    method rename_input_layer_names_to_trtllm_layer_names (line 80) | def rename_input_layer_names_to_trtllm_layer_names(
  function get_layer_name_without_prefix (line 157) | def get_layer_name_without_prefix(layer: TRTLLMLayers) -> str:

FILE: megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py
  function str_dtype_to_torch (line 23) | def str_dtype_to_torch(dtype: DataType):
  class DistributedTRTLLMModelWeightsConverter (line 31) | class DistributedTRTLLMModelWeightsConverter:
    method __init__ (line 37) | def __init__(
    method _add_to_trtllm_model_weights (line 82) | def _add_to_trtllm_model_weights(self, val: torch.Tensor, layer_name: ...
    method _convert_transformer_layer (line 100) | def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor):
    method _convert_non_transformer_layer (line 195) | def _convert_non_transformer_layer(self, model_state_dict: dict, layer...
    method _get_remove_vocab_padding (line 209) | def _get_remove_vocab_padding(self, layer_name, model_state_dict, toke...
    method convert (line 236) | def convert(

FILE: megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py
  function pad_vocab_size (line 26) | def pad_vocab_size(vocab_size: int, tp_size: int):
  function str_dtype_to_torch (line 33) | def str_dtype_to_torch(dtype: DataType):
  class SingleDeviceTRTLLMModelWeightsConverter (line 40) | class SingleDeviceTRTLLMModelWeightsConverter:
    method __init__ (line 43) | def __init__(
    method _convert_non_transformer_layer (line 81) | def _convert_non_transformer_layer(self, model_state_dict: dict, layer...
    method _cast_value (line 95) | def _cast_value(self, val: torch.Tensor, layer_name: str) -> torch.Ten...
    method _convert_transformer_layer (line 114) | def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor):
    method convert (line 332) | def convert(
    method get_padded_vocab_size (line 405) | def get_padded_vocab_size(self) -> int:
    method get_local_model_weights_per_gpu (line 422) | def get_local_model_weights_per_gpu(self, mapping, trtllm_model_config...

FILE: megatron/core/export/trtllm/trtllm_weights_converter/utils.py
  function is_gated_activation (line 6) | def is_gated_activation(helper):

FILE: megatron/core/extensions/transformer_engine.py
  class TransformerEngineConfigType (line 85) | class TransformerEngineConfigType(enum.Enum):
  class TEQuantizationRecipe (line 92) | class TEQuantizationRecipe:
    method parse_from_config (line 128) | def parse_from_config(cls, quant_config: Dict[Any, Any]) -> "TEQuantiz...
    method get_config_keys (line 157) | def get_config_keys(cls) -> Set[str]:
  class TEQuantizationParams (line 163) | class TEQuantizationParams:
    method parse_from_config (line 175) | def parse_from_config(quant_config: QuantizationConfig) -> "TEQuantiza...
  function _get_fp8_autocast_for_quant_recipe (line 208) | def _get_fp8_autocast_for_quant_recipe(qrecipe: TEQuantizationRecipe):
  function _get_fp8_autocast_for_quant_params (line 259) | def _get_fp8_autocast_for_quant_params(qparams: TEQuantizationParams | N...
  function _get_should_context_be_quantized_recipe (line 268) | def _get_should_context_be_quantized_recipe(
  function _get_should_context_be_quantized_params (line 284) | def _get_should_context_be_quantized_params(
  function _get_extra_te_kwargs (line 299) | def _get_extra_te_kwargs(config: TransformerConfig):
  function condition_init_method (line 312) | def condition_init_method(config, init_method):
  function split_te_layernorm_column_parallel_linear (line 317) | def split_te_layernorm_column_parallel_linear(
  class TEActivationOp (line 400) | class TEActivationOp:
    method __new__ (line 406) | def __new__(cls, config: TransformerConfig):
  class TEFusedResidualRMSNorm (line 438) | class TEFusedResidualRMSNorm(te.pytorch.RMSNorm):
    method __init__ (line 453) | def __init__(self, *args, **kwargs):
    method _make_fused_impl (line 458) | def _make_fused_impl(self) -> te.pytorch.ops.Sequential:
    method _register_hooks_on_fused_impl (line 493) | def _register_hooks_on_fused_impl(self, fused_impl: torch.nn.Module) -...
    method forward (line 574) | def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, ...
  class TENorm (line 602) | class TENorm:
    method __new__ (line 617) | def __new__(
  class TELinear (line 661) | class TELinear(te.pytorch.Linear):
    method __init__ (line 676) | def __init__(
    method finish_init (line 849) | def finish_init(self, quantization_config: QuantizationConfig):
    method will_execute_quantized (line 856) | def will_execute_quantized(self, is_context_quantized: bool) -> bool:
    method forward (line 862) | def forward(self, x):
    method sharded_state_dict (line 880) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method backward_dw (line 898) | def backward_dw(self):
  class TELayerNormColumnParallelLinear (line 904) | class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear):
    method __init__ (line 908) | def __init__(
    method finish_init (line 1070) | def finish_init(self, quantization_config: QuantizationConfig):
    method will_execute_quantized (line 1077) | def will_execute_quantized(self, is_context_quantized: bool) -> bool:
    method forward (line 1083) | def forward(self, x):
    method sharded_state_dict (line 1102) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method extra_repr (line 1116) | def extra_repr(self) -> str:
    method backward_dw (line 1125) | def backward_dw(self):
  class TEColumnParallelLinear (line 1131) | class TEColumnParallelLinear(TELinear):
    method __init__ (line 1135) | def __init__(
    method sharded_state_dict (line 1213) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method extra_repr (line 1226) | def extra_repr(self) -> str:
    method backward_dw (line 1235) | def backward_dw(self):
  class TERowParallelLinear (line 1241) | class TERowParallelLinear(TELinear):
    method __init__ (line 1245) | def __init__(
    method sharded_state_dict (line 1317) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method extra_repr (line 1330) | def extra_repr(self) -> str:
    method backward_dw (line 1339) | def backward_dw(self):
  class TEDotProductAttention (line 1345) | class TEDotProductAttention(te.pytorch.DotProductAttention):
    method __init__ (line 1356) | def __init__(
    method forward (line 1542) | def forward(
    method sharded_state_dict (line 1645) | def sharded_state_dict(
  class TEGroupedLinear (line 1668) | class TEGroupedLinear(te.pytorch.GroupedLinear):
    method __init__ (line 1677) | def __init__(
    method finish_init (line 1874) | def finish_init(self, quantization_config: QuantizationConfig):
    method will_execute_quantized (line 1881) | def will_execute_quantized(self, is_context_quantized: bool) -> bool:
    method forward (line 1887) | def forward(self, x, m_splits):
    method _encode_extra_state (line 1905) | def _encode_extra_state(self, state):
    method _decode_extra_state (line 1916) | def _decode_extra_state(self, state):
    method _split_extra_state (line 1928) | def _split_extra_state(self, state):
    method _sharded_state_dict_grouped (line 1969) | def _sharded_state_dict_grouped(
    method backward_dw (line 2032) | def backward_dw(self):
  class TEColumnParallelGroupedLinear (line 2040) | class TEColumnParallelGroupedLinear(TEGroupedLinear):
    method __init__ (line 2046) | def __init__(
    method sharded_state_dict (line 2074) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
  class TERowParallelGroupedLinear (line 2086) | class TERowParallelGroupedLinear(TEGroupedLinear):
    method __init__ (line 2092) | def __init__(
    method sharded_state_dict (line 2120) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
  class TEFusedMLP (line 2138) | class TEFusedMLP(MLP):
    method __init__ (line 2142) | def __init__(self, *args, **kwargs):
    method _make_fused_impl (line 2148) | def _make_fused_impl(self) -> te.pytorch.ops.Sequential:
    method _make_activation_op (line 2274) | def _make_activation_op(
    method _register_hooks_on_fused_impl (line 2310) | def _register_hooks_on_fused_impl(self, fused_impl: torch.nn.Module) -...
    method forward (line 2396) | def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tens...
  class TEDelayedScaling (line 2423) | class TEDelayedScaling(te.common.recipe.DelayedScaling):
    method __init__ (line 2428) | def __init__(
  class TECudaRNGStatesTracker (line 2459) | class TECudaRNGStatesTracker(te.pytorch.distributed.CudaRNGStatesTracker):
    method __init__ (line 2463) | def __init__(self, is_inference_rng_tracker=False):
    method is_initialized (line 2474) | def is_initialized(self):
    method reset (line 2478) | def reset(self):
    method set_states (line 2483) | def set_states(self, states):
    method add (line 2488) | def add(self, name, seed):
  function te_checkpoint (line 2494) | def te_checkpoint(
  function get_cpu_offload_context (line 2534) | def get_cpu_offload_context(
  function fused_apply_rotary_pos_emb (line 2575) | def fused_apply_rotary_pos_emb(
  function fused_apply_rotary_pos_emb_thd (line 2596) | def fused_apply_rotary_pos_emb_thd(
  function te_parallel_cross_entropy (line 2683) | def te_parallel_cross_entropy(
  function te_general_gemm (line 2711) | def te_general_gemm(
  function set_save_original_input (line 2764) | def set_save_original_input(module):

FILE: megatron/core/extensions/transformer_engine_spec_provider.py
  class _TENormWithResidual (line 31) | class _TENormWithResidual:
    method __new__ (line 34) | def __new__(cls, *args, **kwargs):
  class TESpecProvider (line 38) | class TESpecProvider(BackendSpecProvider):
    method linear (line 41) | def linear(self) -> type:
    method column_parallel_linear (line 45) | def column_parallel_linear(self) -> type:
    method row_parallel_linear (line 49) | def row_parallel_linear(self) -> type:
    method fuse_layernorm_and_linear (line 53) | def fuse_layernorm_and_linear(self) -> bool:
    method column_parallel_layer_norm_linear (line 57) | def column_parallel_layer_norm_linear(self) -> Optional[type]:
    method layer_norm (line 61) | def layer_norm(
    method core_attention (line 73) | def core_attention(self) -> type:
    method grouped_mlp_modules (line 77) | def grouped_mlp_modules(
    method activation_func (line 102) | def activation_func(self) -> TEActivationFunctionBuilder | None:

FILE: megatron/core/fp4_utils.py
  function is_nvfp4tensor (line 46) | def is_nvfp4tensor(tensor: torch.Tensor) -> bool:
  function get_fp4_align_size (line 51) | def get_fp4_align_size(fp4_recipe: Fp4Recipe) -> int:
  function dequantize_fp4_tensor (line 83) | def dequantize_fp4_tensor(fp4_tensor: torch.Tensor) -> torch.Tensor:
  function get_fp4_recipe (line 94) | def get_fp4_recipe(config: TransformerConfig):
  function get_fp4_context (line 122) | def get_fp4_context(config: TransformerConfig, layer_no: int = -1, is_in...
  function get_fp4_recipe (line 167) | def get_fp4_recipe(config: TransformerConfig):
  function get_fp4_context (line 171) | def get_fp4_context(config: TransformerConfig, layer_no: int = -1, is_in...

FILE: megatron/core/fp8_utils.py
  function is_float8tensor (line 96) | def is_float8tensor(tensor: torch.Tensor) -> bool:
  function is_mxfp8tensor (line 108) | def is_mxfp8tensor(tensor: torch.Tensor) -> bool:
  function dequantize_fp8_tensor (line 113) | def dequantize_fp8_tensor(fp8_tensor: torch.Tensor) -> torch.Tensor:
  function _resolve_callable_from_python_import_path (line 121) | def _resolve_callable_from_python_import_path(dotted_path: str):
  function _get_custom_recipe (line 155) | def _get_custom_recipe(quantizer_factory_python_path: str) -> Union[Fp8R...
  function get_fp8_align_size (line 168) | def get_fp8_align_size(fp8_recipe: Fp8Recipe) -> int:
  function is_column_parallel_linear (line 176) | def is_column_parallel_linear(module):
  function is_row_parallel_linear (line 188) | def is_row_parallel_linear(module):
  function _modify_underlying_storage_impl (line 226) | def _modify_underlying_storage_impl(
  function _quantize_param_shard_impl (line 233) | def _quantize_param_shard_impl(
  function _correct_amax_history_if_needed_impl (line 267) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -...
  function _modify_underlying_storage_impl (line 275) | def _modify_underlying_storage_impl(
  function _quantize_param_shard_impl (line 284) | def _quantize_param_shard_impl(
  function _correct_amax_history_if_needed_impl (line 359) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -...
  function _modify_underlying_storage_impl (line 367) | def _modify_underlying_storage_impl(tensor: Float8Tensor, new_raw_data: ...
  function _quantize_param_shard_impl (line 374) | def _quantize_param_shard_impl(
  function _correct_amax_history_if_needed_impl (line 446) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -...
  function _modify_underlying_storage_impl (line 461) | def _modify_underlying_storage_impl(*args, **kwargs):
  function _quantize_param_shard_impl (line 464) | def _quantize_param_shard_impl(model_params, *args, **kwargs):
  function _correct_amax_history_if_needed_impl (line 471) | def _correct_amax_history_if_needed_impl(*args, **kwargs):
  function modify_underlying_storage (line 478) | def modify_underlying_storage(tensor: torch.Tensor, new_raw_data: torch....
  function quantize_param_shard (line 484) | def quantize_param_shard(
  function correct_amax_history_if_needed (line 494) | def correct_amax_history_if_needed(model: List[torch.nn.Module]):
  function post_all_gather_processing (line 499) | def post_all_gather_processing(model_params):
  function is_first_last_bf16_layer (line 513) | def is_first_last_bf16_layer(config: TransformerConfig, layer_no: int):
  function get_fp8_recipe (line 536) | def get_fp8_recipe(config: TransformerConfig):
  function get_fp8_context (line 596) | def get_fp8_context(config: TransformerConfig, layer_no: int = -1, is_in...
  function get_fp8_recipe (line 658) | def get_fp8_recipe(config: TransformerConfig):
  function get_fp8_context (line 662) | def get_fp8_context(config: TransformerConfig, layer_no: int = -1, is_in...
  function _wrap_te_linear_for_padding (line 673) | def _wrap_te_linear_for_padding(module: torch.nn.Module):
  function prepare_model_for_fp8_inference (line 757) | def prepare_model_for_fp8_inference(model):
  function prepare_model_for_fp8_inference (line 780) | def prepare_model_for_fp8_inference(model):

FILE: megatron/core/full_cuda_graph.py
  function copy_tensors_in_struct (line 19) | def copy_tensors_in_struct(src):
  function clone_tensors_in_struct (line 33) | def clone_tensors_in_struct(tgt, src):
  class StaticBufferLoader (line 57) | class StaticBufferLoader:
    method __init__ (line 62) | def __init__(self):
    method __call__ (line 65) | def __call__(self, inputs, stage, microbatch):
  class FullCudaGraphWrapper (line 94) | class FullCudaGraphWrapper:
    method __init__ (line 101) | def __init__(self, forward_backward_func, cuda_graph_warmup_steps=1):
    method data_read (line 106) | def data_read(self, data_iterator, model, training, num_microbatches):
    method __call__ (line 139) | def __call__(self, *args, **kwargs):
    method curr_iter (line 192) | def curr_iter(self, stage):
    method next_iter (line 196) | def next_iter(self, stage):

FILE: megatron/core/fusions/fused_bias_dropout.py
  function _bias_dropout_add_func (line 11) | def _bias_dropout_add_func(x_with_bias, residual, prob, training):
  function bias_dropout_add_unfused (line 62) | def bias_dropout_add_unfused(training):
  function bias_dropout_add_fused_train (line 70) | def bias_dropout_add_fused_train(
  function bias_dropout_add_fused_inference (line 77) | def bias_dropout_add_fused_inference(
  function get_bias_dropout_add (line 83) | def get_bias_dropout_add(training, fused):

FILE: megatron/core/fusions/fused_bias_geglu.py
  function geglu (line 17) | def geglu(y):
  function bias_geglu (line 31) | def bias_geglu(bias, y):
  function geglu_back (line 49) | def geglu_back(g, y):
  function bias_geglu_back (line 69) | def bias_geglu_back(g, y, bias):
  class BiasGeGLUFunction (line 84) | class BiasGeGLUFunction(torch.autograd.Function):
    method forward (line 89) | def forward(ctx, input, bias):
    method backward (line 104) | def backward(ctx, grad_output):
  class GeGLUFunction (line 119) | class GeGLUFunction(torch.autograd.Function):
    method forward (line 124) | def forward(ctx, input):
    method backward (line 138) | def backward(ctx, grad_output):
  function bias_geglu_impl (line 153) | def bias_geglu_impl(input, bias):
  function quick_gelu (line 185) | def quick_gelu(y: torch.Tensor) -> torch.Tensor:
  function quick_geglu (line 191) | def quick_geglu(y: torch.Tensor, linear_offset: float = 0.0) -> torch.Te...
  function weighted_quick_geglu (line 206) | def weighted_quick_geglu(
  function quick_geglu_back (line 221) | def quick_geglu_back(g, y, linear_offset: float = 0.0) -> torch.Tensor:
  function weighted_quick_geglu_back (line 240) | def weighted_quick_geglu_back(g, y, weights, linear_offset: float = 0.0):
  function weighted_bias_quick_geglu (line 259) | def weighted_bias_quick_geglu(
  function weighted_bias_quick_geglu_back (line 279) | def weighted_bias_quick_geglu_back(g, y, bias, weights, linear_offset: f...
  class WeightedQuickGeGLUFunction (line 303) | class WeightedQuickGeGLUFunction(torch.autograd.Function):
    method forward (line 307) | def forward(
    method backward (line 333) | def backward(ctx, grad_output):
  class WeightedBiasQuickGeGLUFunction (line 350) | class WeightedBiasQuickGeGLUFunction(torch.autograd.Function):
    method forward (line 354) | def forward(
    method backward (line 387) | def backward(ctx, grad_output):
  function weighted_bias_quick_geglu_impl (line 410) | def weighted_bias_quick_geglu_impl(

FILE: megatron/core/fusions/fused_bias_gelu.py
  function bias_gelu (line 17) | def bias_gelu(bias, y):
  function bias_gelu_back (line 26) | def bias_gelu_back(g, bias, y):
  class GeLUFunction (line 36) | class GeLUFunction(torch.autograd.Function):
    method forward (line 39) | def forward(ctx, input, bias):
    method backward (line 44) | def backward(ctx, grad_output):
    method apply (line 51) | def apply(cls, *args, **kwargs):

FILE: megatron/core/fusions/fused_bias_swiglu.py
  function swiglu (line 16) | def swiglu(y):
  function bias_swiglu (line 30) | def bias_swiglu(y, bias):
  function weighted_swiglu (line 45) | def weighted_swiglu(y, weights):
  function swiglu_back (line 55) | def swiglu_back(g, y):
  function bias_swiglu_back (line 73) | def bias_swiglu_back(g, y, bias):
  function weighted_swiglu_back (line 90) | def weighted_swiglu_back(g, y, weights):
  class BiasSwiGLUFunction (line 100) | class BiasSwiGLUFunction(torch.autograd.Function):
    method forward (line 105) | def forward(ctx, input, bias, fp8_input_store, cpu_offload_input):
    method backward (line 128) | def backward(ctx, grad_output):
  class SwiGLUFunction (line 147) | class SwiGLUFunction(torch.autograd.Function):
    method forward (line 152) | def forward(ctx, input, fp8_input_store, cpu_offload_input):
    method backward (line 173) | def backward(ctx, grad_output):
  class WeightedSwiGLUFunction (line 191) | class WeightedSwiGLUFunction(torch.autograd.Function):
    method forward (line 194) | def forward(ctx, input, weights, fp8_input_store):
    method backward (line 202) | def backward(ctx, grad_output):
  function bias_swiglu_impl (line 209) | def bias_swiglu_impl(input, bias, fp8_input_store=False, cpu_offload_inp...
  function weighted_bias_swiglu_impl (line 239) | def weighted_bias_swiglu_impl(input, bias, weights, fp8_input_store=False):

FILE: megatron/core/fusions/fused_cross_entropy.py
  function calculate_logits_max (line 13) | def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[t...
  function calculate_predicted_logits (line 26) | def calculate_predicted_logits(
  function calculate_cross_entropy_loss (line 48) | def calculate_cross_entropy_loss(
  function calculate_gradients (line 65) | def calculate_gradients(
  class _VocabParallelCrossEntropy (line 87) | class _VocabParallelCrossEntropy(torch.autograd.Function):
    method forward (line 89) | def forward(ctx, vocab_parallel_logits, target, tp_group):
    method backward (line 124) | def backward(ctx, grad_output):
  function fused_vocab_parallel_cross_entropy (line 136) | def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target, tp...

FILE: megatron/core/fusions/fused_indices_converter.py
  function _indices_to_multihot_kernel (line 32) | def _indices_to_multihot_kernel(
  function _multihot_to_indices_kernel (line 112) | def _multihot_to_indices_kernel(
  class IndicesToMultihot (line 176) | class IndicesToMultihot(torch.autograd.Function):
    method forward (line 186) | def forward(ctx, indices, probs_indices, num_of_local_experts):
    method backward (line 239) | def backward(ctx, grad_multihot_indices, grad_probs_in_multihot):
  function fused_indices_to_multihot (line 282) | def fused_indices_to_multihot(indices, probs_indices, num_of_local_exper...

FILE: megatron/core/fusions/fused_layer_norm.py
  class FusedLayerNorm (line 30) | class FusedLayerNorm(torch.nn.Module):
    method __init__ (line 52) | def __init__(
    method reset_parameters (line 122) | def reset_parameters(self):
    method forward (line 131) | def forward(self, input: Tensor) -> Tensor:

FILE: megatron/core/fusions/fused_mla_yarn_rope_apply.py
  function _get_thd_token_idx (line 31) | def _get_thd_token_idx(cu_seqlens, pid_m, seq_num, cp_rank, cp_size):
  function rotary_fwd_q_kernel (line 68) | def rotary_fwd_q_kernel(
  function rotary_bwd_q_kernel (line 148) | def rotary_bwd_q_kernel(
  class ApplyMLARotaryEmbQ (line 210) | class ApplyMLARotaryEmbQ(torch.autograd.Function):
    method forward (line 216) | def forward(
    method backward (line 285) | def backward(ctx, grad):
  function fused_apply_mla_rope_for_q (line 327) | def fused_apply_mla_rope_for_q(
  function rotary_fwd_kv_kernel (line 379) | def rotary_fwd_kv_kernel(
  function rotary_bwd_kv_kernel (line 487) | def rotary_bwd_kv_kernel(
  class ApplyMLARotaryEmbKV (line 581) | class ApplyMLARotaryEmbKV(torch.autograd.Function):
    method forward (line 587) | def forward(
    method backward (line 675) | def backward(ctx, dk, dv):
  function fused_apply_mla_rope_for_kv (line 735) | def fused_apply_mla_rope_for_kv(

FILE: megatron/core/fusions/fused_pad_routing_map.py
  function _pad_routing_map_kernel (line 31) | def _pad_routing_map_kernel(
  function fused_pad_routing_map (line 74) | def fused_pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) ...

FILE: megatron/core/fusions/fused_softmax.py
  class ScaledUpperTriangMaskedSoftmax (line 11) | class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    method forward (line 20) | def forward(ctx, inputs, scale):
    method backward (line 40) | def backward(ctx, output_grads):
  class ScaledMaskedSoftmax (line 60) | class ScaledMaskedSoftmax(torch.autograd.Function):
    method forward (line 69) | def forward(ctx, inputs, mask, scale):
    method backward (line 90) | def backward(ctx, output_grads):
  class ScaledSoftmax (line 108) | class ScaledSoftmax(torch.autograd.Function):
    method forward (line 116) | def forward(ctx, inputs, scale):
    method backward (line 136) | def backward(ctx, output_grads):
  class SoftmaxOne (line 154) | class SoftmaxOne(nn.Module):
    method __init__ (line 161) | def __init__(
    method forward (line 168) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class FusedScaleMaskSoftmax (line 179) | class FusedScaleMaskSoftmax(nn.Module):
    method __init__ (line 193) | def __init__(
    method forward (line 219) | def forward(
    method is_kernel_available (line 238) | def is_kernel_available(self, mask, b, np, sq, sk):
    method forward_fused_softmax (line 272) | def forward_fused_softmax(self, input, mask):
    method forward_torch_softmax (line 299) | def forward_torch_softmax(self, input, mask, softmax_offset=None):
    method get_batch_per_block (line 345) | def get_batch_per_block(sq, sk, b, np):

FILE: megatron/core/fusions/fused_weighted_squared_relu.py
  function weighted_squared_relu (line 14) | def weighted_squared_relu(x: torch.Tensor, weights: torch.Tensor) -> tor...
  function _squared_relu_back (line 32) | def _squared_relu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
  function weighted_squared_relu_back (line 41) | def weighted_squared_relu_back(g: torch.Tensor, x: torch.Tensor, weights...
  class WeightedSquaredReLUFunction (line 60) | class WeightedSquaredReLUFunction(torch.autograd.Function):
    method forward (line 65) | def forward(ctx, input: torch.Tensor, weights: torch.Tensor):
    method backward (line 79) | def backward(ctx, grad_output: torch.Tensor):
  function weighted_squared_relu_impl (line 91) | def weighted_squared_relu_impl(input: torch.Tensor, weights: torch.Tenso...

FILE: megatron/core/hyper_comm_grid.py
  class HyperCommGrid (line 33) | class HyperCommGrid:
    method __init__ (line 82) | def __init__(
    method create_pg (line 120) | def create_pg(self, dims: Union[str, list[str]], **kwargs: Any) -> dis...
    method destroy (line 168) | def destroy(self) -> None:
    method get_pg (line 175) | def get_pg(self, dims: Union[str, list[str]]) -> dist.ProcessGroup:
    method get_rank_enum (line 190) | def get_rank_enum(self, dims: Union[str, list[str]]) -> list[list[int]]:
    method _gen_rank_enum (line 206) | def _gen_rank_enum(self, dims: list[str]) -> list[list[int]]:
    method _order_dims (line 251) | def _order_dims(self, dims: Union[str, list[str]]) -> Tuple[list[str],...

FILE: megatron/core/inference/async_stream.py
  class AsyncStream (line 17) | class AsyncStream:
    method __init__ (line 24) | def __init__(
    method put (line 36) | def put(self, item: Union[InferenceRequest, Exception]) -> None:
    method finish (line 41) | def finish(self, exception: Optional[Union[BaseException, Type[BaseExc...
    method finished (line 51) | def finished(self) -> bool:
    method generator (line 55) | async def generator(self) -> AsyncGenerator[InferenceRequest, None]:
    method _is_raisable (line 70) | def _is_raisable(value: Any):

FILE: megatron/core/inference/batch_dimensions_utils.py
  class InferenceBatchDimensions (line 21) | class InferenceBatchDimensions:
    method __str__ (line 38) | def __str__(self):
    method is_applicable_for_batch_dim (line 44) | def is_applicable_for_batch_dim(
    method is_valid (line 76) | def is_valid(
    method __hash__ (line 112) | def __hash__(self):
    method __eq__ (line 119) | def __eq__(self, other: "InferenceBatchDimensions") -> bool:
    method req_count (line 132) | def req_count(self) -> int:
    method adjust_batch_dims_for_expert_parallelism (line 139) | def adjust_batch_dims_for_expert_parallelism(
  class CUDAGraphBatchDimensionBuilder (line 233) | class CUDAGraphBatchDimensionBuilder:
    method _calculate_cuda_graph_token_counts (line 244) | def _calculate_cuda_graph_token_counts(
    method generate_cuda_graph_batch_dimensions_list (line 318) | def generate_cuda_graph_batch_dimensions_list(
    method match_graph_config (line 508) | def match_graph_config(

FILE: megatron/core/inference/communication/torch_symm_triton/barrier.py
  function _send_signal (line 21) | def _send_signal(addrs, sem: tl.constexpr):
  function _wait_signal (line 43) | def _wait_signal(addrs, sem: tl.constexpr):
  function symm_mem_sync (line 65) | def symm_mem_sync(

FILE: megatron/core/inference/communication/torch_symm_triton/collectives.py
  function _ag_phase (line 30) | def _ag_phase(
  function _multimem_all_gather_kernel (line 68) | def _multimem_all_gather_kernel(
  function _multimem_all_gather_3_kernel (line 95) | def _multimem_all_gather_3_kernel(
  function _multimem_reduce_scatter_kernel (line 159) | def _multimem_reduce_scatter_kernel(
  function _kernel_launch_config (line 212) | def _kernel_launch_config(element_size: int, max_numel: int, world_size:...
  function multimem_all_gather (line 230) | def multimem_all_gather(
  function multimem_all_gather_fused (line 270) | def multimem_all_gather_fused(
  function multimem_reduce_scatter (line 323) | def multimem_reduce_scatter(

FILE: megatron/core/inference/communication/torch_symm_triton/fused_collectives.py
  function unpack_bf16x2 (line 23) | def unpack_bf16x2(x, mask):
  function sum_sq (line 42) | def sum_sq(x, y, z, w, mask):
  function apply_norm (line 74) | def apply_norm(x, y, z, w, wx, wy, wz, ww, rrms, mask):
  function _multimem_reduce_scatter_residual_add_kernel (line 115) | def _multimem_reduce_scatter_residual_add_kernel(
  function fused_multimem_rs_add_norm_ag (line 210) | def fused_multimem_rs_add_norm_ag(

FILE: megatron/core/inference/communication/torch_symm_triton/multimem_asm.py
  function ld_128 (line 21) | def ld_128(ptr, mask, multicast_op: tl.constexpr, reduce_f32: tl.constex...
  function st_128 (line 111) | def st_128(ptr, x, y, z, w, mask, multicast_op):
  function add_v8_bf16_from_u32 (line 181) | def add_v8_bf16_from_u32(
  function asm_rsqrt (line 215) | def asm_rsqrt(x, eps):

FILE: megatron/core/inference/communication/torch_symm_triton/utils.py
  function is_device_nvls_capable (line 20) | def is_device_nvls_capable(device: torch.device) -> bool:
  function are_tensors_nvls_eligible (line 26) | def are_tensors_nvls_eligible(*tensors: torch.Tensor) -> bool:
  function get_tid (line 42) | def get_tid():
  function get_ntid (line 61) | def get_ntid():
  function get_flat_tid (line 80) | def get_flat_tid():
  function get_flat_bid (line 90) | def get_flat_bid():
  function sync_threads (line 101) | def sync_threads():

FILE: megatron/core/inference/communication_utils.py
  function is_pipeline_first_stage (line 10) | def is_pipeline_first_stage(pp_group: ProcessGroup):
  function is_pipeline_last_stage (line 19) | def is_pipeline_last_stage(pp_group: ProcessGroup):
  function _is_cuda (line 28) | def _is_cuda(tensor):
  function _is_cuda_contiguous (line 34) | def _is_cuda_contiguous(tensor):
  function broadcast_from_last_pipeline_stage (line 40) | def broadcast_from_last_pipeline_stage(
  function recv_from_prev_pipeline_rank_ (line 83) | def recv_from_prev_pipeline_rank_(
  function send_to_next_pipeline_rank (line 114) | def send_to_next_pipeline_rank(
  function broadcast_tensor (line 145) | def broadcast_tensor(size, dtype, tensor=None, rank=0, data_parallel=Fal...
  function broadcast_list (line 169) | def broadcast_list(size, dtype, list_values=None, rank=0, data_parallel=...
  function broadcast_int_list (line 190) | def broadcast_int_list(size, int_list=None, rank=0, data_parallel=False):
  function broadcast_float_list (line 202) | def broadcast_float_list(size, float_list=None, rank=0, data_parallel=Fa...

FILE: megatron/core/inference/config.py
  class MambaInferenceStateConfig (line 15) | class MambaInferenceStateConfig:
    method from_model (line 46) | def from_model(
  class PrefixCachingEvictionPolicy (line 81) | class PrefixCachingEvictionPolicy(str, Enum):
  class PrefixCachingCoordinatorPolicy (line 94) | class PrefixCachingCoordinatorPolicy(str, Enum):
  class KVCacheManagementMode (line 107) | class KVCacheManagementMode(str, Enum):
  class InferenceConfig (line 121) | class InferenceConfig:

FILE: megatron/core/inference/contexts/attention_context/mamba_metadata.py
  class MambaMetadata (line 10) | class MambaMetadata:
    method __init__ (line 13) | def __init__(self, max_requests: int, max_tokens: int, mamba_chunk_siz...
    method reset (line 87) | def reset(self) -> None:
    method reset_varlen_metadata (line 101) | def reset_varlen_metadata(self) -> None:
    method update (line 120) | def update(
    method allocate_slot (line 294) | def allocate_slot(self) -> Optional[int]:
    method batch_allocate_slots (line 311) | def batch_allocate_slots(self, num_slots: int) -> Optional[torch.Tensor]:
    method free_slots (line 330) | def free_slots(self, request_indices: torch.Tensor) -> None:

FILE: megatron/core/inference/contexts/attention_context/metadata_base.py
  class MetadataBase (line 4) | class MetadataBase:
    method __init__ (line 14) | def __init__(self):
    method update (line 20) | def update(self, *args, **kwargs):
    method reset (line 26) | def reset(self):
    method tensor_copy_and_pad (line 32) | def tensor_copy_and_pad(
    method __str__ (line 68) | def __str__(self):

FILE: megatron/core/inference/contexts/attention_context/mha_metadata.py
  class MHAMetadata (line 9) | class MHAMetadata(MetadataBase):
    method __init__ (line 14) | def __init__(
    method update (line 37) | def update(
    method reset (line 123) | def reset(self):
  class GraphedMHAMetadata (line 136) | class GraphedMHAMetadata(MHAMetadata):
    method __init__ (line 141) | def __init__(
    method update (line 148) | def update(
    method reset (line 175) | def reset(self):
  class NonGraphedMHAMetadata (line 179) | class NonGraphedMHAMetadata(MHAMetadata):
    method update (line 184) | def update(

FILE: megatron/core/inference/contexts/attention_context/triton/tensor_ops.py
  function _tensor_get_slice_after_kernel (line 24) | def _tensor_get_slice_after_kernel(
  function _tensor_merge_kernel (line 54) | def _tensor_merge_kernel(
  function _tensor_masked_update_kernel_2d (line 101) | def _tensor_masked_update_kernel_2d(
  function _tensor_masked_update_kernel_3d (line 141) | def _tensor_masked_update_kernel_3d(
  function _tensor_masked_update_kernel_4d (line 197) | def _tensor_masked_update_kernel_4d(
  function _compute_row_size (line 262) | def _compute_row_size(tensor):
  function tensor_get_slice_after (line 272) | def tensor_get_slice_after(input_tensor, output_tensor, pos_on_device, c...
  function tensor_merge (line 327) | def tensor_merge(
  function tensor_masked_update (line 395) | def tensor_masked_update(states: torch.Tensor, idx: torch.Tensor, new_st...

FILE: megatron/core/inference/contexts/base_context.py
  class BaseInferenceContext (line 8) | class BaseInferenceContext(abc.ABC):
    method __init__ (line 15) | def __init__(self, inference_config: InferenceConfig):
    method is_static_batching (line 22) | def is_static_batching(self) -> bool:
    method is_dynamic_batching (line 26) | def is_dynamic_batching(self) -> bool:
    method increment_sequence_len_offset (line 30) | def increment_sequence_len_offset(self, increment: int) -> None:
    method increment_batch_size_offset (line 35) | def increment_batch_size_offset(self, increment: int) -> None:
    method reset_batch_size_offset (line 40) | def reset_batch_size_offset(self) -> None:

FILE: megatron/core/inference/contexts/dynamic_context.py
  class ContextOverflowError (line 97) | class ContextOverflowError(Exception):
    method __init__ (line 106) | def __init__(
  class RequestOverflowError (line 117) | class RequestOverflowError(ContextOverflowError):
  class TokenOverflowError (line 123) | class TokenOverflowError(ContextOverflowError):
  class MaxSequenceLengthOverflowError (line 129) | class MaxSequenceLengthOverflowError(ContextOverflowError):
    method __init__ (line 132) | def __init__(self, request_id, message: Optional[str] = None):
  class BlockOverflowError (line 136) | class BlockOverflowError(ContextOverflowError):
  class ActiveRequestCountOverflowError (line 142) | class ActiveRequestCountOverflowError(ContextOverflowError):
    method __init__ (line 146) | def __init__(self, max_request_count, active_request_count):
  class TensorStateDeallocatedError (line 155) | class TensorStateDeallocatedError(ContextOverflowError):
  class ContextErrorFactory (line 162) | class ContextErrorFactory:
    method serialize (line 166) | def serialize(cls, error: ContextOverflowError) -> dict:
    method deserialize (line 184) | def deserialize(cls, obj: dict) -> ContextOverflowError:
  function get_mem_size_str (line 206) | def get_mem_size_str(n_bytes: int) -> str:
  class DynamicInferenceContext (line 217) | class DynamicInferenceContext(BaseInferenceContext):
    method __init__ (line 248) | def __init__(self, model_config: TransformerConfig, inference_config: ...
    method _allocate_memory_buffer (line 600) | def _allocate_memory_buffer(self):
    method _allocate_mamba_states (line 636) | def _allocate_mamba_states(self):
    method initialize_all_tensors (line 702) | def initialize_all_tensors(self) -> None:
    method reinitialize_inference_state_buffers (line 801) | def reinitialize_inference_state_buffers(self):
    method deallocate_inference_state_buffers (line 838) | def deallocate_inference_state_buffers(self):
    method round_up_tokens (line 879) | def round_up_tokens(cls, value, tp_size=None):
    method round_up_requests (line 893) | def round_up_requests(cls, value, tp_size=None):
    method is_static_batching (line 906) | def is_static_batching(self) -> bool:
    method is_decode_only (line 910) | def is_decode_only(self) -> bool:
    method using_cuda_graph_this_step (line 916) | def using_cuda_graph_this_step(self) -> bool:
    method has_unfinished_requests (line 920) | def has_unfinished_requests(self) -> bool:
    method cu_query_lengths (line 924) | def cu_query_lengths(self) -> Tuple[Tensor, int]:
    method cu_kv_lengths (line 932) | def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]:
    method get_active_sequence_lengths (line 941) | def get_active_sequence_lengths(self) -> Tensor:
    method get_max_sequence_lengths (line 947) | def get_max_sequence_lengths(self) -> Tensor:
    method get_active_request_count (line 951) | def get_active_request_count(self):
    method append_key_value_cache (line 955) | def append_key_value_cache(self, layer_number: int, key: Tensor, value...
    method key_value_cache (line 1004) | def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Optional...
    method mamba_states_cache (line 1031) | def mamba_states_cache(
    method _allocate_mamba_cache (line 1051) | def _allocate_mamba_cache(self, mamba_gb: float) -> None:
    method apply_fused_qk_rotary_emb (line 1093) | def apply_fused_qk_rotary_emb(
    method apply_rotary_emb_query (line 1125) | def apply_rotary_emb_query(
    method apply_rotary_emb_key (line 1159) | def apply_rotary_emb_key(
    method reset_attention_state (line 1198) | def reset_attention_state(self) -> None:
    method reset_mamba_state (line 1210) | def reset_mamba_state(self) -> None:
    method add_dummy_requests_parallel (line 1215) | def add_dummy_requests_parallel(
    method add_dummy_requests_for_cudagraph_capture (line 1354) | def add_dummy_requests_for_cudagraph_capture(
    method num_decode_requests (line 1414) | def num_decode_requests(self) -> int:
    method add_dummy_requests_for_expert_parallel_step (line 1420) | def add_dummy_requests_for_expert_parallel_step(self) -> None:
    method initialize_attention_state (line 1476) | def initialize_attention_state(
    method reset_tensors (line 1648) | def reset_tensors(self) -> None:
    method reset_metadata (line 1674) | def reset_metadata(self) -> None:
    method reset (line 1717) | def reset(self) -> None:
    method current_input_and_position_ids (line 1736) | def current_input_and_position_ids(
    method last_token_logits (line 1758) | def last_token_logits(self, logits: Tensor) -> Tensor:
    method _compute_prefix_match (line 1781) | def _compute_prefix_match(
    method check_availability (line 1860) | def check_availability(self, req: DynamicInferenceRequest) -> Tuple[bo...
    method _find_kv_match_count (line 1880) | def _find_kv_match_count(
    method add_request (line 1927) | def add_request(
    method _move_book_keeping_tensors (line 2135) | def _move_book_keeping_tensors(
    method _swap_book_keeping_tensors (line 2164) | def _swap_book_keeping_tensors(
    method get_index_of_chunked_prefill_request (line 2194) | def get_index_of_chunked_prefill_request(self, safe: bool = True) -> int:
    method is_chunked_prefill_enabled (line 2216) | def is_chunked_prefill_enabled(self) -> bool:
    method release_memory_blocks_from_request_indexes (line 2222) | def release_memory_blocks_from_request_indexes(self, request_indexes) ...
    method resume_paused_requests (line 2254) | def resume_paused_requests(
    method evict_overflow_paused_requests (line 2338) | def evict_overflow_paused_requests(
    method update_requests (line 2453) | def update_requests(
    method calculate_log_probs (line 2902) | def calculate_log_probs(
    method get_kvcache_utilization_stats (line 2974) | def get_kvcache_utilization_stats(self) -> dict:

FILE: megatron/core/inference/contexts/fused_kv_append_kernel.py
  function _append_kv_cache_kernel (line 22) | def _append_kv_cache_kernel(
  function triton_append_key_value_cache (line 93) | def triton_append_key_value_cache(

FILE: megatron/core/inference/contexts/kv_block_allocator.py
  class KVBlockAllocator (line 12) | class KVBlockAllocator:
    method __init__ (line 27) | def __init__(
    method __str__ (line 76) | def __str__(self):
    method get_total_used (line 83) | def get_total_used(self):
    method get_active_used (line 87) | def get_active_used(self):
    method get_paused_used (line 107) | def get_paused_used(self):
    method get_active_avail (line 123) | def get_active_avail(self):
    method get_paused_avail (line 127) | def get_paused_avail(self):
    method is_memory_available (line 131) | def is_memory_available(self, num_blocks: int) -> bool:
    method allocate_memory_blocks (line 153) | def allocate_memory_blocks(self, num_blocks: int) -> Optional[Tensor]:
    method release_memory_blocks (line 188) | def release_memory_blocks(self, blocks: Tensor) -> None:
    method reset (line 227) | def reset(self) -> None:
    method register_kv_block_hashes (line 262) | def register_kv_block_hashes(self, block_ids: list[int], block_hashes:...
    method _deregister_blocks (line 276) | def _deregister_blocks(self, block_ids: Tensor) -> None:
    method update_timestamps (line 313) | def update_timestamps(self, block_ids: Tensor) -> None:
    method get_evictable_block_count (line 326) | def get_evictable_block_count(self) -> Tensor:
    method evict_lru_blocks (line 335) | def evict_lru_blocks(self, num_blocks_needed: int) -> bool:

FILE: megatron/core/inference/contexts/mamba_slot_allocator.py
  class MambaSlotAllocator (line 14) | class MambaSlotAllocator:
    method __init__ (line 31) | def __init__(
    method allocate_slot (line 79) | def allocate_slot(self, block_id: int) -> int:
    method _evict_lru_slot (line 104) | def _evict_lru_slot(self) -> int:
    method get_slot (line 138) | def get_slot(self, block_id: int) -> int:
    method has_state (line 149) | def has_state(self, block_id: int) -> bool:
    method invalidate_block (line 153) | def invalidate_block(self, block_id: int) -> None:
    method store_from_tensors (line 174) | def store_from_tensors(
    method store_from_live (line 190) | def store_from_live(self, block_id: int, request_idx: int) -> None:
    method restore_to_live (line 206) | def restore_to_live(self, request_idx: int, block_id: int) -> bool:
    method register_block_hash (line 228) | def register_block_hash(self, block_id: int, block_hash: int) -> None:
    method on_kv_blocks_deregistered (line 241) | def on_kv_blocks_deregistered(self, block_ids_list: list, hashes_to_de...
    method compute_and_store_offsets (line 263) | def compute_and_store_offsets(
    method get_intermediate_offsets (line 332) | def get_intermediate_offsets(self) -> Optional[List[List[int]]]:
    method buffer_intermediate_states (line 361) | def buffer_intermediate_states(
    method commit_intermediate_states (line 373) | def commit_intermediate_states(self) -> None:
    method _clear_intermediate_state (line 426) | def _clear_intermediate_state(self) -> None:
    method reset (line 444) | def reset(self) -> None:

FILE: megatron/core/inference/contexts/routing_metadata.py
  class RoutingMetadata (line 13) | class RoutingMetadata:
    method __init__ (line 25) | def __init__(self, context: 'DynamicInferenceContext', moe_router_topk...
    method _ensure_buffer_allocated (line 36) | def _ensure_buffer_allocated(self) -> None:
    method get_routing_indices (line 57) | def get_routing_indices(self) -> Optional[torch.Tensor]:
    method enable_static_buffer_recording (line 83) | def enable_static_buffer_recording(self) -> None:
    method disable_static_buffer_recording (line 94) | def disable_static_buffer_recording(self) -> None:

FILE: megatron/core/inference/contexts/static_context.py
  class StaticInferenceContext (line 8) | class StaticInferenceContext(BaseInferenceContext):
    method __init__ (line 17) | def __init__(
    method swap_key_value_dict (line 29) | def swap_key_value_dict(self, batch_idx):
    method enable_prefill_mode (line 46) | def enable_prefill_mode(self):
    method enable_decode_mode (line 54) | def enable_decode_mode(self):
    method is_decode_only (line 62) | def is_decode_only(self):
    method reset (line 66) | def reset(self):
    method __str__ (line 72) | def __str__(self):
    method __eq__ (line 83) | def __eq__(self, other):
    method is_static_batching (line 121) | def is_static_batching(self):

FILE: megatron/core/inference/data_parallel_inference_coordinator.py
  class DataParallelInferenceCoordinator (line 43) | class DataParallelInferenceCoordinator:
    class CoordinatorState (line 77) | class CoordinatorState(Enum):
    method __init__ (line 85) | def __init__(
    method get_next_data_parallel_rank (line 203) | def get_next_data_parallel_rank(self):
    method _remove_engine (line 217) | def _remove_engine(self, identity):
    method _send_to_engine (line 226) | def _send_to_engine(self, identity, payload):
    method compute_request_hashes (line 241) | def compute_request_hashes(self, prompt):
    method get_best_data_parallel_rank (line 259) | def get_best_data_parallel_rank(self, request_hashes):
    method _update_rank_hashes (line 292) | def _update_rank_hashes(self, rank_identity, request_hashes):
    method start (line 304) | def start(self):
    method detokenize (line 497) | def detokenize(self, finished_request):
    method entrypoint (line 522) | def entrypoint(
    method stop (line 574) | def stop(self):

FILE: megatron/core/inference/engines/abstract_engine.py
  class AbstractEngine (line 6) | class AbstractEngine(ABC):
    method generate (line 9) | def generate(self) -> dict:

FILE: megatron/core/inference/engines/async_zmq_communicator.py
  class AsyncZMQCommunicator (line 20) | class AsyncZMQCommunicator:
    method __init__ (line 29) | def __init__(self, zmq_context: zmq.Context, process_group: dist.Proce...
    method all_reduce_max (line 68) | async def all_reduce_max(self, *local_vals: int, async_op=True) -> int...
    method close (line 127) | def close(self):

FILE: megatron/core/inference/engines/dynamic_engine.py
  class EngineState (line 114) | class EngineState(Enum):
  class EngineSuspendedError (line 129) | class EngineSuspendedError(Exception):
  function format_mem_bytes (line 135) | def format_mem_bytes(mem_bytes):
  class RequestEntry (line 145) | class RequestEntry:
  class DynamicInferenceEngine (line 154) | class DynamicInferenceEngine(AbstractEngine):
    method __init__ (line 185) | def __init__(self, controller: TextGenerationController, context: Dyna...
    method reset (line 266) | def reset(self) -> None:
    method wait_until (line 314) | async def wait_until(self, state: EngineState):
    method create_cuda_graphs (line 326) | def create_cuda_graphs(self, reset_context: bool = True):
    method start_listening_to_data_parallel_coordinator (line 424) | async def start_listening_to_data_parallel_coordinator(
    method suspend_resume_ctx (line 627) | def suspend_resume_ctx(key: str, *, unified_memory_level: int) -> None:
    method suspend (line 688) | def suspend(self):
    method resume (line 737) | def resume(self):
    method _notify_cond_for_new_request (line 800) | async def _notify_cond_for_new_request(self):
    method _handle_failed_request (line 805) | def _handle_failed_request(self, request_id: int):
    method has_unfinished_requests (line 846) | def has_unfinished_requests(self) -> bool:
    method get_request (line 850) | def get_request(self, request_id: int) -> DynamicInferenceRequest:
    method _add_request (line 861) | def _add_request(
    method add_request (line 950) | def add_request(
    method post_process_requests (line 1009) | def post_process_requests(
    method _get_and_clear_stop_word_finished_ids (line 1299) | def _get_and_clear_stop_word_finished_ids(self, active_request_ids: li...
    method _check_stop_words_for_request_post_append (line 1322) | def _check_stop_words_for_request_post_append(
    method get_prefix_coordination_metrics (line 1370) | def get_prefix_coordination_metrics(self) -> dict:
    method _find_mamba_match_count (line 1378) | def _find_mamba_match_count(self, req: DynamicInferenceRequest) -> int:
    method schedule_waiting_requests (line 1394) | def schedule_waiting_requests(self):
    method schedule_non_chunked_prefill (line 1411) | def schedule_non_chunked_prefill(self):
    method schedule_chunked_prefill (line 1466) | def schedule_chunked_prefill(self):
    method async_forward (line 1594) | async def async_forward(self) -> Tuple[Dict, Dict, float]:
    method async_bookkeep (line 1666) | async def async_bookkeep(
    method async_step (line 1898) | async def async_step(
    method _run_coroutine_sync (line 1917) | def _run_coroutine_sync(self, coro):
    method step_modern (line 1935) | def step_modern(
    method step_legacy (line 1941) | def step_legacy(
    method generate (line 1959) | def generate(
    method schedule_requests (line 1978) | def schedule_requests(self) -> int:
    method shutdown (line 2123) | async def shutdown(self):
    method run_engine (line 2157) | async def run_engine(self, *, loop: Optional[asyncio.AbstractEventLoop...
    method _ep_establish_consensus (line 2178) | async def _ep_establish_consensus(
    method _world_barrier (line 2227) | async def _world_barrier(self):
    method run_engine_with_coordinator (line 2244) | async def run_engine_with_coordinator(

FILE: megatron/core/inference/engines/static_engine.py
  class StaticInferenceEngine (line 35) | class StaticInferenceEngine(AbstractEngine):
    method __init__ (line 50) | def __init__(
    method get_new_request_id (line 132) | def get_new_request_id(self) -> str:
    method add_request (line 136) | def add_request(
    method get_stream_generator (line 192) | def get_stream_generator(
    method generate_using_dynamic_engine (line 202) | def generate_using_dynamic_engine(
    method generate_using_legacy_static_engine (line 250) | def generate_using_legacy_static_engine(
    method generate (line 305) | def generate(
    method run_engine (line 351) | def run_engine(self):
    method _wrapped_run_engine (line 389) | def _wrapped_run_engine(self, cuda_device):
    method run_engine_async (line 399) | async def run_engine_async(self, loop: Optional[asyncio.AbstractEventL...

FILE: megatron/core/inference/headers.py
  class Headers (line 6) | class Headers(Enum):
  class UnknownHeaderError (line 25) | class UnknownHeaderError(Exception):
    method __init__ (line 28) | def __init__(self, header):

FILE: megatron/core/inference/inference_client.py
  class InferenceClient (line 29) | class InferenceClient:
    method __init__ (line 54) | def __init__(self, inference_coordinator_address: str, deserialize: bo...
    method add_request (line 87) | def add_request(
    method _recv_task (line 119) | async def _recv_task(self):
    method _connect_with_inference_coordinator (line 154) | def _connect_with_inference_coordinator(self):
    method start (line 166) | def start(self, loop: Optional[asyncio.AbstractEventLoop] = None):
    method _send_signal_to_engines (line 179) | def _send_signal_to_engines(self, signal, *args):
    method pause_engines (line 191) | def pause_engines(self):
    method unpause_engines (line 200) | def unpause_engines(self) -> None:
    method set_generation_epoch (line 204) | def set_generation_epoch(self, generation_epoch: int):
    method suspend_engines (line 212) | def suspend_engines(self):
    method resume_engines (line 219) | def resume_engines(self):
    method stop_engines (line 226) | def stop_engines(self):
    method shutdown_coordinator (line 234) | def shutdown_coordinator(self):
    method stop (line 241) | def stop(self):

FILE: megatron/core/inference/inference_request.py
  function serialize_tensor (line 18) | def serialize_tensor(tensor: torch.Tensor) -> List:
  function deserialize_tensor (line 36) | def deserialize_tensor(tensor_as_list: List) -> torch.Tensor:
  function unwrap_serialized_tensors (line 49) | def unwrap_serialized_tensors(serialized_request: dict) -> dict:
  class Status (line 65) | class Status(Enum):
  function compute_block_hashes_batched (line 88) | def compute_block_hashes_batched(prompt_tokens: torch.Tensor, block_size...
  class InferenceRequest (line 131) | class InferenceRequest:
    method __post_init__ (line 158) | def __post_init__(self):
    method serialize (line 166) | def serialize(self) -> dict:
    method deserialize (line 191) | def deserialize(cls, obj: dict) -> "InferenceRequest":
    method _post_deserialize (line 206) | def _post_deserialize(self, obj: dict):
  class DynamicInferenceEventType (line 230) | class DynamicInferenceEventType(Enum):
  class DynamicInferenceEvent (line 245) | class DynamicInferenceEvent:
    method __post_init__ (line 263) | def __post_init__(self):
    method __str__ (line 287) | def __str__(self):
    method serialize (line 296) | def serialize(self) -> dict:
    method deserialize (line 322) | def deserialize(cls, obj: dict) -> "DynamicInferenceEvent":
  class DynamicInferenceRequest (line 349) | class DynamicInferenceRequest(InferenceRequest):
    method __post_init__ (line 377) | def __post_init__(self):
    method _compute_block_hashes (line 391) | def _compute_block_hashes(self) -> None:
    method remaining_prompt_length (line 403) | def remaining_prompt_length(self):
    method __str__ (line 414) | def __str__(self):
    method serialize (line 425) | def serialize(self):
    method _post_deserialize (line 450) | def _post_deserialize(self, obj):
    method tracked_metadata (line 455) | def tracked_metadata(self) -> List[Any]:
    method get_metadata_types (line 475) | def get_metadata_types() -> List[Tuple[str, torch.dtype, bool]]:
    method add_event (line 494) | def add_event(
    method add_event_add_engine (line 502) | def add_event_add_engine(self):
    method add_event_add_context (line 507) | def add_event_add_context(self):
    method add_event_generated_token (line 511) | def add_event_generated_token(
    method add_event_pause (line 547) | def add_event_pause(self):
    method add_event_evict (line 551) | def add_event_evict(self):
    method add_event_finish (line 555) | def add_event_finish(self):
    method add_event_fail (line 559) | def add_event_fail(self):
    method add_event_error_transient (line 563) | def add_event_error_transient(self, error: Exception):
    method add_event_error_nontransient (line 567) | def add_event_error_nontransient(self, error: Exception):
    method succeeded (line 571) | def succeeded(self) -> bool:
    method failed (line 575) | def failed(self) -> bool:
  class DynamicInferenceRequestRecord (line 581) | class DynamicInferenceRequestRecord:
    method from_request (line 589) | def from_request(cls, request: DynamicInferenceRequest) -> "DynamicInf...
    method __getitem__ (line 602) | def __getitem__(self, idx: int) -> DynamicInferenceRequest:
    method request_id (line 614) | def request_id(self) -> int:
    method checkpoint (line 622) | def checkpoint(self, tokenizer: MegatronTokenizer | None = None):
    method merge (line 679) | def merge(self, tokenizer: MegatronTokenizer | None = None) -> Dynamic...
    method serialize (line 737) | def serialize(self) -> dict:
    method deserialize (line 751) | def deserialize(cls, obj: dict) -> "DynamicInferenceRequestRecord":
  class VLMInferenceRequest (line 766) | class VLMInferenceRequest(InferenceRequest):

FILE: megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py
  class AbstractModelInferenceWrapper (line 23) | class AbstractModelInferenceWrapper(abc.ABC):
    method __init__ (line 39) | def __init__(
    method prep_model_for_inference (line 70) | def prep_model_for_inference(self):
    method prep_inference_input (line 87) | def prep_inference_input(self, prompt_tokens) -> Dict[str, Any]:
    method get_batch_for_context_window (line 99) | def get_batch_for_context_window(self, *args, **kwargs) -> Dict[str, A...
    method _forward (line 109) | def _forward(self, inference_input):
    method dummy_forward (line 130) | def dummy_forward(self):
    method _get_batch_size_and_seq_len (line 157) | def _get_batch_size_and_seq_len(
    method _allocate_recv_buffer (line 176) | def _allocate_recv_buffer(self, batch_size, seq_len):
    method forward_pass_without_pipeline_parallel (line 188) | def forward_pass_without_pipeline_parallel(
    method forward_pass_with_pipeline_parallel (line 209) | def forward_pass_with_pipeline_parallel(
    method run_one_forward_step (line 256) | def run_one_forward_step(

FILE: megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py
  class GPTInferenceWrapper (line 19) | class GPTInferenceWrapper(AbstractModelInferenceWrapper):
    method __init__ (line 31) | def __init__(self, model: GPTModel, inference_context: Optional[BaseIn...
    method prep_inference_input (line 34) | def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[s...
    method _build_attention_mask_and_position_ids (line 54) | def _build_attention_mask_and_position_ids(
    method get_batch_for_context_window (line 91) | def get_batch_for_context_window(

FILE: megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py
  class VLMInferenceWrapper (line 18) | class VLMInferenceWrapper(GPTInferenceWrapper):
    method prep_model_for_inference (line 21) | def prep_model_for_inference(self, prompts_tokens: Optional[torch.Tens...
    method prep_inference_input (line 55) | def prep_inference_input(
    method get_batch_for_context_window (line 89) | def get_batch_for_context_window(
    method _forward (line 126) | def _forward(self, inference_input: Dict[str, Any]):
    method run_one_forward_step (line 155) | def run_one_forward_step(self, inference_input: Dict[str, Any]) -> tor...

FILE: megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py
  class T5InferenceWrapper (line 19) | class T5InferenceWrapper(AbstractModelInferenceWrapper):
    method __init__ (line 33) | def __init__(
    method prep_inference_input (line 42) | def prep_inference_input(
    method tokenize_encoder_prompt (line 93) | def tokenize_encoder_prompt(self, encoder_prompt: str, tokenizer) -> t...
    method pad_encoder_prompts_tokens (line 121) | def pad_encoder_prompts_tokens(
    method get_batch_for_context_window (line 144) | def get_batch_for_context_window(
    method forward_pass_without_pipeline_parallel (line 192) | def forward_pass_without_pipeline_parallel(

FILE: megatron/core/inference/moe/__init__.py
  class InferenceGroupedGemmBackend (line 10) | class InferenceGroupedGemmBackend(enum.Enum):
  function resolve_inference_grouped_gemm_backend (line 18) | def resolve_inference_grouped_gemm_backend(

FILE: megatron/core/inference/moe/activations.py
  function _ceil_div (line 28) | def _ceil_div(a, b):
  function _squared_relu_kernel (line 33) | def _squared_relu_kernel(input_ptr, output_ptr, src_idx_ptr, M, N, BLOCK...
  function padded_squared_relu (line 46) | def padded_squared_relu(x: torch.Tensor, permutation_map: torch.Tensor) ...
  function _squared_relu_quantize_kernel (line 56) | def _squared_relu_quantize_kernel(
  function squared_relu_and_quantize_mxfp8 (line 121) | def squared_relu_and_quantize_mxfp8(

FILE: megatron/core/inference/moe/fused_moe.py
  class ActivationType (line 40) | class ActivationType(Enum):
  function _bf16_grouped_mm (line 46) | def _bf16_grouped_mm(
  function _mxfp8_grouped_mm (line 54) | def _mxfp8_grouped_mm(act: MXFP8Tensor, weight: MXFP8Tensor, offs: torch...
  function _get_activation_func (line 70) | def _get_activation_func(activation_type: ActivationType, fused_quant: b...
  function mcore_fused_moe (line 81) | def mcore_fused_moe(

FILE: megatron/core/inference/moe/pad.py
  function _pad_tokens_kernel (line 37) | def _pad_tokens_kernel(
  function pad_to_alignment (line 92) | def pad_to_alignment(
  function _unpad_tokens_kernel (line 140) | def _unpad_tokens_kernel(
  function unpad_from_alignment (line 168) | def unpad_from_alignment(

FILE: megatron/core/inference/moe/permute.py
  function _ceil_div (line 31) | def _ceil_div(a, b):
  function _count_local_tokens_kernel (line 36) | def _count_local_tokens_kernel(
  function compute_local_tokens_per_expert (line 60) | def compute_local_tokens_per_expert(
  function _prefix_sum_kernel (line 79) | def _prefix_sum_kernel(
  function compute_expert_offsets (line 104) | def compute_expert_offsets(tokens_per_expert: torch.Tensor, alignment: i...
  function _permute_tokens_kernel (line 121) | def _permute_tokens_kernel(
  function permute_tokens (line 170) | def permute_tokens(
  function _unpermute_tokens_kernel (line 243) | def _unpermute_tokens_kernel(
  function unpermute_tokens (line 271) | def unpermute_tokens(
  function _permute_quantize_mxfp8_kernel (line 295) | def _permute_quantize_mxfp8_kernel(
  function permute_and_quantize_mxfp8 (line 375) | def permute_and_quantize_mxfp8(

FILE: megatron/core/inference/quantization/mxfp8_quantize.py
  function _ceil_div (line 33) | def _ceil_div(a, b):
  function _mxfp8_quant_swizzle_kernel (line 38) | def _mxfp8_quant_swizzle_kernel(
  function mxfp8_quantize (line 160) | def mxfp8_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:

FILE: megatron/core/inference/quantization/mxfp8_tensor.py
  function _ceil_div (line 20) | def _ceil_div(a, b):
  class MXFP8Tensor (line 25) | class MXFP8Tensor:
    method size (line 32) | def size(self, idx: Optional[int] = None):
    method scale_2d (line 36) | def scale_2d(self, K: Optional[int] = None) -> torch.Tensor:
    method from_bf16 (line 51) | def from_bf16(cls, x: torch.Tensor, group_size: int = 32, backend: str...

FILE: megatron/core/inference/quantization/utils.py
  function _verify_te_to_mcore_mxfp8_conversion (line 32) | def _verify_te_to_mcore_mxfp8_conversion(te_dequantized, fi_quantized: M...
  function quantize_model_to_mxfp8 (line 53) | def quantize_model_to_mxfp8(model: torch.nn.Module, backend: str = "flas...
  function _should_quantize_param (line 97) | def _should_quantize_param(val: torch.Tensor) -> bool:
  function _to_bf16 (line 114) | def _to_bf16(val: torch.Tensor) -> torch.Tensor:
  function collect_mxfp8_param_metadata (line 123) | def collect_mxfp8_param_metadata(
  function quantize_params_to_mxfp8 (line 142) | def quantize_params_to_mxfp8(
  function _mm_mxfp8_flashinfer (line 215) | def _mm_mxfp8_flashinfer(x_mxfp8: MXFP8Tensor, weight: MXFP8Tensor, out=...
  function _mm_mxfp8_torch (line 222) | def _mm_mxfp8_torch(x_mxfp8: MXFP8Tensor, weight: MXFP8Tensor, out=None):
  function mm_mxfp8 (line 241) | def mm_mxfp8(x: torch.Tensor, weight: MXFP8Tensor, out: torch.Tensor = N...

FILE: megatron/core/inference/sampling_params.py
  class SamplingParams (line 9) | class SamplingParams:
    method __post_init__ (line 38) | def __post_init__(self):
    method _sync_prompt_logprobs_fields (line 46) | def _sync_prompt_logprobs_fields(self):
    method add_attributes (line 62) | def add_attributes(self, attribute_value_pair: dict):
    method serialize (line 79) | def serialize(self) -> dict:
    method deserialize (line 84) | def deserialize(cls, data: dict) -> "SamplingParams":

FILE: megatron/core/inference/scheduler.py
  class Scheduler (line 17) | class Scheduler:
    method __init__ (line 28) | def __init__(self, max_batch_size):
    method get_new_request_id (line 37) | def get_new_request_id(self) -> int:
    method add_request (line 42) | def add_request(
    method num_requests_pending (line 124) | def num_requests_pending(self) -> int:
    method have_requests_pending (line 131) | def have_requests_pending(self) -> bool:
    method add_earliest_waiting_request_to_active_pool (line 138) | def add_earliest_waiting_request_to_active_pool(self):
    method update_requests_pools (line 154) | def update_requests_pools(
    method abort_request (line 184) | def abort_request(

FILE: megatron/core/inference/symmetric_memory.py
  class SymmetricMemoryBuffer (line 34) | class SymmetricMemoryBuffer:
    method __init__ (line 41) | def __init__(self, size_in_mb, process_group):
    method _can_allocate (line 60) | def _can_allocate(self, numel, dtype) -> bool:
    method _allocate (line 71) | def _allocate(self, numel, dtype) -> torch.Tensor:
    method maybe_get_tensors (line 77) | def maybe_get_tensors(self, tensor_specs, alignment=16):
    method maybe_get_tensor (line 115) | def maybe_get_tensor(self, tensor_shape, dtype):
  class SymmetricMemoryManager (line 131) | class SymmetricMemoryManager:
    method get_buffer (line 144) | def get_buffer(
    method destroy (line 168) | def destroy(cls, key: Optional[str] = None) -> None:
    method is_initialized (line 180) | def is_initialized(cls, key: str) -> bool:

FILE: megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py
  class EncoderDecoderTextGenerationController (line 13) | class EncoderDecoderTextGenerationController(TextGenerationController):
    method prep_inference_input (line 21) | def prep_inference_input(

FILE: megatron/core/inference/text_generation_controllers/text_generation_controller.py
  class TextGenerationController (line 49) | class TextGenerationController:
    method __init__ (line 60) | def __init__(self, inference_wrapped_model: AbstractModelInferenceWrap...
    method _get_mtp_num_heads (line 90) | def _get_mtp_num_heads(self) -> int:
    method set_stop_word_finished_ids_callback (line 97) | def set_stop_word_finished_ids_callback(self, callback):
    method _init_dynamic_sampling_tensors (line 108) | def _init_dynamic_sampling_tensors(self):
    method _init_mtp_sampling_tensor (line 144) | def _init_mtp_sampling_tensor(self):
    method tokenize_prompt (line 161) | def tokenize_prompt(tokenizer, prompt: str, add_BOS: bool = False) -> ...
    method detokenize (line 187) | def detokenize(
    method detokenize_generations (line 219) | def detokenize_generations(
    method _torch_sampling_func (line 272) | def _torch_sampling_func(
    method sample_from_logits (line 361) | def sample_from_logits(
    method update_generation_status (line 449) | def update_generation_status(
    method pad_input_prompt_tokens (line 495) | def pad_input_prompt_tokens(
    method unpad_input_prompt_tokens (line 531) | def unpad_input_prompt_tokens(
    method _dynamic_step_context_init (line 542) | def _dynamic_step_context_init(
    method _dynamic_step_forward_logits (line 618) | def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids...
    method _dynamic_step_sample_bookkeeping (line 660) | def _dynamic_step_sample_bookkeeping(self):
    method _rewind_kv_cache (line 684) | def _rewind_kv_cache(self):
    method _sample_from_logits_2d (line 793) | def _sample_from_logits_2d(self, logits_2d: Tensor) -> Tensor:
    method _compute_serial_mtp_and_sample (line 818) | def _compute_serial_mtp_and_sample(self):
    method _get_required_logit_indices (line 893) | def _get_required_logit_indices(
    method _sample_speculative_logits (line 934) | def _sample_speculative_logits(
    method _verify_speculative_tokens (line 991) | def _verify_speculative_tokens(
    method _dynamic_step_sample_logits_and_verify_tokens (line 1075) | def _dynamic_step_sample_logits_and_verify_tokens(self, logits: Tensor...
    method _dynamic_step_sample_logits (line 1156) | def _dynamic_step_sample_logits(self, logits: Tensor):
    method _dynamic_step_log_probs_bookkeeping (line 1197) | def _dynamic_step_log_probs_bookkeeping(self) -> Tuple[bool, bool]:
    method _router_record_bookkeeping (line 1211) | def _router_record_bookkeeping(self) -> Optional[Dict[int, Tensor]]:
    method _dynamic_step_calculate_log_probs (line 1271) | def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optiona...
    method _dynamic_step_calculate_log_probs_speculative (line 1282) | def _dynamic_step_calculate_log_probs_speculative(
    method _dynamic_step_calculate_top_n_logprobs_speculative (line 1380) | def _dynamic_step_calculate_top_n_logprobs_speculative(
    method _dynamic_step_calculate_top_n_logprobs (line 1477) | def _dynamic_step_calculate_top_n_logprobs(
    method dummy_forward (line 1561) | def dummy_forward(self):
    method _dummy_serial_mtp_forward (line 1609) | def _dummy_serial_mtp_forward(self):
    method _dynamic_step_context_bookkeeping (line 1666) | def _dynamic_step_context_bookkeeping(self) -> Dict[str, Tensor]:
    method async_generate_output_tokens_dynamic_batch (line 1739) | async def async_generate_output_tokens_dynamic_batch(
    method generate_output_tokens_dynamic_batch (line 1860) | def generate_output_tokens_dynamic_batch(
    method _update_top_n_logprobs_dict (line 1867) | def _update_top_n_logprobs_dict(
    method generate_all_output_tokens_static_batch (line 1899) | def generate_all_output_tokens_static_batch(
    method prep_inference_input (line 2367) | def prep_inference_input(
    method stream_tokens (line 2393) | def stream_tokens(

FILE: megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py
  class VLMTextGenerationController (line 13) | class VLMTextGenerationController(TextGenerationController):
    method prep_inference_input (line 16) | def prep_inference_input(

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/chat_completions.py
  function _get_field (line 20) | def _get_field(obj, key, default=None):
  function _normalize_tool_calls (line 27) | def _normalize_tool_calls(tool_calls):
  function _coerce_arguments_mapping (line 51) | def _coerce_arguments_mapping(arguments):
  function _sanitize_messages_for_template (line 72) | def _sanitize_messages_for_template(messages):
  function _sanitize_tools_for_template (line 116) | def _sanitize_tools_for_template(tools):
  function _reconstruct_reasoning_content (line 143) | def _reconstruct_reasoning_content(messages: list[dict]) -> list[dict]:
  function _replace_prefix_tokens (line 159) | def _replace_prefix_tokens(
  function apply_parsers (line 199) | def apply_parsers(message_text, tools, parsers_list, tools_requested):
  function chat_completions (line 225) | async def chat_completions():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/common.py
  function send_do_generate (line 11) | def send_do_generate():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/completions.py
  function completions (line 20) | async def completions():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/health.py
  function health (line 14) | async def health():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/text_generation_server.py
  function temp_log_level (line 31) | def temp_log_level(level, logger=None):
  function _run_text_gen_server (line 43) | async def _run_text_gen_server(
  function _server_process_worker (line 112) | def _server_process_worker(
  function start_text_gen_server (line 141) | def start_text_gen_server(
  function stop_text_gen_server (line 184) | def stop_text_gen_server():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/tokenization.py
  function tokenize_prompts (line 12) | def tokenize_prompts(
  function _tokenize_prompts_and_batch (line 70) | def _tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, ...

FILE: megatron/core/inference/text_generation_server/endpoints/common.py
  function send_do_generate (line 11) | def send_do_generate():

FILE: megatron/core/inference/text_generation_server/endpoints/completions.py
  function detokenize (line 24) | def detokenize(prompt, tok) -> list[str]:
  class MegatronCompletions (line 46) | class MegatronCompletions(Resource):
    method __init__ (line 49) | def __init__(self, engine, args):
    method post (line 53) | def post(self):

FILE: megatron/core/inference/text_generation_server/run_mcore_engine.py
  function run_mcore_engine (line 12) | def run_mcore_engine(

FILE: megatron/core/inference/text_generation_server/text_generation_server.py
  class MegatronGenerate (line 27) | class MegatronGenerate(Resource):
    method __init__ (line 30) | def __init__(self, engine, args):
    method put (line 35) | def put(self):
  class MegatronServer (line 192) | class MegatronServer(object):
    method __init__ (line 195) | def __init__(self, model, args=None):
    method run (line 204) | def run(self, url, port):

FILE: megatron/core/inference/text_generation_server/tokenization.py
  function tokenize_prompts (line 12) | def tokenize_prompts(
  function _tokenize_prompts_and_batch (line 70) | def _tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, ...

FILE: megatron/core/inference/unified_memory.py
  class CompilationState (line 28) | class CompilationState(Enum):
  class UnifiedMemoryUnsupportedError (line 36) | class UnifiedMemoryUnsupportedError(Exception):
  class UnifiedMemoryCompileTimeoutError (line 40) | class UnifiedMemoryCompileTimeoutError(UnifiedMemoryUnsupportedError):
  function _compile_timeout (line 55) | def _compile_timeout(timeout_s: int):
  function compile_allocator (line 87) | def compile_allocator():
  function create_unified_mempool (line 260) | def create_unified_mempool() -> "MemPool":
  function _get_ctypes_lib (line 283) | def _get_ctypes_lib() -> "ctypes.CDLL":
  function prefetch_managed_tensor (line 317) | def prefetch_managed_tensor(tensor, *, device: int, stream=None) -> None:
  function advise_managed_tensor_preferred_location (line 352) | def advise_managed_tensor_preferred_location(tensor, *, device: int) -> ...
  function advise_managed_tensor_accessed_by (line 381) | def advise_managed_tensor_accessed_by(tensor, *, device: int) -> None:
  function prefetch_managed_module_parameters (line 410) | def prefetch_managed_module_parameters(
  function advise_managed_module_parameters_preferred_location (line 481) | def advise_managed_module_parameters_preferred_location(

FILE: megatron/core/inference/utils.py
  function device_memory_summary (line 20) | def device_memory_summary() -> str:
  class Counter (line 43) | class Counter:
    method __init__ (line 49) | def __init__(self, start: int = 0) -> None:
    method __next__ (line 52) | def __next__(self) -> int:
    method reset (line 57) | def reset(self) -> None:
  function get_attention_mask (line 62) | def get_attention_mask(seq_length: int) -> torch.Tensor:
  function _init_moe_expert_cache (line 78) | def _init_moe_expert_cache(model):
  function set_decode_expert_padding (line 103) | def set_decode_expert_padding(model, set_to: bool = False, capacity_fact...
  function check_flashinfer_jit_cache_installed (line 165) | def check_flashinfer_jit_cache_installed(log_version: bool = False):
  function set_inference_cuda_graphed_iteration_for_ep_inference (line 204) | def set_inference_cuda_graphed_iteration_for_ep_inference(model):
  function unset_inference_cuda_graphed_iteration_for_ep_inference (line 219) | def unset_inference_cuda_graphed_iteration_for_ep_inference(model):
  function tensor_swap (line 232) | def tensor_swap(x, src_idxs, dst_idxs):
  function await_process_call (line 239) | async def await_process_call(call, process: multiprocessing.Process, tim...
  class asyncio_QueueShutDown (line 267) | class asyncio_QueueShutDown(Exception):
  class asyncio_Queue (line 272) | class asyncio_Queue(asyncio.Queue):
    method __init__ (line 275) | def __init__(self, maxsize: int = 0):
    method get (line 279) | async def get(self):
    method put_nowait (line 290) | def put_nowait(self, item):
    method shutdown (line 298) | def shutdown(self):

FILE: megatron/core/jit.py
  function noop_decorator (line 11) | def noop_decorator(func):
  function enable_jit_fuser (line 16) | def enable_jit_fuser():
  function disable_jit_fuser (line 27) | def disable_jit_fuser():

FILE: megatron/core/model_parallel_config.py
  class ModelParallelConfig (line 11) | class ModelParallelConfig:
    method __post_init__ (line 401) | def __post_init__(self):

FILE: megatron/core/models/T5/t5_model.py
  class T5LMHead (line 27) | class T5LMHead(MegatronModule):
    method __init__ (line 39) | def __init__(
    method forward (line 71) | def forward(self, hidden_states: Tensor, word_embeddings_weight: Tenso...
  class T5Model (line 86) | class T5Model(LanguageModule):
    method __init__ (line 136) | def __init__(
    method forward (line 279) | def forward(
    method set_input_tensor (line 441) | def set_input_tensor(self, input_tensor):
    method shared_embedding_or_output_weight (line 471) | def shared_embedding_or_output_weight(self) -> Tensor:
    method sharded_state_dict (line 480) | def sharded_state_dict(
  function t5_extended_attention_mask (line 504) | def t5_extended_attention_mask(attention_mask_list: List[Tensor]) -> Lis...
  function t5_position_ids (line 528) | def t5_position_ids(token_ids: Tensor) -> Tensor:

FILE: megatron/core/models/T5/t5_spec.py
  function encoder_model_with_transformer_engine_default_spec (line 54) | def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
  function decoder_model_with_transformer_engine_default_spec (line 84) | def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
  function encoder_model_with_local_spec (line 126) | def encoder_model_with_local_spec() -> ModuleSpec:
  function decoder_model_with_local_spec (line 161) | def decoder_model_with_local_spec() -> ModuleSpec:
  function get_t5_encoder_with_transformer_engine_block_spec (line 208) | def get_t5_encoder_with_transformer_engine_block_spec(
  function get_t5_decoder_with_transformer_engine_block_spec (line 222) | def get_t5_decoder_with_transformer_engine_block_spec(
  function get_t5_encoder_with_local_block_spec (line 236) | def get_t5_encoder_with_local_block_spec(num_layers: int) -> Transformer...
  function get_t5_decoder_with_local_block_spec (line 248) | def get_t5_decoder_with_local_block_spec(num_layers: int) -> Transformer...

FILE: megatron/core/models/backends.py
  class BackendSpecProvider (line 51) | class BackendSpecProvider(Protocol):
    method column_parallel_linear (line 55) | def column_parallel_linear(self) -> type:
    method row_parallel_linear (line 60) | def row_parallel_linear(self) -> type:
    method fuse_layernorm_and_linear (line 65) | def fuse_layernorm_and_linear(self) -> bool:
    method column_parallel_layer_norm_linear (line 70) | def column_parallel_layer_norm_linear(self) -> Optional[type]:
    method layer_norm (line 75) | def layer_norm(
    method core_attention (line 82) | def core_attention(self) -> type:
    method grouped_mlp_modules (line 87) | def grouped_mlp_modules(
    method activation_func (line 94) | def activation_func(self) -> TEActivationFunctionBuilder | None:
  class LocalSpecProvider (line 99) | class LocalSpecProvider(BackendSpecProvider):
    method column_parallel_linear (line 102) | def column_parallel_linear(self) -> type:
    method row_parallel_linear (line 106) | def row_parallel_linear(self) -> type:
    method fuse_layernorm_and_linear (line 110) | def fuse_layernorm_and_linear(self) -> bool:
    method column_parallel_layer_norm_linear (line 114) | def column_parallel_layer_norm_linear(self) -> Optional[type]:
    method layer_norm (line 118) | def layer_norm(
    method core_attention (line 129) | def core_attention(self) -> type:
    method grouped_mlp_modules (line 133) | def grouped_mlp_modules(
    method activation_func (line 141) | def activation_func(self) -> TEActivationFunctionBuilder | None:
  class InferenceSpecProvider (line 146) | class InferenceSpecProvider(BackendSpecProvider):
    method linear (line 149) | def linear(self) -> type:
    method column_parallel_linear (line 153) | def column_parallel_linear(self) -> type:
    method row_parallel_linear (line 157) | def row_parallel_linear(self) -> type:
    method fuse_layernorm_and_linear (line 161) | def fuse_layernorm_and_linear(self) -> bool:
    method column_parallel_layer_norm_linear (line 165) | def column_parallel_layer_norm_linear(self) -> type[InferenceLayerNorm...
    method layer_norm (line 169) | def layer_norm(
    method core_attention (line 180) | def core_attention(self) -> type[TEDotProductAttention]:
    method activation_func (line 184) | def activation_func(self) -> TEActivationFunctionBuilder | None:
    method grouped_mlp_modules (line 190) | def grouped_mlp_modules(

FILE: megatron/core/models/bert/bert_layer_specs.py
  function get_bert_layer_with_transformer_engine_submodules (line 44) | def get_bert_layer_with_transformer_engine_submodules() -> TransformerLa...
  function get_bert_layer_with_transformer_engine_spec (line 80) | def get_bert_layer_with_transformer_engine_spec():
  function __getattr__ (line 91) | def __getattr__(name):

FILE: megatron/core/models/bert/bert_lm_head.py
  class BertLMHead (line 19) | class BertLMHead(MegatronModule):
    method __init__ (line 27) | def __init__(self, hidden_size: int, config: TransformerConfig):
    method forward (line 44) | def forward(self, hidden_states: Tensor) -> Tensor:

FILE: megatron/core/models/bert/bert_model.py
  class BertModel (line 31) | class BertModel(LanguageModule):
    method __init__ (line 54) | def __init__(
    method _sanity_check_attention_and_get_attn_mask_dimension (line 164) | def _sanity_check_attention_and_get_attn_mask_dimension(self) -> str:
    method bert_extended_attention_mask (line 240) | def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor:
    method bert_position_ids (line 272) | def bert_position_ids(self, token_ids):
    method set_input_tensor (line 281) | def set_input_tensor(self, input_tensor: Tensor) -> None:
    method forward (line 297) | def forward(

FILE: megatron/core/models/bert/pooler.py
  class Pooler (line 11) | class Pooler(MegatronModule):
    method __init__ (line 24) | def __init__(
    method forward (line 38) | def forward(self, hidden_states: Tensor, sequence_index=0):

FILE: megatron/core/models/common/embeddings/language_model_embedding.py
  class LanguageModelEmbedding (line 14) | class LanguageModelEmbedding(MegatronModule):
    method __init__ (line 29) | def __init__(
    method zero_parameters (line 88) | def zero_parameters(self):
    method forward (line 99) | def forward(self, input_ids: Tensor, position_ids: Tensor, tokentype_i...

FILE: megatron/core/models/common/embeddings/relative_pos_embedding.py
  class RelativePositionEmbedding (line 21) | class RelativePositionEmbedding(nn.Module):
    method __init__ (line 28) | def __init__(
    method _relative_position_bucket (line 46) | def _relative_position_bucket(
    method _compute_bias (line 100) | def _compute_bias(self, query_length, key_length):
    method get_relative_seq_len (line 138) | def get_relative_seq_len(
    method forward (line 175) | def forward(self, query_seq_length, key_seq_length):

FILE: megatron/core/models/common/embeddings/rope_utils.py
  function get_pos_emb_on_this_cp_rank (line 48) | def get_pos_emb_on_this_cp_rank(
  function _rotate_half (line 73) | def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor:
  function _apply_rotary_pos_emb_bshd (line 92) | def _apply_rotary_pos_emb_bshd(
  function _get_thd_freqs_on_this_cp_rank (line 129) | def _get_thd_freqs_on_this_cp_rank(
  function _apply_rotary_pos_emb_thd (line 178) | def _apply_rotary_pos_emb_thd(
  function apply_rotary_pos_emb (line 250) | def apply_rotary_pos_emb(
  function apply_rotary_pos_emb_with_cos_sin (line 319) | def apply_rotary_pos_emb_with_cos_sin(

FILE: megatron/core/models/common/embeddings/rotary_pos_embedding.py
  class RotaryEmbedding (line 36) | class RotaryEmbedding(nn.Module):
    method __init__ (line 58) | def __init__(
    method _apply_scaling (line 92) | def _apply_scaling(
    method get_freqs_non_repeated (line 127) | def get_freqs_non_repeated(self, max_seq_len: int, offset: int = 0) ->...
    method get_cos_sin (line 142) | def get_cos_sin(self, max_seq_len: int, offset: int = 0) -> (Tensor, T...
    method get_emb (line 150) | def get_emb(self, max_seq_len: int, offset: int = 0) -> Tensor:
    method forward (line 179) | def forward(
    method _load_from_state_dict (line 208) | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
    method get_rotary_seq_len (line 212) | def get_rotary_seq_len(
  class MultimodalRotaryEmbedding (line 266) | class MultimodalRotaryEmbedding(nn.Module):
    method __init__ (line 285) | def __init__(
    method forward (line 315) | def forward(

FILE: megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py
  class YarnRotaryEmbedding (line 21) | class YarnRotaryEmbedding(RotaryEmbedding):
    method __init__ (line 49) | def __init__(
    method get_emb (line 106) | def get_emb(self, max_seq_len: int, offset: int = 0) -> Tensor:
    method forward (line 161) | def forward(
    method _set_cos_sin_cache (line 189) | def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq=False):
    method get_cached_cos_sin (line 203) | def get_cached_cos_sin(
  function _yarn_find_correction_dim (line 218) | def _yarn_find_correction_dim(
  function _yarn_find_correction_range (line 227) | def _yarn_find_correction_range(
  function _yarn_linear_ramp_mask (line 243) | def _yarn_linear_ramp_mask(min: float, max: float, dim: int, device: tor...
  function _yarn_get_mscale (line 252) | def _yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
  function _yarn_get_concentration_factor (line 259) | def _yarn_get_concentration_factor(
  function _yarn_get_concentration_factor_from_config (line 274) | def _yarn_get_concentration_factor_from_config(config: TransformerConfig...

FILE: megatron/core/models/common/language_module/language_module.py
  class LanguageModule (line 36) | class LanguageModule(MegatronModule):
    method __init__ (line 44) | def __init__(
    method _is_in_embd_group (line 66) | def _is_in_embd_group(self):
    method _set_attention_backend (line 93) | def _set_attention_backend(self):
    method compute_language_model_loss (line 129) | def compute_language_model_loss(self, labels: Tensor, logits: Tensor) ...
    method setup_embeddings_and_output_layer (line 174) | def setup_embeddings_and_output_layer(self) -> None:
    method _scale_logits (line 287) | def _scale_logits(self, logits: Tensor) -> Tensor:
    method shared_embedding_or_output_weight (line 306) | def shared_embedding_or_output_weight(self) -> Tensor:
    method sharded_state_dict (line 326) | def sharded_state_dict(
    method tie_embeddings_and_output_weights_state_dict (line 382) | def tie_embeddings_and_output_weights_state_dict(

FILE: megatron/core/models/common/model_chunk_schedule_plan.py
  class ModelChunkState (line 19) | class ModelChunkState:
  class TransformerLayerSchedulePlan (line 29) | class TransformerLayerSchedulePlan:
    method __init__ (line 56) | def __init__(self, layer, event, chunk_state, comp_stream, comm_stream...
    method release_state (line 85) | def release_state(self):
    method _build_callable_nodes (line 108) | def _build_callable_nodes(self, event, comp_stream, comm_stream, extra...
    method get_fp8_context (line 175) | def get_fp8_context(self):
    method run (line 189) | def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_...
  class TransformerModelChunkSchedulePlan (line 256) | class TransformerModelChunkSchedulePlan(AbstractSchedulePlan):
    method __init__ (line 271) | def __init__(
    method _build_layer_schedule_plan (line 349) | def _build_layer_schedule_plan(self, module, comp_stream, comm_stream):
    method event (line 369) | def event(self):
    method record_current_stream (line 373) | def record_current_stream(self):
    method wait_current_stream (line 378) | def wait_current_stream(self):
    method get_layer (line 383) | def get_layer(self, i):
    method pop_layer (line 388) | def pop_layer(self):
    method num_layers (line 392) | def num_layers(self):
    method state (line 397) | def state(self):
    method release_state (line 401) | def release_state(self):
    method run (line 412) | def run(

FILE: megatron/core/models/common/vision_module/vision_module.py
  class VisionModule (line 9) | class VisionModule(MegatronModule):
    method __init__ (line 16) | def __init__(self, config: TransformerConfig) -> None:

FILE: megatron/core/models/gpt/experimental_attention_variant_module_specs.py
  function get_gated_delta_net_module_spec (line 56) | def get_gated_delta_net_module_spec(
  function get_dsa_module_spec_for_backend (line 77) | def get_dsa_module_spec_for_backend(
  function get_experimental_attention_variant_module_spec (line 131) | def get_experimental_attention_variant_module_spec(
  function get_transformer_block_with_experimental_attention_variant_spec (line 152) | def get_transformer_block_with_experimental_attention_variant_spec(
  function is_linear_attention_variant (line 285) | def is_linear_attention_variant(experimental_attention_variant: Optional...
  function get_moe_layer_pattern (line 291) | def get_moe_layer_pattern(config: TransformerConfig) -> List[int]:
  function get_linear_attention_pattern (line 316) | def get_linear_attention_pattern(config: TransformerConfig) -> List[int]:
  function _get_backend_spec_provider (line 353) | def _get_backend_spec_provider(config: TransformerConfig) -> BackendSpec...
  function _get_self_attention_module_spec (line 377) | def _get_self_attention_module_spec(
  function _get_dense_mlp_module_spec (line 411) | def _get_dense_mlp_module_spec(
  function _get_moe_module_spec (line 430) | def _get_moe_module_spec(

FILE: megatron/core/models/gpt/fine_grained_callables.py
  function weak_method (line 29) | def weak_method(method):
  function should_free_input (line 47) | def should_free_input(name, is_moe, config, num_local_experts):
  class TransformerLayerState (line 102) | class TransformerLayerState:
  class PreProcessNode (line 112) | class PreProcessNode(ScheduleNode):
    method __init__ (line 119) | def __init__(self, gpt_model, chunk_state, event, stream):
    method forward_impl (line 132) | def forward_impl(self):
  class PostProcessNode (line 172) | class PostProcessNode(ScheduleNode):
    method __init__ (line 179) | def __init__(self, gpt_model, chunk_state, event, stream):
    method forward_impl (line 192) | def forward_impl(self, hidden_states):
  class TransformerLayerNode (line 237) | class TransformerLayerNode(ScheduleNode):
    method __init__ (line 244) | def __init__(
    method detach (line 302) | def detach(self, t):
    method forward_impl (line 310) | def forward_impl(self, *args):
    method backward_impl (line 314) | def backward_impl(self, outputs, output_grad):
    method backward_dw (line 329) | def backward_dw(self):
    method __del__ (line 348) | def __del__(self):
  class _BackwardDWWrapper (line 357) | class _BackwardDWWrapper:
    method __init__ (line 370) | def __init__(self, layer):
    method backward_dw (line 388) | def backward_dw(self):
    method set_graphed_backward_dw_callable (line 401) | def set_graphed_backward_dw_callable(self, graphed_backward_dw_callable):
  function build_transformer_layer_callables (line 406) | def build_transformer_layer_callables(layer: TransformerLayer):
  function build_mtp_layer_callables (line 638) | def build_mtp_layer_callables(layer):
  function build_layer_callables (line 721) | def build_layer_callables(layer):

FILE: megatron/core/models/gpt/gpt_layer_specs.py
  function get_gpt_layer_with_inference_submodules (line 73) | def get_gpt_layer_with_inference_submodules(
  function get_gpt_layer_with_inference_spec (line 166) | def get_gpt_layer_with_inference_spec(*args, **kwargs) -> ModuleSpec:
  function get_gpt_layer_with_transformer_engine_submodules (line 173) | def get_gpt_layer_with_transformer_engine_submodules(
  function get_gpt_layer_with_transformer_engine_spec (line 343) | def get_gpt_layer_with_transformer_engine_spec(*args, **kwargs) -> Modul...
  function get_gpt_layer_local_submodules (line 351) | def get_gpt_layer_local_submodules(
  function get_gpt_layer_local_spec (line 459) | def get_gpt_layer_local_spec(*args, **kwargs) -> ModuleSpec:
  function _get_mlp_module_spec (line 466) | def _get_mlp_module_spec(
  function get_mlp_module_spec (line 482) | def get_mlp_module_spec(
  function get_mlp_module_spec_for_backend (line 513) | def get_mlp_module_spec_for_backend(
  function get_gpt_decoder_layer_specs (line 549) | def get_gpt_decoder_layer_specs(
  function get_gpt_decoder_block_spec (line 657) | def get_gpt_decoder_block_spec(
  function get_gpt_mtp_block_spec (line 700) | def get_gpt_mtp_block_spec(
  function get_gpt_mtp_block_spec_for_backend (line 733) | def get_gpt_mtp_block_spec_for_backend(

FILE: megatron/core/models/gpt/gpt_model.py
  class GPTModel (line 43) | class GPTModel(LanguageModule):
    method __init__ (line 86) | def __init__(
    method set_input_tensor (line 275) | def set_input_tensor(self, input_tensor: Tensor) -> None:
    method _preprocess (line 291) | def _preprocess(
    method preprocess_for_fine_grained_offloading (line 459) | def preprocess_for_fine_grained_offloading(self):
    method forward (line 477) | def forward(
    method _postprocess (line 571) | def _postprocess(
    method compute_mtp_single_step (line 714) | def compute_mtp_single_step(
    method build_schedule_plan (line 756) | def build_schedule_plan(
    method sharded_state_dict (line 820) | def sharded_state_dict(

FILE: megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py
  function _get_layer_norm (line 72) | def _get_layer_norm(config: AttentionConfig | MLPConfig, use_te: bool, n...
  function _get_qk_layernorm (line 81) | def _get_qk_layernorm(use_te: bool, normalization: str):
  function _get_heterogenous_attention_spec (line 99) | def _get_heterogenous_attention_spec(
  function _get_heterogenous_mlp_spec (line 129) | def _get_heterogenous_mlp_spec(mlp_config: MLPConfig, use_te: bool):
  function _get_sharded_state_dict_keys_map (line 152) | def _get_sharded_state_dict_keys_map(block_config: TransformerBlockConfi...
  function get_gpt_heterogeneous_layer_spec (line 176) | def get_gpt_heterogeneous_layer_spec(

FILE: megatron/core/models/gpt/moe_module_specs.py
  function get_moe_module_spec (line 18) | def get_moe_module_spec(
  function get_moe_module_spec_for_backend (line 43) | def get_moe_module_spec_for_backend(
  function get_inference_optimized_moe_spec (line 80) | def get_inference_optimized_moe_spec() -> ModuleSpec:

FILE: megatron/core/models/huggingface/clip_model.py
  class SiglipHuggingFaceModel (line 19) | class SiglipHuggingFaceModel(HuggingFaceModule):
    method __init__ (line 27) | def __init__(self, config):
    method forward (line 37) | def forward(self, *args, **kwargs):

FILE: megatron/core/models/huggingface/module.py
  class HuggingFaceModule (line 15) | class HuggingFaceModule(MegatronModule):
    method __init__ (line 20) | def __init__(self, config):
    method set_input_tensor (line 23) | def set_input_tensor(self, input_tensor):
    method __setattr__ (line 27) | def __setattr__(self, name: str, value):
  class AutoHuggingFaceModel (line 42) | class AutoHuggingFaceModel(HuggingFaceModule):
    method __init__ (line 47) | def __init__(self, config):
    method forward (line 57) | def forward(self, *args, **kwargs):
  function get_hf_model_type (line 62) | def get_hf_model_type(model_path):
  function build_hf_model (line 82) | def build_hf_model(config, model_path):

FILE: megatron/core/models/huggingface/qwen_model.py
  class QwenHuggingFaceModel (line 21) | class QwenHuggingFaceModel(HuggingFaceModule):
    method __init__ (line 29) | def __init__(self, config):
    method forward (line 39) | def forward(self, *args, **kwargs):
    method embedding (line 57) | def embedding(self, input_ids, position_ids=None):

FILE: megatron/core/models/mamba/mamba_model.py
  class MambaModel (line 37) | class MambaModel(LanguageModule):
    method __init__ (line 85) | def __init__(
    method set_input_tensor (line 284) | def set_input_tensor(self, input_tensor: Tensor) -> None:
    method forward (line 300) | def forward(
    method compute_mtp_single_step (line 479) | def compute_mtp_single_step(

FILE: megatron/core/models/mimo/config/base_configs.py
  class MimoModelConfig (line 11) | class MimoModelConfig:

FILE: megatron/core/models/mimo/model/base.py
  class MimoModel (line 18) | class MimoModel(MegatronModule):
    method __init__ (line 37) | def __init__(self, mimo_config: MimoModelConfig, cp_group=None, tp_gro...
    method align_embeddings_by_token_positions (line 87) | def align_embeddings_by_token_positions(
    method _initialize_submodules (line 155) | def _initialize_submodules(self) -> None:
    method _initialize_language_model (line 171) | def _initialize_language_model(self) -> None:
    method set_input_tensor (line 178) | def set_input_tensor(self, input_tensor):
    method get_text_embeddings (line 199) | def get_text_embeddings(
    method forward (line 233) | def forward(

FILE: megatron/core/models/mimo/partition/utils.py
  class PartitionConfig (line 37) | class PartitionConfig:
    method is_partitioning_enabled (line 54) | def is_partitioning_enabled(self) -> bool:
    method from_mp_config (line 59) | def from_mp_config(
  class PartitionAdapter (line 91) | class PartitionAdapter:
    method __init__ (line 94) | def __init__(self, cfg: PartitionConfig):
    method shard (line 101) | def shard(
    method _apply_context_parallel (line 185) | def _apply_context_parallel(

FILE: megatron/core/models/mimo/submodules/audio.py
  class AudioModalitySubmodules (line 15) | class AudioModalitySubmodules(ModalitySubmodules):
    method __init__ (line 18) | def __init__(
    method encode (line 47) | def encode(self, encoders_data_batch: Dict) -> List[torch.Tensor]:
    method decode (line 90) | def decode(self, embeddings: torch.Tensor, data_batch: Dict) -> torch....
    method combine_embeddings (line 94) | def combine_embeddings(self, embeddings: List[torch.Tensor]) -> torch....
    method project_embeddings (line 108) | def project_embeddings(
    method forward (line 129) | def forward(self, encoder_inputs: Dict[str, Any]) -> Optional[torch.Te...

FILE: megatron/core/models/mimo/submodules/base.py
  class ModalitySubmodules (line 17) | class ModalitySubmodules(ABC, nn.Module):
    method __init__ (line 39) | def __init__(
    method from_spec (line 62) | def from_spec(cls, module_spec: ModuleSpec) -> 'ModalitySubmodules':
    method combine_embeddings (line 124) | def combine_embeddings(self, embeddings: List[torch.Tensor]) -> torch....
    method encode (line 137) | def encode(self, data_batch: Dict) -> List[torch.Tensor]:
    method decode (line 150) | def decode(self, embeddings: torch.Tensor, data_batch: Dict) -> torch....
    method project_embeddings (line 165) | def project_embeddings(
    method forward (line 182) | def forward(self, encoder_inputs: Dict[str, Any]) -> Optional[torch.Te...

FILE: megatron/core/models/mimo/submodules/vision.py
  class VisionModalitySubmodules (line 15) | class VisionModalitySubmodules(ModalitySubmodules):
    method __init__ (line 21) | def __init__(
    method encode (line 55) | def encode(self, encoders_data_batch: Dict) -> List[torch.Tensor]:
    method decode (line 99) | def decode(self, embeddings: torch.Tensor, data_batch: Dict) -> torch....
    method combine_embeddings (line 112) | def combine_embeddings(self, embeddings: List[torch.Tensor]) -> torch....
    method project_embeddings (line 135) | def project_embeddings(
    method forward (line 163) | def forward(self, encoder_inputs: Dict[str, Any]) -> Optional[torch.Te...

FILE: megatron/core/models/multimodal/context_parallel.py
  function get_padding (line 9) | def get_padding(
  function get_packed_seq_params (line 62) | def get_packed_seq_params(tokens, img_seq_len, padding_needed, cp_size, ...

FILE: megatron/core/models/multimodal/llava_model.py
  class LLaVAModel (line 51) | class LLaVAModel(MegatronModule):
    method __init__ (line 91) | def __init__(
    method shared_embedding_or_output_weight (line 389) | def shared_embedding_or_output_weight(self):
    method set_input_tensor (line 396) | def set_input_tensor(self, input_tensor) -> None:
    method freeze (line 413) | def freeze(
    method _preprocess_data (line 437) | def _preprocess_data(
    method _process_embedding_token_parallel (line 669) | def _process_embedding_token_parallel(
    method _apply_tile_tagging (line 762) | def _apply_tile_tagging(self, image_embeddings, num_image_tiles):
    method forward (line 798) | def forward(
  function _load_state_dict_hook_ignore_param_names (line 949) | def _load_state_dict_hook_ignore_param_names(
  function _load_state_dict_hook_ignore_extra_state (line 973) | def _load_state_dict_hook_ignore_extra_state(
  function pixel_shuffle (line 998) | def pixel_shuffle(x, scale_factor=0.5, version=2):

FILE: megatron/core/models/multimodal/llava_spec.py
  function decoder_model_with_transformer_engine_default_spec (line 37) | def decoder_model_with_transformer_engine_default_spec(
  function decoder_model_with_local_default_spec (line 65) | def decoder_model_with_local_default_spec(

FILE: megatron/core/models/vision/clip_vit_model.py
  class CLIPViTModel (line 26) | class CLIPViTModel(VisionModule):
    method __init__ (line 42) | def __init__(
    method set_input_tensor (line 156) | def set_input_tensor(self, input_tensor: torch.Tensor) -> None:
    method forward (line 164) | def forward(
  function get_num_image_embeddings (line 205) | def get_num_image_embeddings(

FILE: megatron/core/models/vision/multimodal_projector.py
  class MultimodalProjector (line 14) | class MultimodalProjector(MegatronModule):
    method __init__ (line 28) | def __init__(
    method forward (line 63) | def forward(self, hidden_states):

FILE: megatron/core/models/vision/radio.py
  class RADIOViTModel (line 29) | class RADIOViTModel(VisionModule):
    method __init__ (line 50) | def __init__(
    method set_input_tensor (line 168) | def set_input_tensor(self, input_tensor: torch.Tensor) -> None:
    method forward (line 176) | def forward(
    method apply_pos_enc (line 237) | def apply_pos_enc(
    method get_pos_enc (line 257) | def get_pos_enc(
    method _get_pos_embeddings (line 281) | def _get_pos_embeddings(self, batch_size: int, input_dims: Tuple[int, ...
  function fp8_pad_hook (line 359) | def fp8_pad_hook(

FILE: megatron/core/models/vision/vit_layer_specs.py
  function get_vit_layer_with_transformer_engine_spec (line 36) | def get_vit_layer_with_transformer_engine_spec() -> ModuleSpec:
  function get_vit_layer_with_local_spec (line 61) | def get_vit_layer_with_local_spec() -> ModuleSpec:
  function _get_mlp_module_spec (line 88) | def _get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:

FILE: megatron/core/msc_utils.py
  class _FeatureFlag (line 18) | class _FeatureFlag:
    method __init__ (line 20) | def __init__(self, default: bool = False):
    method enable (line 23) | def enable(self) -> None:
    method disable (line 27) | def disable(self) -> None:
    method is_enabled (line 31) | def is_enabled(self) -> bool:
    method import_package (line 35) | def import_package(self) -> Any:
    method __getstate__ (line 48) | def __getstate__(self):
    method __setstate__ (line 52) | def __setstate__(self, state):
  function open_file (line 60) | def open_file(*args, **kwargs):

FILE: megatron/core/nccl_allocator.py
  function _build_nccl_allocator (line 25) | def _build_nccl_allocator():
  function get_func_args (line 101) | def get_func_args(func):
  function create_nccl_mem_pool (line 111) | def create_nccl_mem_pool(symmetric=None):  # symmetric: bool | None = No...
  function init (line 143) | def init() -> None:
  function register_mem_pool (line 163) | def register_mem_pool(pool, group, symmetric=True):
  function deregister_mem_pool (line 185) | def deregister_mem_pool(pool, group):
  class nccl_mem (line 195) | class nccl_mem:
    method __init__ (line 200) | def __init__(self, pool, enabled=True, device=None, group=None, symmet...
    method __enter__ (line 225) | def __enter__(self):
    method __exit__ (line 244) | def __exit__(self, *args):
  class MultiGroupMemPoolAllocator (line 276) | class MultiGroupMemPoolAllocator:
    method __init__ (line 301) | def __init__(
    method __enter__ (line 315) | def __enter__(self):
    method __exit__ (line 335) | def __exit__(self, *args):
  class MemPoolAllocatorWithoutRegistration (line 367) | class MemPoolAllocatorWithoutRegistration:
    method __init__ (line 373) | def __init__(self, pool):
    method __enter__ (line 377) | def __enter__(self):
    method __exit__ (line 380) | def __exit__(self, *args):

FILE: megatron/core/num_microbatches_calculator.py
  function get_num_microbatches (line 17) | def get_num_microbatches() -> int:
  function get_current_global_batch_size (line 22) | def get_current_global_batch_size() -> int:
  function get_micro_batch_size (line 27) | def get_micro_batch_size() -> int:
  function get_current_running_global_batch_size (line 32) | def get_current_running_global_batch_size() -> int:
  function update_num_microbatches (line 38) | def update_num_microbatches(
  function unset_num_microbatches_calculator (line 54) | def unset_num_microbatches_calculator():
  function init_num_microbatches_calculator (line 64) | def init_num_microbatches_calculator(
  function destroy_num_microbatches_calculator (line 101) | def destroy_num_microbatches_calculator():
  function reconfigure_num_microbatches_calculator (line 107) | def reconfigure_num_microbatches_calculator(
  function _configure_global_num_microbatches_calculator (line 144) | def _configure_global_num_microbatches_calculator(
  function _build_num_microbatches_calculator (line 191) | def _build_num_microbatches_calculator(
  function _round (line 261) | def _round(batch_size: int, divisor: int) -> int:
  class NumMicroBatchesCalculator (line 266) | class NumMicroBatchesCalculator(ABC):
    method __init__ (line 269) | def __init__(self) -> None:
    method get (line 275) | def get(self) -> int:
    method get_current_global_batch_size (line 279) | def get_current_global_batch_size(self) -> int:
    method get_micro_batch_size (line 283) | def get_micro_batch_size(self) -> int:
    method get_current_running_global_batch_size (line 287) | def get_current_running_global_batch_size(self) -> int:
    method update (line 293) | def update(self, consumed_samples, consistency_check, verbose=False) -...
  class ConstantNumMicroBatchesCalculator (line 298) | class ConstantNumMicroBatchesCalculator(NumMicroBatchesCalculator):
    method __init__ (line 315) | def __init__(
    method update (line 356) | def update(self, consumed_samples, consistency_check, verbose=False) -...
  class RampupBatchsizeNumMicroBatchesCalculator (line 360) | class RampupBatchsizeNumMicroBatchesCalculator(NumMicroBatchesCalculator):
    method __init__ (line 387) | def __init__(
    method update (line 441) | def update(self, consumed_samples: int, consistency_check: bool, verbo...

FILE: megatron/core/optimizer/__init__.py
  function get_standard_config_overrides (line 77) | def get_standard_config_overrides(config: OptimizerConfig) -> Dict[Param...
  function get_mup_config_overrides (line 115) | def get_mup_config_overrides(
  function _get_param_groups (line 260) | def _get_param_groups(
  function _get_param_groups_and_buffers (line 384) | def _get_param_groups_and_buffers(
  function _get_megatron_optimizer_based_on_param_groups (line 419) | def _get_megatron_optimizer_based_on_param_groups(
  function check_config_overrides_consistency (line 649) | def check_config_overrides_consistency(
  function get_megatron_optimizer (line 676) | def get_megatron_optimizer(

FILE: megatron/core/optimizer/clip_grads.py
  function get_grad_norm_fp32 (line 51) | def get_grad_norm_fp32(
  function clip_grad_by_total_norm_fp32 (line 138) | def clip_grad_by_total_norm_fp32(
  function count_zeros_fp32 (line 180) | def count_zeros_fp32(

FILE: megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py
  function _param_generator (line 8) | def _param_generator(cpu_optimizer):
  class HybridDeviceOptimizer (line 14) | class HybridDeviceOptimizer(torch.optim.Optimizer):
    method __init__ (line 45) | def __init__(
    method _set_sub_optimizer_grads (line 83) | def _set_sub_optimizer_grads(self):
    method _register_param_copy_back_gpu_hook (line 117) | def _register_param_copy_back_gpu_hook(self):
    method step (line 150) | def step(self, closure=None):
    method _init_sub_optimizers (line 181) | def _init_sub_optimizers(self):
    method build_cpu_optimizer_list (line 227) | def build_cpu_optimizer_list(cpu_optimizer_cls, cpu_param_groups):
    method _get_sub_optimizer_param_groups (line 251) | def _get_sub_optimizer_param_groups(self, offload_fraction: float):
    method _sync_sub_optimizers_state_to_hdo (line 302) | def _sync_sub_optimizers_state_to_hdo(self):
    method _sync_hdo_state_to_sub_optimizers (line 323) | def _sync_hdo_state_to_sub_optimizers(self):
    method _sync_hdo_param_groups_to_sub_optimizers (line 334) | def _sync_hdo_param_groups_to_sub_optimizers(self):
    method _move_new_state_to_right_device (line 357) | def _move_new_state_to_right_device(self):
    method _update_fp32_params_by_new_state (line 369) | def _update_fp32_params_by_new_state(self):
    method update_fp32_param_by_new_param (line 376) | def update_fp32_param_by_new_param(self):
    method _register_load_state_dict_hooks (line 383) | def _register_load_state_dict_hooks(self):
    method zero_grad (line 440) | def zero_grad(self, set_to_none: bool = True):
    method dummy_step (line 453) | def dummy_step(self):
    method sub_optimizers (line 466) | def sub_optimizers(self):

FILE: megatron/core/optimizer/distrib_optimizer.py
  class Range (line 62) | class Range:
    method __init__ (line 72) | def __init__(self, start: int, end: int):
    method normalize (line 77) | def normalize(self, start: int = 0):
    method __str__ (line 87) | def __str__(self):
    method __repr__ (line 90) | def __repr__(self):
    method __len__ (line 93) | def __len__(self):
  class DistributedOptimizer (line 97) | class DistributedOptimizer(MixedPrecisionOptimizer):
    method _build_model_gbuf_param_range_map (line 112) | def _build_model_gbuf_param_range_map(
    method _build_model_gbuf_range (line 174) | def _build_model_gbuf_range(cls, param_and_grad_buffer: _ParamAndGradB...
    method _build_gbuf_range_map (line 221) | def _build_gbuf_range_map(cls, param_and_grad_buffer: _ParamAndGradBuf...
    method _build_model_param_gbuf_map (line 241) | def _build_model_param_gbuf_map(
    method _build_optimizer_group_ranges (line 260) | def _build_optimizer_group_ranges(cls, param_groups: List[Dict], gbuf_...
    method _build_model_and_main_param_groups (line 307) | def _build_model_and_main_param_groups(
    method __init__ (line 459) | def __init__(
    method _get_model_param_range_map (line 610) | def _get_model_param_range_map(self, param: torch.nn.Parameter):
    method get_grad_stats_parallel_group (line 620) | def get_grad_stats_parallel_group(self) -> torch.distributed.ProcessGr...
    method state_dict (line 628) | def state_dict(self):
    method load_state_dict (line 690) | def load_state_dict(self, state_dict):
    method _get_main_param_and_optimizer_states (line 883) | def _get_main_param_and_optimizer_states(self, model_param):
    method _set_main_param_and_optimizer_states (line 916) | def _set_main_param_and_optimizer_states(self, model_param, tensors):
    method get_parameter_state_dp_reshardable (line 954) | def get_parameter_state_dp_reshardable(self):
    method get_parameter_state_dp_zero (line 989) | def get_parameter_state_dp_zero(
    method save_parameter_state (line 1138) | def save_parameter_state(self, filename: str):
    method _init_optimizer_states_with_dummy_values (line 1149) | def _init_optimizer_states_with_dummy_values(self):
    method _param_name (line 1166) | def _param_name(self, param: torch.nn.Parameter) -> str:
    method sharded_state_dict (line 1187) | def sharded_state_dict(
    method _param_groups_to_param2group_meta (line 1308) | def _param_groups_to_param2group_meta(
    method _param2group_meta_to_param_groups (line 1320) | def _param2group_meta_to_param_groups(
    method sharded_param_state_fsdp_dtensor (line 1359) | def sharded_param_state_fsdp_dtensor(self, is_loading: bool = False):
    method sharded_param_state_dp_zero (line 1383) | def sharded_param_state_dp_zero(
    method sharded_param_state_fully_reshardable (line 1416) | def sharded_param_state_fully_reshardable(
    method sharded_param_state_dp_reshardable (line 1551) | def sharded_param_state_dp_reshardable(
    method sharded_param_state_fs_model_space (line 1701) | def sharded_param_state_fs_model_space(
    method load_parameter_state_from_dp_reshardable (line 1784) | def load_parameter_state_from_dp_reshardable(self, state_dict):
    method load_parameter_state_from_fs_model_space (line 1819) | def load_parameter_state_from_fs_model_space(self, state_dict):
    method _update_legacy_world_tensors (line 1844) | def _update_legacy_world_tensors(cls, old_tensors, new_numels):
    method load_parameter_state_from_dp_zero_legacy (line 1863) | def load_parameter_state_from_dp_zero_legacy(self, state_dict):
    method load_parameter_state_from_dp_zero (line 1977) | def load_parameter_state_from_dp_zero(self, state_dict, *, update_lega...
    method load_parameter_state_from_fully_reshardable (line 2082) | def load_parameter_state_from_fully_reshardable(self, state_dict: dict):
    method split_state_dict_if_needed (line 2132) | def split_state_dict_if_needed(self, state_dict):
    method load_parameter_state (line 2265) | def load_parameter_state(self, filename: str, *, update_legacy_format=...
    method zero_grad (line 2281) | def zero_grad(self, set_to_none: bool = True):
    method _collect_main_grad_data_for_unscaling (line 2312) | def _collect_main_grad_data_for_unscaling(self):
    method _get_model_and_main_params_data_float16 (line 2330) | def _get_model_and_main_params_data_float16(self):
    method _get_fp8_params_and_shard_fp32_from_fp8 (line 2347) | def _get_fp8_params_and_shard_fp32_from_fp8(self):
    method _copy_model_grads_to_main_grads (line 2398) | def _copy_model_grads_to_main_grads(self):
    method _copy_main_params_to_model_params (line 2441) | def _copy_main_params_to_model_params(self):
    method _copy_main_params_to_param_buffer (line 2493) | def _copy_main_params_to_param_buffer(self):
    method _build_model_param_to_state_dict_param_map (line 2517) | def _build_model_param_to_state_dict_param_map(self, state_dict):
    method _copy_model_params_to_main_params (line 2553) | def _copy_model_params_to_main_params(self, state_dict=None):
    method step_with_ready_grads (line 2610) | def step_with_ready_grads(self) -> bool:

FILE: megatron/core/optimizer/grad_scaler.py
  class MegatronGradScaler (line 11) | class MegatronGradScaler(ABC):
    method __init__ (line 12) | def __init__(self, initial_scale: float):
    method scale (line 18) | def scale(self):
    method inv_scale (line 22) | def inv_scale(self):
    method update (line 26) | def update(self, found_inf: bool):
    method state_dict (line 30) | def state_dict(self):
    method load_state_dict (line 34) | def load_state_dict(self, state_dict: Dict):
  class ConstantGradScaler (line 38) | class ConstantGradScaler(MegatronGradScaler):
    method update (line 43) | def update(self, found_inf: bool):
    method state_dict (line 46) | def state_dict(self):
    method load_state_dict (line 49) | def load_state_dict(self, state_dict):
  class DynamicGradScaler (line 53) | class DynamicGradScaler(MegatronGradScaler):
    method __init__ (line 61) | def __init__(
    method update (line 108) | def update(self, found_inf: bool):
    method state_dict (line 132) | def state_dict(self):
    method load_state_dict (line 139) | def load_state_dict(self, state_dict: Dict):

FILE: megatron/core/optimizer/layer_wise_optimizer.py
  class LayerWiseDistributedOptimizer (line 26) | class LayerWiseDistributedOptimizer(ChainedOptimizer):
    method __init__ (line 42) | def __init__(
    method shard_params (line 105) | def shard_params(self, optimizers):
    method set_bucket_layerwise_params_list (line 159) | def set_bucket_layerwise_params_list(self, model_chunks):
    method allgather_params (line 195) | def allgather_params(self) -> None:
    method broadcast_params (line 243) | def broadcast_params(self):
    method get_grad_norm (line 260) | def get_grad_norm(self):
    method count_zeros (line 269) | def count_zeros(self):
    method step (line 280) | def step(self):  # type: ignore[no-untyped-def]
    method load_state_dict (line 295) | def load_state_dict(self, state_dict):
    method sharded_state_dict (line 308) | def sharded_state_dict(
    method save_state_dict_to_file (line 365) | def save_state_dict_to_file(self, filename: str) -> None:
    method load_state_dict_from_file (line 372) | def load_state_dict_from_file(self, filename: str) -> None:

FILE: megatron/core/optimizer/muon.py
  class TensorParallelMuon (line 51) | class TensorParallelMuon(OrthogonalizedOptimizer):
    method __init__ (line 54) | def __init__(
    method orthogonalize (line 119) | def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs:...
  function get_megatron_muon_optimizer (line 174) | def get_megatron_muon_optimizer(

FILE: megatron/core/optimizer/optimizer.py
  function _zero_grad_group_helper (line 58) | def _zero_grad_group_helper(
  function _multi_tensor_copy_this_to_that (line 79) | def _multi_tensor_copy_this_to_that(
  class MegatronOptimizer (line 100) | class MegatronOptimizer(ABC):
    method __init__ (line 110) | def __init__(
    method get_parameters (line 126) | def get_parameters(self) -> List[torch.nn.Parameter]:
    method get_main_grads_for_grad_norm (line 137) | def get_main_grads_for_grad_norm(self) -> List[torch.Tensor]:
    method get_grad_stats_parallel_group (line 165) | def get_grad_stats_parallel_group(self) -> torch.distributed.ProcessGr...
    method prepare_grads (line 186) | def prepare_grads(self) -> bool:
    method step_with_ready_grads (line 191) | def step_with_ready_grads(self) -> bool:
    method get_grad_norm (line 196) | def get_grad_norm(self):
    method clip_grad_norm (line 204) | def clip_grad_norm(self, clip_grad: float) -> float:
    method count_zeros (line 224) | def count_zeros(self) -> float:
    method zero_grad (line 235) | def zero_grad(self, set_to_none: bool = True):
    method get_loss_scale (line 240) | def get_loss_scale(self) -> torch.Tensor:
    method scale_loss (line 247) | def scale_loss(self, loss: torch.Tensor) -> torch.Tensor:
    method reload_model_params (line 252) | def reload_model_params(self, state_dict=None):
    method state_dict (line 269) | def state_dict(self):
    method load_state_dict (line 274) | def load_state_dict(self, state_dict):
    method _get_state (line 280) | def _get_state(self):
    method _set_state (line 283) | def _set_state(self, value):
    method _get_param_groups (line 291) | def _get_param_groups(self):
    method _set_param_groups (line 297) | def _set_param_groups(self, value):
    method step (line 303) | def step(self):
    method sharded_state_dict (line 308) | def sharded_state_dict(
    method _extract_common_per_param_step (line 326) | def _extract_common_per_param_step(state_dict) -> Union[int, torch.Ten...
    method _restore_common_per_param_step (line 341) | def _restore_common_per_param_step(state_dict: Dict, step: Union[int, ...
    method offload_to_cpu (line 345) | def offload_to_cpu(self):
    method restore_from_cpu (line 365) | def restore_from_cpu(self):
    method _filter_and_reorder_param_groups (line 384) | def _filter_and_reorder_param_groups(
  class MixedPrecisionOptimizer (line 438) | class MixedPrecisionOptimizer(MegatronOptimizer):
    method __init__ (line 452) | def __init__(
    method get_loss_scale (line 488) | def get_loss_scale(self):
    method reload_model_params (line 493) | def reload_model_params(self, state_dict=None):
    method _unscale_main_grads_and_check_for_nan (line 497) | def _unscale_main_grads_and_check_for_nan(self):
    method prepare_grads (line 525) | def prepare_grads(self) -> bool:
    method step_with_ready_grads (line 561) | def step_with_ready_grads(self) -> bool:
    method step (line 594) | def step(self):
  class Float16OptimizerWithFloat16Params (line 627) | class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
    method __init__ (line 641) | def __init__(
    method zero_grad (line 711) | def zero_grad(self, set_to_none=True):
    method _collect_main_grad_data_for_unscaling (line 726) | def _collect_main_grad_data_for_unscaling(self):
    method _get_model_and_main_params_data_float16 (line 746) | def _get_model_and_main_params_data_float16(self):
    method _copy_model_grads_to_main_grads (line 755) | def _copy_model_grads_to_main_grads(self):
    method _copy_main_params_to_model_params (line 775) | def _copy_main_params_to_model_params(self):
    method _copy_model_params_to_main_params (line 782) | def _copy_model_params_to_main_params(self, state_dict=None):
    method state_dict (line 790) | def state_dict(self, is_loading: bool = False):
    method sharded_state_dict (line 801) | def sharded_state_dict(
    method load_state_dict (line 850) | def load_state_dict(self, state_dict):
  class FP32Optimizer (line 891) | class FP32Optimizer(MegatronOptimizer):
    method __init__ (line 900) | def __init__(
    method zero_grad (line 911) | def zero_grad(self, set_to_none=True):
    method get_loss_scale (line 918) | def get_loss_scale(self):
    method prepare_grads (line 923) | def prepare_grads(self) -> bool:
    method step_with_ready_grads (line 944) | def step_with_ready_grads(self) -> bool:
    method step (line 962) | def step(self):
    method reload_model_params (line 996) | def reload_model_params(self, state_dict=None):
    method state_dict (line 999) | def state_dict(self):
    method load_state_dict (line 1002) | def load_state_dict(self, state_dict):
    method sharded_state_dict (line 1013) | def sharded_state_dict(
  class ProxyDict (line 1039) | class ProxyDict:
    method __init__ (line 1051) | def __init__(self, inner_dicts: List[dict]):
    method __getitem__ (line 1054) | def __getitem__(self, key: Tuple[int, str]):
    method __setitem__ (line 1058) | def __setitem__(self, key: Tuple[int, str], value: Any):
    method __len__ (line 1062) | def __len__(self) -> int:
    method __iter__ (line 1065) | def __iter__(self):
    method items (line 1070) | def items(self):
  class ChainedOptimizer (line 1077) | class ChainedOptimizer(MegatronOptimizer):
    method __init__ (line 1087) | def __init__(self, chained_optimizers: List[MegatronOptimizer]):
    method optimizer (line 1109) | def optimizer(self):
    method param_groups (line 1119) | def param_groups(self) -> List[dict]:
    method get_parameters (line 1127) | def get_parameters(self) -> List[torch.nn.Parameter]:
    method state (line 1135) | def state(self) -> ProxyDict:
    method zero_grad (line 1142) | def zero_grad(self, set_to_none=True):
    method get_loss_scale (line 1146) | def get_loss_scale(self):
    method _split_state_dict (line 1152) | def _split_state_dict(self, state_dict):
    method reload_model_params (line 1182) | def reload_model_params(self, state_dict=None):
    method state_dict (line 1187) | def state_dict(self):
    method sharded_state_dict (line 1193) | def sharded_state_dict(
    method load_state_dict (line 1225) | def load_state_dict(self, state_dict):
    method prepare_grads (line 1242) | def prepare_grads(self) -> bool:
    method step_with_ready_grads (line 1251) | def step_with_ready_grads(self) -> bool:
    method grads_states_parallel_group_is_shared (line 1263) | def grads_states_parallel_group_is_shared(self):
    method get_grad_stats_parallel_group (line 1271) | def get_grad_stats_parallel_group(self) -> torch.distributed.ProcessGr...
    method get_grad_norm (line 1279) | def get_grad_norm(self):
    method count_zeros (line 1298) | def count_zeros(self):
    method step (line 1317) | def step(self):
    method save_parameter_state (line 1349) | def save_parameter_state(self, filename: str):
    method load_parameter_state (line 1376) | def load_parameter_state(self, filename: str, *, update_legacy_format:...
    method _synchronize_steps (line 1401) | def _synchronize_steps(self):
    method offload_to_cpu (line 1423) | def offload_to_cpu(self):
    method restore_from_cpu (line 1428) | def restore_from_cpu(self):

FILE: megatron/core/optimizer/optimizer_config.py
  class ParamPredicate (line 13) | class ParamPredicate:
    method __call__ (line 32) | def __call__(self, param: torch.nn.Parameter) -> bool:
  class ParamWithNamePredicate (line 37) | class ParamWithNamePredicate:
    method __call__ (line 60) | def __call__(self, param: torch.nn.Parameter, name: str) -> bool:
  class ParamKey (line 65) | class ParamKey:
    method matches (line 89) | def matches(self, param: torch.nn.Parameter, param_name: str) -> bool:
  class OptimizerConfig (line 139) | class OptimizerConfig:
    method __post_init__ (line 358) | def __post_init__(self):
  class AdamOptimizerConfig (line 442) | class AdamOptimizerConfig(OptimizerConfig):
  class SGDOptimizerConfig (line 463) | class SGDOptimizerConfig(OptimizerConfig):

FILE: megatron/core/optimizer/qk_clip.py
  function clip_qk (line 8) | def clip_qk(model, log_max_only=False) -> float:

FILE: megatron/core/optimizer_param_scheduler.py
  class ParamGroupOverride (line 17) | class ParamGroupOverride(TypedDict):
  function get_canonical_lr_for_logging (line 37) | def get_canonical_lr_for_logging(param_groups: list[dict]) -> float | None:
  function param_group_override_to_tuple (line 57) | def param_group_override_to_tuple(
  function combine_param_group_overrides (line 70) | def combine_param_group_overrides(
  class OptimizerParamScheduler (line 97) | class OptimizerParamScheduler:
    method __init__ (line 122) | def __init__(
    method get_wd (line 181) | def get_wd(self, param_group: Optional[dict] = None) -> float:
    method get_lr (line 215) | def get_lr(self, param_group: dict) -> float:
    method step (line 281) | def step(self, increment: int) -> None:
    method state_dict (line 296) | def state_dict(self) -> dict:
    method _check_and_set (line 312) | def _check_and_set(self, cls_value: float, sd_value: float, name: str)...
    method load_state_dict (line 335) | def load_state_dict(self, state_dict: dict) -> None:

FILE: megatron/core/packed_seq_params.py
  class PackedSeqParams (line 10) | class PackedSeqParams:
    method __post_init__ (line 28) | def __post_init__(self):

FILE: megatron/core/parallel_state.py
  function get_nccl_options (line 150) | def get_nccl_options(pg_name, nccl_comm_cfgs):
  function update_pg_timeout (line 185) | def update_pg_timeout(
  function create_group (line 214) | def create_group(
  function generate_masked_orthogonal_rank_groups (line 251) | def generate_masked_orthogonal_rank_groups(
  function create_hierarchical_groups (line 360) | def create_hierarchical_groups(
  function create_hybrid_dp_cp_groups (line 422) | def create_hybrid_dp_cp_groups(rank, ranks, pg_options):
  class RankGenerator (line 447) | class RankGenerator(object):
    method __init__ (line 450) | def __init__(
    method get_mask (line 491) | def get_mask(self, order: str, token: str):
    method get_ranks (line 506) | def get_ranks(self, token):
  function default_embedding_ranks (line 525) | def default_embedding_ranks(pp_ranks):
  function default_position_embedding_ranks (line 534) | def default_position_embedding_ranks(pp_ranks):
  function overwrite_nccl_comm_cfgs (line 540) | def overwrite_nccl_comm_cfgs(nccl_comm_cfgs, pg_name, key_value_pair):
  function initialize_model_parallel (line 548) | def initialize_model_parallel(
  function is_initialized (line 1361) | def is_initialized():
  function model_parallel_is_initialized (line 1366) | def model_parallel_is_initialized():
  function get_model_parallel_group (line 1377) | def get_model_parallel_group(check_initialized=True):
  function get_tensor_model_parallel_group (line 1384) | def get_tensor_model_parallel_group(check_initialized=True):
  function get_pipeline_model_parallel_group (line 1393) | def get_pipeline_model_parallel_group(check_initialized=True):
  function get_data_parallel_group (line 1402) | def get_data_parallel_group(
  function has_separate_all_gather_group (line 1427) | def has_separate_all_gather_group() -> bool:
  function get_data_parallel_group_gloo (line 1436) | def get_data_parallel_group_gloo(with_context_parallel=False, partial_da...
  function get_context_parallel_group (line 1454) | def get_context_parallel_group(check_initialized=True):
  function get_context_parallel_global_ranks (line 1461) | def get_context_parallel_global_ranks(check_initialized=True):
  function get_hierarchical_context_parallel_groups (line 1470) | def get_hierarchical_context_parallel_groups(check_initialized=True):
  function get_hybrid_data_context_parallel_groups (line 1477) | def get_hybrid_data_context_parallel_groups(check_initialized=True, grou...
  function get_embedding_group (line 1489) | def get_embedding_group(check_initialized=True):
  function get_position_embedding_group (line 1496) | def get_position_embedding_group(check_initialized=True):
  function get_amax_reduction_group (line 1503) | def get_amax_reduction_group(with_context_parallel=False, tp_only_amax_r...
  function get_tensor_and_data_parallel_group (line 1529) | def get_tensor_and_data_parallel_group(check_initialized=True, with_cont...
  function get_tensor_and_context_parallel_group (line 1545) | def get_tensor_and_context_parallel_group(check_initialized=True):
  function set_tensor_model_parallel_world_size (line 1554) | def set_tensor_model_parallel_world_size(world_size):
  function set_pipeline_model_parallel_world_size (line 1560) | def set_pipeline_model_parallel_world_size(world_size):
  function set_virtual_pipeline_model_parallel_world_size (line 1566) | def set_virtual_pipeline_model_parallel_world_size(world_size):
  function get_tensor_model_parallel_world_size (line 1572) | def get_tensor_model_parallel_world_size():
  function get_pipeline_model_parallel_world_size (line 1580) | def get_pipeline_model_parallel_world_size():
  function set_tensor_model_parallel_rank (line 1588) | def set_tensor_model_parallel_rank(rank):
  function set_pipeline_model_parallel_rank (line 1594) | def set_pipeline_model_parallel_rank(rank):
  function get_tensor_model_parallel_rank (line 1600) | def get_tensor_model_parallel_rank():
  function get_pipeline_model_parallel_rank (line 1608) | def get_pipeline_model_parallel_rank():
  function is_pipeline_first_stage (line 1616) | def is_pipeline_first_stage(ignore_virtual=True, vp_stage=None):
  function is_pipeline_last_stage (line 1626) | def is_pipeline_last_stage(ignore_virtual=True, vp_stage=None):
  function is_rank_in_embedding_group (line 1636) | def is_rank_in_embedding_group(ignore_virtual=True, vp_stage=None):
  function is_rank_in_position_embedding_group (line 1654) | def is_rank_in_position_embedding_group():
  function get_virtual_pipeline_model_parallel_rank (line 1661) | def get_virtual_pipeline_model_parallel_rank():
  function set_virtual_pipeline_model_parallel_rank (line 1667) | def set_virtual_pipeline_model_parallel_rank(rank):
  function get_virtual_pipeline_model_parallel_world_size (line 1678) | def get_virtual_pipeline_model_parallel_world_size():
  function get_tensor_model_parallel_src_rank (line 1684) | def get_tensor_model_parallel_src_rank():
  function get_model_parallel_src_rank (line 1693) | def get_model_parallel_src_rank():
  function get_data_parallel_src_rank (line 1700) | def get_data_parallel_src_rank(with_context_parallel=False):
  function get_pipeline_model_parallel_first_rank (line 1713) | def get_pipeline_model_parallel_first_rank():
  function get_pipeline_model_parallel_last_rank (line 1719) | def get_pipeline_model_parallel_last_rank():
  function get_pipeline_model_parallel_next_rank (line 1726) | def get_pipeline_model_parallel_next_rank():
  function get_pipeline_model_parallel_prev_rank (line 1734) | def get_pipeline_model_parallel_prev_rank():
  function get_data_parallel_world_size (line 1742) | def get_data_parallel_world_size(with_context_parallel=False, partial_da...
  function set_data_parallel_rank (line 1755) | def set_data_parallel_rank(rank):
  function get_data_parallel_rank (line 1761) | def get_data_parallel_rank(with_context_parallel=False, partial_data_par...
  function get_context_parallel_world_size (line 1774) | def get_context_parallel_world_size():
  function get_context_parallel_rank (line 1782) | def get_context_parallel_rank():
  function get_tensor_and_context_parallel_world_size (line 1790) | def get_tensor_and_context_parallel_world_size():
  function get_tensor_and_context_parallel_rank (line 1798) | def get_tensor_and_context_parallel_rank():
  function get_expert_model_parallel_group (line 1807) | def get_expert_model_parallel_group(check_initialized=True):
  function get_expert_model_parallel_src_rank (line 1816) | def get_expert_model_parallel_src_rank():
  function get_expert_model_parallel_world_size (line 1825) | def get_expert_model_parallel_world_size():
  function set_expert_model_parallel_world_size (line 1835) | def set_expert_model_parallel_world_size(world_size):
  function get_expert_model_parallel_rank (line 1841) | def get_expert_model_parallel_rank():
  function set_expert_model_parallel_rank (line 1851) | def set_expert_model_parallel_rank(rank):
  function get_expert_tensor_parallel_group (line 1857) | def get_expert_tensor_parallel_group(check_initialized=True):
  function get_expert_tensor_parallel_world_size (line 1866) | def get_expert_tensor_parallel_world_size():
  function set_expert_tensor_parallel_world_size (line 1878) | def set_expert_tensor_parallel_world_size(world_size):
  function get_expert_tensor_parallel_rank (line 1884) | def get_expert_tensor_parallel_rank():
  function set_expert_tensor_parallel_rank (line 1896) | def set_expert_tensor_parallel_rank(rank):
  function get_expert_tensor_and_model_parallel_group (line 1902) | def get_expert_tensor_and_model_parallel_group(check_initialized=True):
  function get_expert_tensor_and_model_parallel_world_size (line 1911) | def get_expert_tensor_and_model_parallel_world_size():
  function get_expert_tensor_and_model_parallel_rank (line 1920) | def get_expert_tensor_and_model_parallel_rank():
  function get_expert_tensor_model_pipeline_parallel_group (line 1928) | def get_expert_tensor_model_pipeline_parallel_group(check_initialized=Tr...
  function get_expert_data_parallel_group (line 1937) | def get_expert_data_parallel_group(check_initialized=True, partial_exper...
  function get_expert_data_parallel_group_gloo (line 1953) | def get_expert_data_parallel_group_gloo(partial_expert_data_parallel=Fal...
  function get_expert_data_parallel_rank (line 1967) | def get_expert_data_parallel_rank(partial_expert_data_parallel=False):
  function get_expert_data_parallel_world_size (line 1977) | def get_expert_data_parallel_world_size(partial_expert_data_parallel=Fal...
  function get_intra_distributed_optimizer_instance_group (line 1987) | def get_intra_distributed_optimizer_instance_group(check_initialized=True):
  function get_inter_distributed_optimizer_instance_group (line 1996) | def get_inter_distributed_optimizer_instance_group(check_initialized=True):
  function _set_global_memory_buffer (line 2012) | def _set_global_memory_buffer():
  function get_global_memory_buffer (line 2019) | def get_global_memory_buffer():
  function destroy_global_memory_buffer (line 2025) | def destroy_global_memory_buffer():
  function get_all_ranks (line 2031) | def get_all_ranks():
  function destroy_model_parallel (line 2044) | def destroy_model_parallel():

FILE: megatron/core/pipeline_parallel/bridge_communicator.py
  class CommRole (line 14) | class CommRole(Enum):
  class RankCommInfo (line 31) | class RankCommInfo:
  class BridgeCommunicator (line 39) | class BridgeCommunicator:
    method destroy_broadcast_pgs (line 53) | def destroy_broadcast_pgs(cls):
    method __init__ (line 60) | def __init__(
    method _get_or_create_broadcast_pg (line 161) | def _get_or_create_broadcast_pg(cls, ranks_list: List[List[int]]):
    method get_leader_rank (line 169) | def get_leader_rank(self, grid: HyperCommGrid, is_src: bool) -> List[i...
    method get_boundary_pp_stage_ranks (line 205) | def get_boundary_pp_stage_ranks(self, grid: HyperCommGrid, is_src: bool):
    method is_current_rank_in_grid (line 243) | def is_current_rank_in_grid(self, grid: HyperCommGrid) -> bool:
    method build_comm_map (line 247) | def build_comm_map(self, src_tp_leaders: List[int], dest_tp_leaders: L...
    method send_forward (line 312) | def send_forward(self, tensor_to_send: torch.Tensor):
    method recv_forward (line 340) | def recv_forward(self) -> torch.Tensor:
    method send_backward (line 436) | def send_backward(self, grad_tensor: torch.Tensor):
    method recv_backward (line 471) | def recv_backward(self) -> torch.Tensor:
    method send_forward_recv_backward (line 562) | def send_forward_recv_backward(
    method send_backward_recv_forward (line 683) | def send_backward_recv_forward(
    method _communicate_shapes (line 811) | def _communicate_shapes(
    method _split_tensor_at_batch_dim (line 923) | def _split_tensor_at_batch_dim(

FILE: megatron/core/pipeline_parallel/combined_1f1b.py
  function combined_1f1b_schedule_for_no_pipelining (line 18) | def combined_1f1b_schedule_for_no_pipelining(
  function combined_1f1b_schedule_for_interleaved_pipelining (line 111) | def combined_1f1b_schedule_for_interleaved_pipelining(
  function combined_forward_backward_step (line 237) | def combined_forward_backward_step(

FILE: megatron/core/pipeline_parallel/fine_grained_activation_offload.py
  function debug_rank (line 16) | def debug_rank(message):
  function print_offload_summary_table (line 26) | def print_offload_summary_table(total_offload_bytes: Dict[str, int]):
  class GPUTensorPool (line 97) | class GPUTensorPool:
    method __init__ (line 114) | def __init__(self, device: str = 'cuda', pin_memory: bool = False):
    method _get_pool_key (line 141) | def _get_pool_key(self, shape: Tuple, dtype: torch.dtype) -> Tuple:
    method _calculate_memory_size (line 146) | def _calculate_memory_size(shape: Tuple, dtype: torch.dtype) -> int:
    method allocate (line 154) | def allocate(self, shape: Tuple, dtype: torch.dtype = torch.float32) -...
    method free (line 208) | def free(self, tensor: torch.Tensor):
    method get_pool_status (line 251) | def get_pool_status(self, shape: Tuple = None, dtype: torch.dtype = No...
    method reset (line 294) | def reset(self):
    method clear (line 308) | def clear(self):
    method __del__ (line 326) | def __del__(self):
  class OffloadTensorGroup (line 331) | class OffloadTensorGroup:
    method __init__ (line 336) | def __init__(self, name):
    method push_tensor (line 352) | def push_tensor(self, tag, tensor):
    method pop_tensor (line 356) | def pop_tensor(self, tag):
    method record_offload_event (line 360) | def record_offload_event(self, stream):
    method wait_offload_event (line 364) | def wait_offload_event(self, stream):
    method record_reload_event (line 368) | def record_reload_event(self, stream):
    method wait_reload_event (line 372) | def wait_reload_event(self, stream):
    method update_offload_info (line 376) | def update_offload_info(self, tensor):
  class PipelineOffloadManager (line 382) | class PipelineOffloadManager:
    method get_instance (line 392) | def get_instance(cls):
    method reset_instance (line 399) | def reset_instance(cls):
    method __init__ (line 404) | def __init__(self):
    method d2h_stream (line 437) | def d2h_stream(self):
    method h2d_stream (line 442) | def h2d_stream(self):
    method cpu_tensor_pool (line 447) | def cpu_tensor_pool(self):
    method push_offload_groups (line 451) | def push_offload_groups(self, group_hook, forced_released_tensors):
    method flush_delayed_groups (line 456) | def flush_delayed_groups(self):
    method reset (line 464) | def reset(self):
    method offload_summary_bytes (line 484) | def offload_summary_bytes(self) -> Dict[str, int]:
    method offload_summary_total_bytes (line 489) | def offload_summary_total_bytes(self) -> int:
    method flush (line 493) | def flush(self):
    method disable_offload (line 508) | def disable_offload(self):
    method enable_offload (line 515) | def enable_offload(self):
    method post_warmup_callback (line 522) | def post_warmup_callback(self):
    method push (line 575) | def push(self, handler):
    method pop_backward_chunk (line 582) | def pop_backward_chunk(self, name=None):
    method front_backward_chunk (line 599) | def front_backward_chunk(self, name=None):
    method init_model_chunk_offload_handler (line 609) | def init_model_chunk_offload_handler(
    method pop_forward_chunk (line 649) | def pop_forward_chunk(self, name=None):
    method cur_forward_chunk (line 665) | def cur_forward_chunk(self):
    method cur_backward_chunk (line 669) | def cur_backward_chunk(self):
    method mark_not_offloadable (line 673) | def mark_not_offloadable(self, tensor: torch.Tensor):
    method __enter__ (line 678) | def __enter__(self):
    method __exit__ (line 695) | def __exit__(self, *args: Any):
    method on_save_for_backward (line 709) | def on_save_for_backward(self, tensor: torch.Tensor) -> Any:
    method on_get_saved_tensor (line 718) | def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor:
  class ChunkOffloadHandler (line 727) | class ChunkOffloadHandler:
    method offload (line 733) | def offload(self, src_tensor, pin_memory=True, use_cpu_pool=True):
    method reload (line 751) | def reload(self, state, non_blocking=None):
    method __init__ (line 765) | def __init__(self, min_offloaded_tensor_size, cpu_tensor_pool):
    method reset (line 789) | def reset(self):
    method find_group_with_name (line 797) | def find_group_with_name(self, name: str, start_index: int = 0):
    method is_empty_chunk (line 803) | def is_empty_chunk(self, name=None):
    method finish_all_groups (line 810) | def finish_all_groups(self, name=None) -> bool:
    method find_next_group (line 826) | def find_next_group(self, name=None):
    method tensor_push (line 831) | def tensor_push(self, tensor):
    method tensor_pop (line 849) | def tensor_pop(self, tensor_tag):
    method tensor_need_offloading_checker (line 860) | def tensor_need_offloading_checker(self, tensor):
    method bulk_offload_group (line 872) | def bulk_offload_group(self):
    method get_max_deduplicated_groups (line 891) | def get_max_deduplicated_groups(self):
    method bulk_reload_group (line 899) | def bulk_reload_group(self):
    method pre_reload_last_layer (line 920) | def pre_reload_last_layer(self):
    method should_bulk_offload (line 928) | def should_bulk_offload(self):
    method bulk_offload (line 952) | def bulk_offload(self, forced_released_tensors):
    method on_group_commit_forward (line 967) | def on_group_commit_forward(self, forced_released_tensors):
    method bulk_reload (line 976) | def bulk_reload(self):
    method on_group_commit_backward (line 993) | def on_group_commit_backward(self, name):
    method on_group_start_forward (line 1015) | def on_group_start_forward(self, name):
    method on_group_start_backward (line 1037) | def on_group_start_backward(self):
  function fine_grained_offloading_disable_offload (line 1050) | def fine_grained_offloading_disable_offload():
  function fine_grained_offloading_enable_offload (line 1056) | def fine_grained_offloading_enable_offload():
  class FineGrainedOffloadingGroupCommitFunction (line 1062) | class FineGrainedOffloadingGroupCommitFunction(torch.autograd.Function):
    method forward (line 1069) | def forward(ctx, tensor, cur_forward_chunk, name, forced_released_tens...
    method backward (line 1084) | def backward(ctx, *grad_output):
  function fine_grained_offloading_group_commit (line 1093) | def fine_grained_offloading_group_commit(
  function fine_grained_offloading_group_flush_delayed_groups (line 1136) | def fine_grained_offloading_group_flush_delayed_groups():
  class FineGrainedOffloadingGroupStartFunction (line 1142) | class FineGrainedOffloadingGroupStartFunction(torch.autograd.Function):
    method forward (line 1149) | def forward(ctx, tensor, cpu_offload_handler, name):
    method backward (line 1159) | def backward(ctx, grad_output):
  function fine_grained_offloading_group_start (line 1167) | def fine_grained_offloading_group_start(tensor, name=None):
  function fine_grained_offloading_forward_record (line 1175) | def fine_grained_offloading_forward_record(event: torch.cuda.Event) -> N...
  class FineGrainedOffloadingBackwardRecordFunction (line 1182) | class FineGrainedOffloadingBackwardRecordFunction(torch.autograd.Function):
    method forward (line 1189) | def forward(ctx, tensor, event: torch.cuda.Event) -> torch.Tensor:
    method backward (line 1195) | def backward(ctx, grad_output):
  function fine_grained_offloading_backward_record (line 1203) | def fine_grained_offloading_backward_record(tensor, event: torch.cuda.Ev...
  class FineGrainedActivationOffloadingInterface (line 1208) | class FineGrainedActivationOffloadingInterface:
    method __init__ (line 1211) | def __init__(self, offload: bool, tensor: torch.Tensor, name: str):
    method __enter__ (line 1216) | def __enter__(self):
    method __exit__ (line 1223) | def __exit__(self, *args: Any):
    method init_chunk_handler (line 1229) | def init_chunk_handler(vp_size, vp_stage, min_offloaded_tensor_size):
    method get_context (line 1236) | def get_context(flag):
    method group_commit (line 1241) | def group_commit(tensor, name, forced_released_tensors=None, delay_off...
    method mark_not_offloadable (line 1248) | def mark_not_offloadable(tensor: torch.Tensor):
    method forward_record (line 1253) | def forward_record(event: torch.cuda.Event) -> None:
    method reset (line 1260) | def reset():
    method reset_instance (line 1265) | def reset_instance():

FILE: megatron/core/pipeline_parallel/hybrid_cp_schedule.py
  class BalancedCPScheduler (line 14) | class BalancedCPScheduler:
    method __init__ (line 20) | def __init__(self, max_seq_len_per_rank: int, dp_cp_group: torch.distr...
    method get_total_workload (line 28) | def get_total_workload(self, seq_length: int, cp_size: Optional[int] =...
    method gpus_needed (line 44) | def gpus_needed(self, seq_len: int) -> int:
    method make_buckets_equal (line 55) | def make_buckets_equal(
    method next_hdp_group (line 104) | def next_hdp_group(
    method get_groups_and_subsamples (line 456) | def get_groups_and_subsamples(self, sample_id_seqlens, config):
  function hybrid_context_parallel_forward_backward (line 477) | def hybrid_context_parallel_forward_backward(

FILE: megatron/core/pipeline_parallel/multimodule_communicator.py
  class RankModuleInfo (line 20) | class RankModuleInfo:
  function _prepare_tensor_for_comm (line 48) | def _prepare_tensor_for_comm(
  function _restore_tensor_from_comm (line 81) | def _restore_tensor_from_comm(
  class MultiModulePipelineCommunicator (line 104) | class MultiModulePipelineCommunicator:
    method __init__ (line 107) | def __init__(
    method _build_bridge_comms (line 153) | def _build_bridge_comms(self):
    method is_pp_first_stage (line 171) | def is_pp_first_stage(self):
    method is_pp_last_stage (line 187) | def is_pp_last_stage(self):
    method _is_source_module (line 202) | def _is_source_module(self, module_name: str) -> bool:
    method _is_sink_module (line 210) | def _is_sink_module(self, module_name: str) -> bool:
    method is_current_rank_in_grid (line 214) | def is_current_rank_in_grid(self, grid: HyperCommGrid) -> bool:
    method total_stages (line 219) | def total_stages(self) -> int:
    method current_stage (line 231) | def current_stage(self) -> int:
    method _build_rank_module_info_map (line 263) | def _build_rank_module_info_map(self):
    method recv_forward (line 307) | def recv_forward(
    method send_forward (line 341) | def send_forward(self, output_dict: Dict[str, torch.Tensor], is_last_s...
    method send_forward_recv_backward (line 359) | def send_forward_recv_backward(
    method send_backward_recv_forward (line 393) | def send_backward_recv_forward(
    method recv_backward (line 429) | def recv_backward(
    method send_backward (line 460) | def send_backward(self, grad_dict: Dict[str, torch.Tensor], is_first_s...
    method compute_total_pipeline_stages (line 479) | def compute_total_pipeline_stages(

FILE: megatron/core/pipeline_parallel/p2p_communication.py
  function _batched_p2p_ops (line 17) | def _batched_p2p_ops(
  function _p2p_ops (line 55) | def _p2p_ops(
  function is_single_shape (line 131) | def is_single_shape(x) -> bool:
  class P2PCommunicator (line 140) | class P2PCommunicator:
    method __init__ (line 147) | def __init__(self, pp_group: dist.ProcessGroup, config: ModelParallelC...
    method is_pp_first_stage (line 167) | def is_pp_first_stage(self) -> bool:
    method is_pp_last_stage (line 172) | def is_pp_last_stage(self) -> bool:
    method total_stages (line 177) | def total_stages(self) -> int:
    method current_stage (line 182) | def current_stage(self) -> int:
    method _communicate_shapes (line 186) | def _communicate_shapes(self, tensor_send_next, tensor_send_prev, recv...
    method _communicate (line 275) | def _communicate(
    method recv_forward (line 424) | def recv_forward(
    method recv_backward (line 455) | def recv_backward(
    method send_forward (line 486) | def send_forward(self, output_tensors, is_last_stage: bool) -> None:
    method send_backward (line 507) | def send_backward(self, input_tensor_grads, is_first_stage: bool) -> N...
    method send_forward_recv_backward (line 527) | def send_forward_recv_backward(
    method send_backward_recv_forward (line 560) | def send_backward_recv_forward(
    method send_forward_recv_forward (line 593) | def send_forward_recv_forward(
    method send_backward_recv_backward (line 619) | def send_backward_recv_backward(
    method send_forward_backward_recv_forward_backward (line 645) | def send_forward_backward_recv_forward_backward(

FILE: megatron/core/pipeline_parallel/schedules.py
  function get_forward_backward_func (line 48) | def get_forward_backward_func(pp_size: Optional[int] = None, vp_size: Op...
  function deallocate_output_tensor (line 157) | def deallocate_output_tensor(out, deallocate_pipeline_outputs=False):
  function custom_backward (line 190) | def custom_backward(output, grad_output):
  function forward_step_calc_loss (line 222) | def forward_step_calc_loss(
  function forward_step (line 316) | def forward_step(
  function backward_step (line 451) | def backward_step(input_tensor, output_tensor, output_tensor_grad, config):
  function backward_step_multimodule (line 514) | def backward_step_multimodule(
  function check_first_val_step (line 575) | def check_first_val_step(first_val_step, forward_only, cond):
  function forward_backward_no_pipelining (line 583) | def forward_backward_no_pipelining(
  function clear_embedding_activation_buffer (line 751) | def clear_embedding_activation_buffer(config, model, is_last_stage):
  function finish_embedding_wgrad_compute (line 770) | def finish_embedding_wgrad_compute(config, embedding_module, is_last_sta...
  function get_pp_rank_microbatches (line 786) | def get_pp_rank_microbatches(
  function get_schedule_table (line 846) | def get_schedule_table(num_microbatches, num_model_chunks, microbatch_gr...
  function forward_backward_pipelining_with_interleaving (line 876) | def forward_backward_pipelining_with_interleaving(
  function get_tensor_shapes (line 1975) | def get_tensor_shapes(
  function forward_backward_pipelining_without_interleaving (line 2007) | def forward_backward_pipelining_without_interleaving(

FILE: megatron/core/pipeline_parallel/utils.py
  function is_pp_first_stage (line 16) | def is_pp_first_stage(pp_group: torch.distributed.ProcessGroup):
  function is_pp_last_stage (line 21) | def is_pp_last_stage(pp_group: torch.distributed.ProcessGroup):
  function is_vp_first_stage (line 26) | def is_vp_first_stage(vp_stage: int, vp_size: int | None):
  function is_vp_last_stage (line 37) | def is_vp_last_stage(vp_stage: int, vp_size: int | None):
  function get_pp_first_rank (line 48) | def get_pp_first_rank(pp_group: torch.distributed.ProcessGroup):
  function get_pp_last_rank (line 54) | def get_pp_last_rank(pp_group: torch.distributed.ProcessGroup):
  function get_pp_next_rank (line 60) | def get_pp_next_rank(pp_group: torch.distributed.ProcessGroup):
  function get_pp_prev_rank (line 70) | def get_pp_prev_rank(pp_group: torch.distributed.ProcessGroup):
  function make_viewless (line 80) | def make_viewless(e):
  function set_ideal_affinity_for_current_gpu (line 86) | def set_ideal_affinity_for_current_gpu():
  class NoopScheduleNode (line 119) | class NoopScheduleNode:
    method forward (line 128) | def forward(self, inputs):
    method backward (line 132) | def backward(self, outgrads):
  class ScheduleNode (line 137) | class ScheduleNode:
    method __init__ (line 144) | def __init__(
    method default_backward_func (line 181) | def default_backward_func(self, outputs, output_grad):
    method forward (line 194) | def forward(self, inputs=()):
    method _forward (line 200) | def _forward(self, *inputs):
    method get_output (line 227) | def get_output(self):
    method backward (line 231) | def backward(self, output_grad):
    method _backward (line 237) | def _backward(self, *output_grad):
    method get_grad (line 265) | def get_grad(self):
    method stream_acquire_context (line 274) | def stream_acquire_context(self, name=None):
    method _release_state (line 297) | def _release_state(self):
  class AbstractSchedulePlan (line 305) | class AbstractSchedulePlan(ABC):
    method run (line 311) | def run(
  function set_streams (line 330) | def set_streams(comp_stream=None, comm_stream=None):
  function get_comp_stream (line 348) | def get_comp_stream():
  function get_comm_stream (line 354) | def get_comm_stream():

FILE: megatron/core/post_training/modelopt/gpt/model_specs.py
  function get_gpt_modelopt_spec (line 35) | def get_gpt_modelopt_spec(

FILE: megatron/core/post_training/modelopt/gpt/state_dict_hooks.py
  function mcore_gpt_load_te_state_dict_pre_hook (line 11) | def mcore_gpt_load_te_state_dict_pre_hook(

FILE: megatron/core/post_training/modelopt/layers.py
  class Norm (line 50) | class Norm:
    method __new__ (line 58) | def __new__(
  class Linear (line 105) | class Linear(torch.nn.Linear):
    method __init__ (line 108) | def __init__(
    method sharded_state_dict (line 158) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method forward (line 175) | def forward(self, x):
  class RealQuantTransformerLayer (line 184) | class RealQuantTransformerLayer(TransformerLayer):
    method __init__ (line 200) | def __init__(self, *args, **kwargs):
    method _collect_original_tensor_info (line 235) | def _collect_original_tensor_info(self):
    method _report_quantize_tensor_info (line 241) | def _report_quantize_tensor_info(self):
  class FP8WeightTransformerLayer (line 256) | class FP8WeightTransformerLayer(RealQuantTransformerLayer):
  class BlockwiseFP8WeightTransformerLayer (line 262) | class BlockwiseFP8WeightTransformerLayer(RealQuantTransformerLayer):

FILE: megatron/core/post_training/modelopt/mamba/model_specs.py
  function get_mamba_stack_modelopt_spec (line 20) | def get_mamba_stack_modelopt_spec(
  function _get_mamba_stack_local_spec (line 55) | def _get_mamba_stack_local_spec(

FILE: megatron/core/process_groups_config.py
  class ProcessGroupHelperMeta (line 14) | class ProcessGroupHelperMeta(type):
    method __setattr__ (line 17) | def __setattr__(cls, name, value):
  class ProcessGroupCollection (line 27) | class ProcessGroupCollection:
    method __init__ (line 136) | def __init__(self, **kwargs):
    method __repr__ (line 143) | def __repr__(self):
    method use_mpu_process_groups (line 161) | def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = No...
    method setup_process_groups_for_optimizer (line 253) | def setup_process_groups_for_optimizer(
    method setup_process_groups_for_ddp (line 444) | def setup_process_groups_for_ddp(
  class MultiModuleProcessGroupCollection (line 575) | class MultiModuleProcessGroupCollection:
    method __post_init__ (line 615) | def __post_init__(self):
    method get_language_model_collection (line 625) | def get_language_model_collection(self) -> ProcessGroupCollection:
    method get_language_model_cp_size (line 638) | def get_language_model_cp_size(self) -> int:
    method has_language_model (line 649) | def has_language_model(self) -> bool:
    method get_module_collection (line 657) | def get_module_collection(self, module_name: str) -> ProcessGroupColle...
    method __len__ (line 676) | def __len__(self):
    method __getitem__ (line 680) | def __getitem__(self, module_name: str):
    method __iter__ (line 684) | def __iter__(self):
    method keys (line 688) | def keys(self):
    method values (line 692) | def values(self):
    method items (line 696) | def items(self):
    method __repr__ (line 700) | def __repr__(self):

FILE: megatron/core/quantization/quant_config.py
  class MatchContext (line 66) | class MatchContext:
  class QuantizationConfig (line 73) | class QuantizationConfig:
    method __init__ (line 76) | def __init__(self, config: dict, match_input: MatchContext, config_key...
    method __repr__ (line 87) | def __repr__(self) -> str:
  class Matcher (line 94) | class Matcher(ABC):
    method match (line 98) | def match(self, context: MatchContext) -> Optional[str]:
  class GlobMatcher (line 108) | class GlobMatcher(Matcher):
    method __init__ (line 117) | def __init__(self, pattern: str, config_key: str):
    method match (line 121) | def match(self, context: MatchContext) -> Optional[str]:
    method __repr__ (line 127) | def __repr__(self) -> str:
  class RecipeConfig (line 131) | class RecipeConfig:
    method __init__ (line 134) | def __init__(self, matchers: List[Matcher], config_dict: Dict[str, Dic...
    method _build_matchers (line 139) | def _build_matchers(matchers_dict: Dict | None) -> List[Matcher]:
    method from_yaml_file (line 172) | def from_yaml_file(recipe_yaml_path: str) -> "RecipeConfig":
    method from_config_dict (line 190) | def from_config_dict(config: Dict) -> "RecipeConfig":
    method match_to_config_key (line 199) | def match_to_config_key(self, operator_context: MatchContext) -> str |...
    method match (line 218) | def match(self, operator_context: MatchContext) -> QuantizationConfig ...
    method __repr__ (line 230) | def __repr__(self) -> str:

FILE: megatron/core/quantization/utils.py
  function get_quant_config_or_none (line 9) | def get_quant_config_or_none(
  function load_quantization_recipe (line 23) | def load_quantization_recipe(recipe_path: str) -> RecipeConfig:
  function kitchen_quantization_recipe_config (line 29) | def kitchen_quantization_recipe_config(recipe_idx: int) -> RecipeConfig:

FILE: megatron/core/rerun_state_machine.py
  class Caller (line 46) | class Caller(NamedTuple):
  class Call (line 53) | class Call(NamedTuple):
  class RerunDiagnostic (line 60) | class RerunDiagnostic(str, Enum):
  class RerunMode (line 74) | class RerunMode(str, Enum):
  class RerunState (line 82) | class RerunState(Enum):
  class RerunValidationStatus (line 114) | class RerunValidationStatus(str, Enum):
  class RerunStateMachine (line 129) | class RerunStateMachine:
    method __init__ (line 185) | def __init__(
    method set_mode (line 241) | def set_mode(self, mode: RerunMode) -> None:
    method get_mode (line 247) | def get_mode(self) -> RerunMode:
    method _reduce_any (line 252) | def _reduce_any(self, value: Union[bool, List[bool]]) -> Union[bool, T...
    method should_run_forward_backward (line 270) | def should_run_forward_backward(self, data_iterator: DataIteratorArgTy...
    method should_checkpoint_and_exit (line 398) | def should_checkpoint_and_exit(self) -> Tuple[bool, bool, int]:
    method validate_result (line 463) | def validate_result(
    method is_unexpectedly_large (line 697) | def is_unexpectedly_large(
    method state_dict (line 767) | def state_dict(
    method validate_state_dict (line 846) | def validate_state_dict(self, state_dict: dict[str, Any]) -> bool:
    method load_state_dict (line 862) | def load_state_dict(self, state_dict: dict[str, Any]) -> None:
    method _sanitize_data_iterators (line 910) | def _sanitize_data_iterators(
    method _get_validation_call_info (line 927) | def _get_validation_call_info(self, message: str) -> Call:
    method _save_state (line 943) | def _save_state(self) -> None:
    method _restore_state (line 964) | def _restore_state(self) -> None:
    method _maybe_report_stats (line 975) | def _maybe_report_stats(self) -> None:
    method _log_validation_error_to_file (line 1002) | def _log_validation_error_to_file(
    method get_skipped_iterations_from_tracker_file (line 1022) | def get_skipped_iterations_from_tracker_file(cls, tracker_file_name: s...
  class RerunDataIterator (line 1090) | class RerunDataIterator:
    method __init__ (line 1109) | def __init__(self, iterable: Iterable[Any]) -> None:
    method __next__ (line 1115) | def __next__(self) -> Any:
    method rewind (line 1130) | def rewind(self) -> None:
    method advance (line 1136) | def advance(self) -> None:
    method state_dict (line 1142) | def state_dict(self) -> SerializableStateType:
    method load_state_dict (line 1151) | def load_state_dict(self, state_dict: SerializableStateType) -> None:
  class QuickStats (line 1159) | class QuickStats:
    method __init__ (line 1166) | def __init__(self, max_size: int = 100000) -> None:
    method record (line 1173) | def record(self, data: float) -> None:
    method combine (line 1187) | def combine(self, others: list["QuickStats"]) -> None:
    method reset (line 1200) | def reset(self) -> None:
    method print_stats (line 1208) | def print_stats(self) -> str:
    method __getstate_ (line 1230) | def __getstate_(self) -> Any:
    method __setstate (line 1235) | def __setstate(self, state: Any) -> Any:
  class RerunErrorInjector (line 1244) | class RerunErrorInjector:
    method __init__ (line 1253) | def __init__(
    method maybe_inject (line 1268) | def maybe_inject(self) -> bool:
    method maybe_miscompare (line 1286) | def maybe_miscompare(
    method state_dict (line 1323) | def state_dict(self) -> SerializableStateType:
    method load_state_dict (line 1333) | def load_state_dict(self, state_dict: SerializableStateType) -> None:
  function initialize_rerun_state_machine (line 1343) | def initialize_rerun_state_machine(*args, **kwargs) -> None:
  function destroy_rerun_state_machine (line 1353) | def destroy_rerun_state_machine() -> None:
  function get_rerun_state_machine (line 1360) | def get_rerun_state_machine() -> RerunStateMachine:
  function _set_rerun_state_machine (line 1370) | def _set_rerun_state_machine(rerun_state_machine) -> None:
  function _compare_floats (line 1378) | def _compare_floats(a: torch.Tensor, b: torch.Tensor) -> float:

FILE: megatron/core/resharding/copy_services/base.py
  class CopyService (line 9) | class CopyService(ABC):
    method submit_send (line 13) | def submit_send(self, src_tensor: torch.Tensor, dest_rank: int):
    method submit_recv (line 18) | def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int):
    method run (line 23) | def run(self):

FILE: megatron/core/resharding/copy_services/gloo_copy_service.py
  class SendOp (line 17) | class SendOp:
  class RecvOp (line 26) | class RecvOp:
  class GlooCopyService (line 34) | class GlooCopyService(CopyService):
    method __init__ (line 40) | def __init__(self, group=None):
    method submit_send (line 57) | def submit_send(self, src_tensor: torch.Tensor, dest_rank: int):
    method submit_send_with_id (line 60) | def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, ...
    method submit_recv (line 64) | def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int):
    method submit_recv_with_id (line 73) | def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor,...
    method run (line 80) | def run(self):

FILE: megatron/core/resharding/copy_services/nccl_copy_service.py
  class SendOp (line 17) | class SendOp:
  class RecvOp (line 26) | class RecvOp:
  class NCCLCopyService (line 34) | class NCCLCopyService(CopyService):
    method __init__ (line 40) | def __init__(self, group=None):
    method submit_send (line 53) | def submit_send(self, src_tensor: torch.Tensor, dest_rank: int):
    method submit_send_with_id (line 56) | def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, ...
    method submit_recv (line 60) | def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int):
    method submit_recv_with_id (line 64) | def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor,...
    method run (line 68) | def run(self):

FILE: megatron/core/resharding/copy_services/nvshmem_copy_service.py
  class NVSHMEMCopyService (line 17) | class NVSHMEMCopyService(CopyService):
    method __init__ (line 20) | def __init__(self, group=None):
    method _ensure_initialized (line 38) | def _ensure_initialized(self):
    method submit_send (line 46) | def submit_send(self, src_tensor: torch.Tensor, dest_rank: int):
    method submit_recv (line 60) | def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int):
    method submit_send_with_id (line 72) | def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, ...
    method submit_recv_with_id (line 100) | def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor,...
    method run (line 127) | def run(self):

FILE: megatron/core/resharding/execution.py
  function _is_mxfp8_tensor (line 17) | def _is_mxfp8_tensor(param):
  function execute_reshard_plan (line 26) | def execute_reshard_plan(

FILE: megatron/core/resharding/nvshmem_copy_service/compat.py
  function _patch_cuda_core_experimental (line 19) | def _patch_cuda_core_experimental():
  function get_cuda_core_device_class (line 31) | def get_cuda_core_device_class():
  function ensure_nvshmem_compat (line 47) | def ensure_nvshmem_compat():

FILE: megatron/core/resharding/nvshmem_copy_service/core/gpu_resource_manager.py
  class GPUResourceManager (line 32) | class GPUResourceManager:
    method __init__ (line 35) | def __init__(self):
    method init (line 56) | def init(self, group=None) -> None:
    method get_stream (line 154) | def get_stream(self, name: str):
    method get_torch_stream (line 172) | def get_torch_stream(self, name: str) -> Optional[torch.cuda.ExternalS...
    method create_events (line 184) | def create_events(self, num_events: int = 2):
    method finalize (line 200) | def finalize(self) -> None:

FILE: megatron/core/resharding/nvshmem_copy_service/core/kernel_launcher.py
  class KernelLauncher (line 23) | class KernelLauncher:
    method __init__ (line 26) | def __init__(self):
    method load_kernels (line 32) | def load_kernels(self) -> None:
    method set_streams (line 47) | def set_streams(self, pack_stream, unpack_stream) -> None:
    method launch_pack (line 63) | def launch_pack(
    method launch_unpack (line 106) | def launch_unpack(

FILE: megatron/core/resharding/nvshmem_copy_service/core/pipeline_executor.py
  class PipelineExecutor (line 31) | class PipelineExecutor:
    method __init__ (line 34) | def __init__(
    method set_streams (line 65) | def set_streams(
    method set_events (line 87) | def set_events(self, pack_events: List, unpack_events: List, barrier_e...
    method execute_pipeline (line 93) | def execute_pipeline(
    method _launch_pack (line 244) | def _launch_pack(self, iteration: int, batch: ScheduledBatch) -> None:
    method _launch_unpack (line 256) | def _launch_unpack(self, iteration: int, batch: ScheduledBatch) -> None:
    method process_self_moves (line 268) | def process_self_moves(

FILE: megatron/core/resharding/nvshmem_copy_service/logger.py
  class ColoredFormatter (line 25) | class ColoredFormatter(logging.Formatter):
    method __init__ (line 28) | def __init__(self, fmt: str, pe_id: int, use_color: bool = True):
    method formatTime (line 46) | def formatTime(self, record, datefmt=None):
    method format (line 58) | def format(self, record):
  class PELogger (line 74) | class PELogger:
    method init (line 82) | def init(cls, pe_id: int, level: str = "INFO", logs_dir: str = "logs"):
    method set_level (line 136) | def set_level(cls, level: str):
    method trace (line 155) | def trace(cls, msg: str):
    method debug (line 161) | def debug(cls, msg: str):
    method info (line 167) | def info(cls, msg: str):
    method summary (line 173) | def summary(cls, msg: str):
    method warn (line 179) | def warn(cls, msg: str):
    method warning (line 185) | def warning(cls, msg: str):
    method error (line 190) | def error(cls, msg: str):
    method critical (line 196) | def critical(cls, msg: str):
    method shutdown (line 202) | def shutdown(cls):

FILE: megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py
  class DoubleBufferManager (line 25) | class DoubleBufferManager:
    method __init__ (line 28) | def __init__(self, slot_size: int = MAX_SEGMENT_SIZE):
    method allocate (line 39) | def allocate(self) -> None:
    method get_send_slot (line 58) | def get_send_slot(self, iteration: int):
    method get_recv_slot (line 70) | def get_recv_slot(self, iteration: int):
    method free (line 82) | def free(self) -> None:

FILE: megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py
  class TensorPointerExtractor (line 14) | class TensorPointerExtractor:
    method get_pointer (line 18) | def get_pointer(tensor: Any) -> int:

FILE: megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py
  class SendRequest (line 12) | class SendRequest:
  class ReceiveRequest (line 23) | class ReceiveRequest:
  class WorkloadGroup (line 34) | class WorkloadGroup:
  class ScheduledBatch (line 43) | class ScheduledBatch:
  class WorkloadSummary (line 58) | class WorkloadSummary:
  class TransferMetadata (line 67) | class TransferMetadata:

FILE: megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py
  class CommunicationScheduler (line 9) | class CommunicationScheduler:
    method __init__ (line 16) | def __init__(self):
    method build_schedule (line 19) | def build_schedule(
    method _collect_all_batches (line 65) | def _collect_all_batches(
    method _assign_iterations (line 110) | def _assign_iterations(self, batches: List[ScheduledBatch]):
    method _exchange_workload_summaries (line 195) | def _exchange_workload_summaries(

FILE: megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py
  class GPUExecutionPlanner (line 26) | class GPUExecutionPlanner:
    method __init__ (line 29) | def __init__(self):
    method create_gpu_plans (line 33) | def create_gpu_plans(
    method _plan_kernel_args (line 158) | def _plan_kernel_args(

FILE: megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py
  class TaskSegmenter (line 17) | class TaskSegmenter:
    method _encode_segment_id (line 23) | def _encode_segment_id(self, task_id: int, segment_index: int) -> int:
    method _calculate_num_segments (line 26) | def _calculate_num_segments(self, size: int) -> int:
    method _validate_segmentation (line 29) | def _validate_segmentation(self, task_id: int, size: int) -> bool:
    method segment_send_request (line 42) | def segment_send_request(self, req: SendRequest) -> List[SendRequest]:
    method segment_receive_request (line 72) | def segment_receive_request(self, req: ReceiveRequest) -> List[Receive...

FILE: megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py
  class WorkloadPacker (line 9) | class WorkloadPacker:
    method pack_workloads (line 15) | def pack_workloads(
    method _pack_single_destination (line 53) | def _pack_single_destination(

FILE: megatron/core/resharding/nvshmem_copy_service/service.py
  class RemoteCopyService (line 34) | class RemoteCopyService:
    method __init__ (line 42) | def __init__(self, group=None):
    method my_pe (line 70) | def my_pe(self) -> int:
    method n_pes (line 75) | def n_pes(self) -> int:
    method device (line 80) | def device(self):
    method initialized (line 85) | def initialized(self) -> bool:
    method init (line 89) | def init(self, log_level: str = "INFO") -> None:
    method register_send (line 169) | def register_send(
    method register_receive (line 189) | def register_receive(
    method schedule (line 209) | def schedule(self) -> None:
    method run (line 281) | def run(self) -> None:
    method clear_requests (line 322) | def clear_requests(self) -> None:
    method finalize (line 336) | def finalize(self) -> None:
    method _segment_tasks (line 357) | def _segment_tasks(self) -> None:
    method _prepare_iter_schedules (line 381) | def _prepare_iter_schedules(

FILE: megatron/core/resharding/nvshmem_copy_service/validation.py
  class ValidationResult (line 19) | class ValidationResult:
  class ValidationSummary (line 36) | class ValidationSummary:
    method all_passed (line 46) | def all_passed(self) -> bool:
  function generate_deterministic_data (line 51) | def generate_deterministic_data(task_id: int, size: int, device: str = "...
  function validate_received_data (line 71) | def validate_received_data(
  function log_validation_summary (line 113) | def log_validation_summary(summary: ValidationSummary) -> None:

FILE: megatron/core/resharding/planner.py
  function _build_descriptors_for_param (line 24) | def _build_descriptors_for_param(
  function _plan_multi_dim_lcm (line 70) | def _plan_multi_dim_lcm(
  function _plan_block_interleaved (line 155) | def _plan_block_interleaved(
  function _finalize_dp_transfers (line 275) | def _finalize_dp_transfers(
  function _determine_source_ranks_for_dst_param (line 322) | def _determine_source_ranks_for_dst_param(
  function build_centralized_reshard_plan (line 354) | def build_centralized_reshard_plan(

FILE: megatron/core/resharding/refit.py
  class _PlanCacheKey (line 36) | class _PlanCacheKey:
  function _get_config_tuple (line 48) | def _get_config_tuple(core) -> Optional[Tuple[int, int, int, int, int]]:
  function _build_plan_cache_key (line 75) | def _build_plan_cache_key(
  function get_or_create_service (line 103) | def get_or_create_service(backend: RefitBackendName, group=None) -> Copy...
  function clear_service_cache (line 129) | def clear_service_cache():
  function clear_plan_cache (line 149) | def clear_plan_cache():
  function clear_all_caches (line 157) | def clear_all_caches():
  function _unwrap_model_cores (line 165) | def _unwrap_model_cores(src_model, target_model):
  function _build_or_get_plan (line 199) | def _build_or_get_plan(src_core, tgt_core, num_experts, group, src_rank_...
  function _needs_mxfp8_conversion (line 219) | def _needs_mxfp8_conversion(model) -> bool:
  function _setup_mxfp8_transform_on_plan (line 231) | def _setup_mxfp8_transform_on_plan(plan, target_model) -> None:
  function prepare_swap_model_weights (line 275) | def prepare_swap_model_weights(
  function swap_model_weights (line 322) | def swap_model_weights(
  function reshard_model_weights (line 379) | def reshard_model_weights(

FILE: megatron/core/resharding/transforms.py
  class ReshardTransform (line 17) | class ReshardTransform:
    method should_transform (line 33) | def should_transform(self, param_name: str) -> bool:
    method prepare_send (line 37) | def prepare_send(
    method prepare_recv (line 48) | def prepare_recv(self, param_name: str, dst_slice: tuple[slice, ...]) ...
    method finalize_recv (line 52) | def finalize_recv(
  function _scale_slice_from_data_slice (line 69) | def _scale_slice_from_data_slice(
  function _ensure_sendable (line 100) | def _ensure_sendable(param: torch.Tensor) -> torch.Tensor:
  class MXFP8ReshardTransform (line 117) | class MXFP8ReshardTransform(ReshardTransform):
    method __init__ (line 161) | def __init__(
    method should_transform (line 178) | def should_transform(self, param_name: str) -> bool:
    method prepare_send (line 183) | def prepare_send(self, param_name, src_slice, src_param):
    method prepare_recv (line 196) | def prepare_recv(self, param_name, dst_slice):
    method finalize_recv (line 220) | def finalize_recv(self, param_name, dst_slice, recv_buffers):

FILE: megatron/core/resharding/utils.py
  class TransferOp (line 17) | class TransferOp:
  class ParameterMetadata (line 35) | class ParameterMetadata:
  class ShardingDescriptor (line 86) | class ShardingDescriptor:
  class ReshardPlan (line 98) | class ReshardPlan:
    method __str__ (line 104) | def __str__(self):
  function _get_rank_in_group (line 113) | def _get_rank_in_group(global_rank: int, group_ranks: list[int]) -> int:
  function _detect_expert_index_from_param_name (line 123) | def _detect_expert_index_from_param_name(param_name: str) -> Optional[int]:
  function assign_ep_resolved_name_inplace (line 137) | def assign_ep_resolved_name_inplace(
  function assign_resolved_name_inplace (line 183) | def assign_resolved_name_inplace(
  function _build_layer_module_prefix_map (line 201) | def _build_layer_module_prefix_map(module: torch.nn.Module) -> dict[str,...
  function _resolve_global_layer_number_in_name (line 225) | def _resolve_global_layer_number_in_name(
  function extract_param_metadata (line 252) | def extract_param_metadata(
  function select_src_metadata_balanced (line 370) | def select_src_metadata_balanced(

FILE: megatron/core/safe_globals.py
  function register_safe_globals (line 39) | def register_safe_globals():

FILE: megatron/core/ssm/gated_delta_net.py
  class GatedDeltaNetSubmodules (line 60) | class GatedDeltaNetSubmodules:
  class GatedDeltaNet (line 70) | class GatedDeltaNet(MegatronModule):
    method __init__ (line 77) | def __init__(
    method reset_parameters (line 231) | def reset_parameters(self):
    method forward (line 253) | def forward(
    method _apply_gated_norm (line 416) | def _apply_gated_norm(self, x, gate):
    method sharded_state_dict (line 427) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method backward_dw (line 515) | def backward_dw(self):
    method _backward_in_proj (line 520) | def _backward_in_proj(self):
    method _backward_out_proj (line 524) | def _backward_out_proj(self):
  function _split_tensor_factory (line 529) | def _split_tensor_factory(
  function torch_chunk_gated_delta_rule (line 589) | def torch_chunk_gated_delta_rule(

FILE: megatron/core/ssm/mamba_block.py
  class MambaStackSubmodules (line 36) | class MambaStackSubmodules:
  class MambaStack (line 48) | class MambaStack(GraphableMegatronModule, MegatronModule):
    method __init__ (line 73) | def __init__(
    method set_input_tensor (line 168) | def set_input_tensor(self, input_tensor: Tensor):
    method mamba_state_shapes_per_request (line 178) | def mamba_state_shapes_per_request(self) -> Optional[Tuple[Tuple[int],...
    method _should_call_local_cudagraph (line 188) | def _should_call_local_cudagraph(self, *args, **kwargs):
    method __call__ (line 211) | def __call__(self, *args, **kwargs):
    method forward (line 221) | def forward(
    method sharded_state_dict (line 354) | def sharded_state_dict(

FILE: megatron/core/ssm/mamba_context_parallel.py
  class MambaContextParallel (line 31) | class MambaContextParallel:
    method __init__ (line 61) | def __init__(
    method pre_conv_ssm (line 133) | def pre_conv_ssm(
    method post_conv_ssm (line 194) | def post_conv_ssm(
    method conv1d (line 206) | def conv1d(self, input_: torch.Tensor) -> torch.Tensor:
    method conv1d_channels (line 225) | def conv1d_channels(self):
    method get_conv1d_weight (line 231) | def get_conv1d_weight(self) -> torch.Tensor:
    method get_conv1d_bias (line 236) | def get_conv1d_bias(self) -> torch.Tensor:
    method get_dt_bias (line 241) | def get_dt_bias(self) -> torch.Tensor:
    method get_A_log (line 245) | def get_A_log(self) -> torch.Tensor:
    method get_D (line 249) | def get_D(self) -> torch.Tensor:
    method _slice_conv_param (line 253) | def _slice_conv_param(self, param: torch.Tensor) -> torch.Tensor:
    method _slice_vector_param (line 288) | def _slice_vector_param(self, param: torch.Tensor, has_hdim: bool = Fa...
  function _all_to_all_cp2hp (line 304) | def _all_to_all_cp2hp(
  function _all_to_all_hp2cp (line 343) | def _all_to_all_hp2cp(
  function _undo_attention_load_balancing (line 379) | def _undo_attention_load_balancing(
  function _redo_attention_load_balancing (line 417) | def _redo_attention_load_balancing(

FILE: megatron/core/ssm/mamba_hybrid_layer_allocation.py
  class Symbols (line 14) | class Symbols:
  class ParsedHybridPattern (line 27) | class ParsedHybridPattern:
  function pattern_from_ratios (line 59) | def pattern_from_ratios(
  function get_hybrid_total_layer_count (line 113) | def get_hybrid_total_layer_count(pattern: str) -> int:
  function get_hybrid_total_pipeline_segment_count (line 130) | def get_hybrid_total_pipeline_segment_count(pattern: str) -> int:
  function get_hybrid_layer_counts (line 146) | def get_hybrid_layer_counts(pattern: str) -> Dict[str, int]:
  function parse_hybrid_pattern (line 185) | def parse_hybrid_pattern(pattern: Optional[str]) -> ParsedHybridPattern:
  function _validate_pattern (line 262) | def _validate_pattern(pattern: str, pattern_name: str, allow_pipe: bool ...
  function validate_segment_layers (line 282) | def validate_segment_layers(segment: str) -> List[str]:
  function select_pipeline_segment (line 307) | def select_pipeline_segment(
  function get_layer_maps_from_layer_type_list (line 464) | def get_layer_maps_from_layer_type_list(

FILE: megatron/core/ssm/mamba_layer.py
  class LayerNormBuilder (line 29) | class LayerNormBuilder(Protocol):
    method __call__ (line 32) | def __call__(self, config: TransformerConfig, hidden_size: int, /) -> ...
  class MambaLayerSubmodules (line 36) | class MambaLayerSubmodules:
  class MambaLayer (line 59) | class MambaLayer(GraphableMegatronModule):
    method __init__ (line 67) | def __init__(
    method create_mcore_cudagraph_manager (line 95) | def create_mcore_cudagraph_manager(self, config):
    method mamba_state_shapes_per_request (line 102) | def mamba_state_shapes_per_request(self) -> Tuple[Tuple[int], Tuple[in...
    method forward (line 106) | def forward(
    method sharded_state_dict (line 154) | def sharded_state_dict(
    method _te_cuda_graph_replay (line 177) | def _te_cuda_graph_replay(self, *args, **kwargs):
    method _should_call_local_cudagraph (line 190) | def _should_call_local_cudagraph(self, *args, **kwargs):

FILE: megatron/core/ssm/mamba_mixer.py
  class ExtendedRMSNorm (line 93) | class ExtendedRMSNorm(RMSNormGated):
    method sharded_state_dict (line 98) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
  class MambaMixerSubmodules (line 114) | class MambaMixerSubmodules:
  class MambaMixer (line 123) | class MambaMixer(MegatronModule):
    method __init__ (line 154) | def __init__(
    method forward (line 406) | def forward(
    method _dynamic_inference (line 457) | def _dynamic_inference(self, hidden_states: torch.Tensor, context: Dyn...
    method _dynamic_inference_prefill (line 576) | def _dynamic_inference_prefill(
    method _decode (line 637) | def _decode(
    method _ssm_training (line 671) | def _ssm_training(
    method _ssm_prefill (line 727) | def _ssm_prefill(
    method _ssm_decode (line 1122) | def _ssm_decode(
    method mamba_state_shapes_per_request (line 1299) | def mamba_state_shapes_per_request(self) -> Tuple[Tuple[int], Tuple[in...
    method _get_states_from_cache (line 1305) | def _get_states_from_cache(self, inference_context, batch_size, *, inf...
    method sharded_state_dict (line 1345) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
  function _split_tensor_factory (line 1434) | def _split_tensor_factory(
  function _check_mamba_sequence_packing_support (line 1494) | def _check_mamba_sequence_packing_support(

FILE: megatron/core/ssm/mlp_layer.py
  class MLPLayer (line 13) | class MLPLayer(TransformerLayer):
    method __init__ (line 16) | def __init__(

FILE: megatron/core/ssm/ops/causal_conv1d_triton.py
  function causal_conv1d_update_kernel (line 26) | def causal_conv1d_update_kernel(
  function causal_conv1d_update (line 194) | def causal_conv1d_update(

FILE: megatron/core/ssm/ops/causal_conv1d_varlen.py
  function _causal_conv1d_varlen_kernel (line 28) | def _causal_conv1d_varlen_kernel(
  function causal_conv1d_varlen_fn (line 119) | def causal_conv1d_varlen_fn(
  function _causal_conv1d_varlen_simple (line 211) | def _causal_conv1d_varlen_simple(

FILE: megatron/core/ssm/ops/determinism.py
  function use_deterministic_mode (line 27) | def use_deterministic_mode():
  function set_deterministic_mode (line 37) | def set_deterministic_mode(value):
  function _estimate_config_cost (line 43) | def _estimate_config_cost(cfg):
  function _filter_configs_by_block_sizes (line 58) | def _filter_configs_by_block_sizes(configs):
  function autotune_configs (line 81) | def autotune_configs(configs):
  function alloc_tile_workspace (line 106) | def alloc_tile_workspace(base_shape, tile_dim, dtype, device, determinis...
  function finalize_tile_workspace (line 117) | def finalize_tile_workspace(tensor, deterministic):

FILE: megatron/core/ssm/ops/mamba_ssm.py
  function softplus (line 32) | def softplus(dt):
  function softplus (line 39) | def softplus(dt):
  function _selective_scan_update_kernel (line 53) | def _selective_scan_update_kernel(
  function selective_state_update (line 274) | def selective_state_update(

FILE: megatron/core/ssm/ops/ssd_bmm.py
  function _bmm_chunk_fwd_kernel (line 65) | def _bmm_chunk_fwd_kernel(
  function _bmm_chunk_fwd (line 143) | def _bmm_chunk_fwd(a, b, chunk_size, cu_chunk_seqlens, causal=False, out...

FILE: megatron/core/ssm/ops/ssd_chunk_scan.py
  function _chunk_scan_fwd_kernel (line 78) | def _chunk_scan_fwd_kernel(
  function _chunk_scan_fwd (line 317) | def _chunk_scan_fwd(

FILE: megatron/core/ssm/ops/ssd_chunk_state.py
  function softplus (line 21) | def softplus(dt):  # pylint: disable=C0116
  function softplus (line 28) | def softplus(dt):  # pylint: disable=C0116
  function _chunk_cumsum_fwd_kernel (line 47) | def _chunk_cumsum_fwd_kernel(
  function _chunk_state_fwd_kernel (line 175) | def _chunk_state_fwd_kernel(
  function _chunk_cumsum_fwd (line 272) | def _chunk_cumsum_fwd(
  function _chunk_state_fwd (line 319) | def _chunk_state_fwd(B, x, dt, dA_cumsum, cu_chunk_seqlens, states=None,...
  function _chunk_state_varlen_kernel (line 427) | def _chunk_state_varlen_kernel(
  function chunk_state_varlen (line 583) | def chunk_state_varlen(

FILE: megatron/core/ssm/ops/ssd_combined.py
  function is_int_pow_2 (line 18) | def is_int_pow_2(n):
  function _mamba_chunk_scan_combined_fwd (line 23) | def _mamba_chunk_scan_combined_fwd(
  function mamba_chunk_scan_combined_varlen (line 163) | def mamba_chunk_scan_combined_varlen(

FILE: megatron/core/ssm/ops/ssd_state_passing.py
  function _state_passing_fwd_kernel (line 27) | def _state_passing_fwd_kernel(
  function _state_passing_fwd (line 105) | def _state_passing_fwd(

FILE: megatron/core/ssm/triton_cache_manager.py
  function _version_no_greater_than (line 20) | def _version_no_greater_than(version, version_limit):
  function default_cache_dir (line 26) | def default_cache_dir():
  class ParallelFileCacheManager (line 31) | class ParallelFileCacheManager(FileCacheManager):
    method put (line 51) | def put(self, data, filename, binary=True) -> str:

FILE: megatron/core/tensor_parallel/cross_entropy.py
  class VocabParallelCrossEntropy (line 16) | class VocabParallelCrossEntropy:
    method calculate_logits_max (line 23) | def calculate_logits_max(
    method calculate_predicted_logits (line 35) | def calculate_predicted_logits(
    method calculate_cross_entropy_loss (line 71) | def calculate_cross_entropy_loss(
    method prepare_gradient_calculation_operands (line 85) | def prepare_gradient_calculation_operands(
    method calculate_gradients (line 104) | def calculate_gradients(
  class _VocabParallelCrossEntropy (line 122) | class _VocabParallelCrossEntropy(torch.autograd.Function):
    method forward (line 124) | def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0):
    method backward (line 192) | def backward(ctx, grad_output):
  function vocab_parallel_cross_entropy (line 219) | def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_sm...

FILE: megatron/core/tensor_parallel/data.py
  function _check_data_types (line 10) | def _check_data_types(keys, data, target_dtype):
  function _build_key_size_numel_dictionaries (line 20) | def _build_key_size_numel_dictionaries(keys, data, tp_group=None):
  function broadcast_data (line 64) | def broadcast_data(keys, data, datatype, tp_group=None):

FILE: megatron/core/tensor_parallel/inference_layers.py
  function _te_rms_norm_kernel (line 38) | def _te_rms_norm_kernel(x: torch.Tensor, weight: torch.Tensor, eps: float):
  function _apply_linear (line 48) | def _apply_linear(
  class InferenceLayerNormColumnParallelLinear (line 63) | class InferenceLayerNormColumnParallelLinear(TELayerNormColumnParallelLi...
    method __init__ (line 68) | def __init__(
    method _maybe_allocate_symmetric_buffer (line 121) | def _maybe_allocate_symmetric_buffer(self, x: torch.Tensor):
    method _all_gather (line 131) | def _all_gather(self, x: torch.Tensor, symm_mem_buffer: dict) -> None:
    method forward (line 155) | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, None]:
  class InferenceColumnParallelLinear (line 190) | class InferenceColumnParallelLinear(TEColumnParallelLinear):
    method __init__ (line 195) | def __init__(
    method _maybe_allocate_symmetric_buffer (line 240) | def _maybe_allocate_symmetric_buffer(self, x: torch.Tensor):
    method _all_gather (line 250) | def _all_gather(self, x: torch.Tensor, symm_mem_buffer: dict) -> None:
    method forward (line 270) | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, None]:
  class InferenceRowParallelLinear (line 288) | class InferenceRowParallelLinear(TERowParallelLinear):
    method __init__ (line 293) | def __init__(
    method _matmul_reduce_scatter (line 340) | def _matmul_reduce_scatter(self, x, residual=None):
    method _set_next_layer_norm_weights (line 404) | def _set_next_layer_norm_weights(self, weights: torch.Tensor):
    method _set_residual (line 410) | def _set_residual(self, residual: torch.Tensor):
    method forward (line 416) | def forward(

FILE: megatron/core/tensor_parallel/layers.py
  function param_is_not_tensor_parallel_duplicate (line 91) | def param_is_not_tensor_parallel_duplicate(param, tp_group=None):
  function set_tensor_model_parallel_attributes (line 103) | def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
  function set_defaults_if_not_set_tensor_model_parallel_attributes (line 114) | def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
  function copy_tensor_model_parallel_attributes (line 125) | def copy_tensor_model_parallel_attributes(destination_tensor, source_ten...
  function _initialize_affine_weight_gpu (line 136) | def _initialize_affine_weight_gpu(weight, init_method, partition_dim, st...
  function _initialize_affine_weight_cpu (line 151) | def _initialize_affine_weight_cpu(
  class VocabParallelEmbedding (line 197) | class VocabParallelEmbedding(torch.nn.Module):
    method __init__ (line 212) | def __init__(
    method forward (line 270) | def forward(self, input_):
    method sharded_state_dict (line 305) | def sharded_state_dict(
  class LinearWithFrozenWeight (line 327) | class LinearWithFrozenWeight(torch.autograd.Function):
    method forward (line 338) | def forward(ctx, input, weight, bias, allreduce_dgrad, tp_group):
    method backward (line 350) | def backward(ctx, grad_output):
  function linear_with_frozen_weight (line 362) | def linear_with_frozen_weight(
  class LinearWithGradAccumulationAndAsyncCommunication (line 433) | class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Fun...
    method forward (line 438) | def forward(
    method backward (line 485) | def backward(ctx, grad_output):
  function linear_with_grad_accumulation_and_async_allreduce (line 619) | def linear_with_grad_accumulation_and_async_allreduce(
  class ColumnParallelLinear (line 731) | class ColumnParallelLinear(torch.nn.Module):
    method __init__ (line 780) | def __init__(
    method _forward_impl (line 928) | def _forward_impl(self, input, weight, *args, **kwargs):
    method forward (line 934) | def forward(
    method backward_dw (line 1033) | def backward_dw(self) -> None:
    method sharded_state_dict (line 1040) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method set_extra_state (line 1052) | def set_extra_state(self, state: Any):
    method get_extra_state (line 1055) | def get_extra_state(self) -> None:
    method extra_repr (line 1060) | def extra_repr(self) -> str:
  class RowParallelLinear (line 1072) | class RowParallelLinear(torch.nn.Module):
    method __init__ (line 1108) | def __init__(
    method _forward_impl (line 1223) | def _forward_impl(self, input, weight, *args, **kwargs):
    method forward (line 1229) | def forward(self, input_: torch.Tensor) -> tuple[torch.Tensor, torch.T...
    method backward_dw (line 1287) | def backward_dw(self) -> None:
    method sharded_state_dict (line 1294) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method set_extra_state (line 1306) | def set_extra_state(self, state: Any):
    method get_extra_state (line 1309) | def get_extra_state(self) -> None:
    method extra_repr (line 1314) | def extra_repr(self) -> str:

FILE: megatron/core/tensor_parallel/mappings.py
  function _reduce (line 22) | def _reduce(input_, group):
  function _split_along_last_dim (line 36) | def _split_along_last_dim(input_, group):
  function _split_along_first_dim (line 56) | def _split_along_first_dim(input_, group):
  function _gather_along_last_dim (line 80) | def _gather_along_last_dim(input_, group):
  function _reduce_scatter_along_last_dim (line 99) | def _reduce_scatter_along_last_dim(input_, group):
  function _gather_along_first_dim (line 114) | def _gather_along_first_dim(input_, group, output_split_sizes=None, use_...
  function _reduce_scatter_along_first_dim (line 155) | def _reduce_scatter_along_first_dim(input_, group, input_split_sizes=Non...
  class _CopyToModelParallelRegion (line 197) | class _CopyToModelParallelRegion(torch.autograd.Function):
    method symbolic (line 201) | def symbolic(graph, input_, group):
    method forward (line 206) | def forward(ctx, input_, group):
    method backward (line 212) | def backward(ctx, grad_output):
  class _ReduceFromModelParallelRegion (line 217) | class _ReduceFromModelParallelRegion(torch.autograd.Function):
    method symbolic (line 221) | def symbolic(graph, input_, group):
    method forward (line 226) | def forward(ctx, input_, group):
    method backward (line 231) | def backward(ctx, grad_output):
  class _ScatterToModelParallelRegion (line 236) | class _ScatterToModelParallelRegion(torch.autograd.Function):
    method symbolic (line 240) | def symbolic(graph, input_, group):
    method forward (line 245) | def forward(ctx, input_, group):
    method backward (line 251) | def backward(ctx, grad_output):
  class _GatherFromModelParallelRegion (line 256) | class _GatherFromModelParallelRegion(torch.autograd.Function):
    method symbolic (line 260) | def symbolic(graph, input_, group):
    method forward (line 265) | def forward(ctx, input_, group):
    method backward (line 271) | def backward(ctx, grad_output):
  class _ScatterToSequenceParallelRegion (line 276) | class _ScatterToSequenceParallelRegion(torch.autograd.Function):
    method symbolic (line 280) | def symbolic(graph, input_, group):
    method forward (line 285) | def forward(ctx, input_, group):
    method backward (line 291) | def backward(ctx, grad_output):
  class _GatherFromSequenceParallelRegion (line 296) | class _GatherFromSequenceParallelRegion(torch.autograd.Function):
    method symbolic (line 300) | def symbolic(
    method forward (line 312) | def forward(
    method backward (line 328) | def backward(ctx, grad_output):
  class _ReduceScatterToSequenceParallelRegion (line 351) | class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
    method symbolic (line 355) | def symbolic(graph, input_, group, input_split_sizes=None, use_global_...
    method forward (line 360) | def forward(ctx, input_, group, input_split_sizes=None, use_global_buf...
    method backward (line 368) | def backward(ctx, grad_output):
  class _AllGatherFromTensorParallelRegion (line 380) | class _AllGatherFromTensorParallelRegion(torch.autograd.Function):
    method symbolic (line 384) | def symbolic(graph, input_, group):
    method forward (line 389) | def forward(ctx, input_, group):
    method backward (line 395) | def backward(ctx, grad_output):
  class _ReduceScatterToTensorParallelRegion (line 400) | class _ReduceScatterToTensorParallelRegion(torch.autograd.Function):
    method symbolic (line 404) | def symbolic(graph, input_, group):
    method forward (line 409) | def forward(ctx, input_, group):
    method backward (line 415) | def backward(ctx, grad_output):
  class _AllToAll (line 420) | class _AllToAll(torch.autograd.Function):
    method forward (line 422) | def forward(ctx, group, input, output_split_sizes, input_split_sizes):
    method backward (line 454) | def backward(ctx, *grad_output):
  function copy_to_tensor_model_parallel_region (line 469) | def copy_to_tensor_model_parallel_region(input_, group=None):
  function reduce_from_tensor_model_parallel_region (line 475) | def reduce_from_tensor_model_parallel_region(input_, group=None):
  function scatter_to_tensor_model_parallel_region (line 481) | def scatter_to_tensor_model_parallel_region(input_, group=None):
  function gather_from_tensor_model_parallel_region (line 487) | def gather_from_tensor_model_parallel_region(input_, group=None):
  function scatter_to_sequence_parallel_region (line 493) | def scatter_to_sequence_parallel_region(input_, group=None):
  function gather_from_sequence_parallel_region (line 499) | def gather_from_sequence_parallel_region(
  function reduce_scatter_to_sequence_parallel_region (line 513) | def reduce_scatter_to_sequence_parallel_region(
  function all_gather_last_dim_from_tensor_parallel_region (line 523) | def all_gather_last_dim_from_tensor_parallel_region(input_, group=None):
  function reduce_scatter_last_dim_to_tensor_parallel_region (line 529) | def reduce_scatter_last_dim_to_tensor_parallel_region(input_, group=None):
  function all_to_all (line 535) | def all_to_all(group, input_, output_split_sizes_=None, input_split_size...
  function all_to_all_sp2hp (line 541) | def all_to_all_sp2hp(input_, group=None):
  function all_to_all_hp2sp (line 570) | def all_to_all_hp2sp(input_, group=None):

FILE: megatron/core/tensor_parallel/random.py
  function _get_share_storage (line 65) | def _get_share_storage():
  function _get_cuda_rng_state (line 96) | def _get_cuda_rng_state(
  function _set_cuda_rng_state (line 127) | def _set_cuda_rng_state(new_state: torch.Tensor, device: int = -1, graph...
  function convert_cuda_rng_state (line 169) | def convert_cuda_rng_state(
  function get_expert_parallel_rng_tracker_name (line 204) | def get_expert_parallel_rng_tracker_name():
  function get_data_parallel_rng_tracker_name (line 210) | def get_data_parallel_rng_tracker_name():
  class CudaRNGStatesTracker (line 216) | class CudaRNGStatesTracker:
    method __init__ (line 225) | def __init__(self, use_cudagraphable_rng=False, is_inference_rng_track...
    method is_initialized (line 238) | def is_initialized(self):
    method reset (line 242) | def reset(self):
    method get_states (line 258) | def get_states(self):
    method set_states (line 266) | def set_states(self, states):
    method add (line 272) | def add(self, name, seed):
    method fork (line 298) | def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
  function initialize_rng_tracker (line 341) | def initialize_rng_tracker(
  function get_cuda_rng_tracker (line 401) | def get_cuda_rng_tracker(
  function get_all_rng_states (line 411) | def get_all_rng_states():
  function model_parallel_cuda_manual_seed (line 433) | def model_parallel_cuda_manual_seed(
  function is_graph_safe_cuda_rng_tracker (line 487) | def is_graph_safe_cuda_rng_tracker(cuda_rng_tracker):
  function _get_all_rng_states (line 499) | def _get_all_rng_states():
  function _set_all_rng_states (line 509) | def _set_all_rng_states(cpu_rng_state, cuda_rng_state, cuda_rng_state_tr...
  function _fork_rng (line 519) | def _fork_rng():
  function _set_checkpointing (line 534) | def _set_checkpointing():
  function _unset_checkpointing (line 540) | def _unset_checkpointing():
  function is_checkpointing (line 546) | def is_checkpointing():
  class CheckpointFunction (line 555) | class CheckpointFunction(torch.autograd.Function):
    method forward (line 565) | def forward(
    method backward (line 599) | def backward(ctx, *args):
  function checkpoint (line 637) | def checkpoint(
  class CheckpointWithoutOutputFunction (line 645) | class CheckpointWithoutOutputFunction(torch.autograd.Function):
    method forward (line 652) | def forward(
    method backward (line 678) | def backward(ctx, *args):
  class CheckpointWithoutOutput (line 692) | class CheckpointWithoutOutput(object):
    method __init__ (line 706) | def __init__(self, fp8=False):
    method checkpoint (line 715) | def checkpoint(self, run_function: Callable[[Unpack[_Ts]], _R], *args:...
    method _recompute (line 735) | def _recompute(self, _):
    method discard_output_and_register_recompute (line 797) | def discard_output_and_register_recompute(self, hook_tensor):

FILE: megatron/core/tensor_parallel/utils.py
  function split_tensor_along_last_dim (line 22) | def split_tensor_along_last_dim(
  function split_tensor_into_1d_equal_chunks (line 48) | def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False, tp_group...
  function gather_split_1d_tensor (line 79) | def gather_split_1d_tensor(tensor, tp_group=None):
  class VocabUtility (line 97) | class VocabUtility:
    method vocab_range_from_per_partition_vocab_size (line 105) | def vocab_range_from_per_partition_vocab_size(
    method vocab_range_from_global_vocab_size (line 114) | def vocab_range_from_global_vocab_size(

FILE: megatron/core/timers.py
  class TimerBase (line 35) | class TimerBase(ABC):
    method __init__ (line 38) | def __init__(self, name):
    method start (line 42) | def start(self, barrier=False):
    method stop (line 51) | def stop(self, barrier=False):
    method reset (line 60) | def reset(self):
    method elapsed (line 65) | def elapsed(self, reset=True, barrier=False):
  class DummyTimer (line 78) | class DummyTimer(TimerBase):
    method __init__ (line 81) | def __init__(self):
    method start (line 84) | def start(self, barrier=False):
    method stop (line 87) | def stop(self, barrier=False):
    method reset (line 90) | def reset(self):
    method elapsed (line 93) | def elapsed(self, reset=True, barrier=False):
    method active_time (line 99) | def active_time(self):
  class Timer (line 109) | class Timer(TimerBase):
    method __init__ (line 121) | def __init__(self, name):
    method set_barrier_group (line 135) | def set_barrier_group(self, barrier_group):
    method start (line 143) | def start(self, barrier=False):
    method stop (line 156) | def stop(self, barrier=False):
    method reset (line 171) | def reset(self):
    method set_elapsed (line 177) | def set_elapsed(self, value):
    method elapsed (line 188) | def elapsed(self, reset=True, barrier=False):
    method active_time (line 212) | def active_time(self):
  class Timers (line 217) | class Timers:
    method __init__ (line 220) | def __init__(self, log_level, log_option):
    method __call__ (line 241) | def __call__(self, name, log_level=None):
    method _get_elapsed_time_all_ranks (line 270) | def _get_elapsed_time_all_ranks(self, names, reset, barrier):
    method _get_global_min_max_time (line 318) | def _get_global_min_max_time(self, names, reset, barrier, normalizer):
    method _get_global_min_max_time_string (line 338) | def _get_global_min_max_time_string(self, names, reset, barrier, norma...
    method _get_all_ranks_time_string (line 357) | def _get_all_ranks_time_string(self, names, reset, barrier, normalizer):
    method get_all_timers_string (line 378) | def get_all_timers_string(
    method log (line 422) | def log(
    method write (line 452) | def write(

FILE: megatron/core/tokenizers/base_tokenizer.py
  class MegatronTokenizerBase (line 6) | class MegatronTokenizerBase(ABC):
    method __init__ (line 9) | def __init__(self, path: str, config: dict, **kwargs) -> None:
    method tokenize (line 26) | def tokenize(self):
    method detokenize (line 31) | def detokenize(self):
    method vocab (line 36) | def vocab(self):
    method vocab_size (line 41) | def vocab_size(self):
    method apply_chat_template (line 46) | def apply_chat_template(self):

FILE: megatron/core/tokenizers/megatron_tokenizer.py
  class MegatronTokenizer (line 37) | class MegatronTokenizer:
    method __init__ (line 40) | def __init__(self) -> None:
    method from_pretrained (line 46) | def from_pretrained(
    method write_metadata (line 104) | def write_metadata(
  function _get_metadata_path (line 170) | def _get_metadata_path(tokenizer_path: str) -> str:
  function _get_tokenizer_model_class (line 188) | def _get_tokenizer_model_class(library: str, metadata: dict) -> Megatron...

FILE: megatron/core/tokenizers/text/libraries/abstract_tokenizer.py
  class MegatronTokenizerTextAbstract (line 7) | class MegatronTokenizerTextAbstract(ABC):
    method text_to_tokens (line 13) | def text_to_tokens(self, text: str) -> List[str]:
    method tokens_to_text (line 26) | def tokens_to_text(self, tokens: List[str]) -> str:
    method tokens_to_ids (line 39) | def tokens_to_ids(self, tokens: List[str]) -> List[int]:
    method ids_to_tokens (line 52) | def ids_to_tokens(self, ids: List[int]) -> List[str]:
    method text_to_ids (line 65) | def text_to_ids(self, text: str) -> List[int]:
    method ids_to_text (line 78) | def ids_to_text(self, ids: List[int]) -> str:
    method add_special_tokens (line 91) | def add_special_tokens(self):
    method cls_id (line 96) | def cls_id(self) -> int:
    method sep_id (line 103) | def sep_id(self) -> int:
    method pad_id (line 110) | def pad_id(self) -> int:
    method eod (line 117) | def eod(self) -> int:
    method bos_id (line 129) | def bos_id(self) -> int:
    method eos_id (line 136) | def eos_id(self) -> int:
    method mask_id (line 143) | def mask_id(self) -> int:

FILE: megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py
  class ByteLevelTokenizer (line 8) | class ByteLevelTokenizer(MegatronTokenizerTextAbstract):
    method __init__ (line 15) | def __init__(
    method text_to_tokens (line 61) | def text_to_tokens(self, text):
    method tokens_to_text (line 67) | def tokens_to_text(self, tokens):
    method text_to_ids (line 73) | def text_to_ids(self, text):
    method ids_to_text (line 79) | def ids_to_text(self, ids):
    method tokens_to_ids (line 87) | def tokens_to_ids(self, tokens):
    method ids_to_tokens (line 98) | def ids_to_tokens(self, ids):
    method token_to_id (line 109) | def token_to_id(self, token):
    method id_to_token (line 118) | def id_to_token(self, id):
    method add_special_tokens (line 127) | def add_special_tokens(self, special_tokens: Union[list, dict]) -> None:
    method pad_id (line 132) | def pad_id(self):
    method bos_id (line 139) | def bos_id(self):
    method eos_id (line 146) | def eos_id(self):
    method eod (line 153) | def eod(self):
    method unk_id (line 160) | def unk_id(self):

FILE: megatron/core/tokenizers/text/libraries/chat_template.py
  class MegatronTokenizerChatTemplate (line 13) | class MegatronTokenizerChatTemplate:
    method apply_chat_template (line 16) | def apply_chat_template(

FILE: megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py
  class HuggingFaceTokenizer (line 20) | class HuggingFaceTokenizer(MegatronTokenizerTextAbstract):
    method __init__ (line 26) | def __init__(
    method add_special_tokens (line 183) | def add_special_tokens(self, special_tokens_dict: dict) -> int:
    method additional_special_tokens_ids (line 213) | def additional_special_tokens_ids(self):
    method text_to_tokens (line 220) | def text_to_tokens(self, text: str) -> List[str]:
    method tokens_to_text (line 225) | def tokens_to_text(self, tokens: List[str]) -> str:
    method token_to_id (line 230) | def token_to_id(self, token: str) -> int:
    method tokens_to_ids (line 234) | def tokens_to_ids(self, tokens: List[str]) -> List[int]:
    method ids_to_tokens (line 239) | def ids_to_tokens(self, ids: List[int]) -> List[str]:
    method text_to_ids (line 244) | def text_to_ids(self, text: str) -> List[int]:
    method ids_to_text (line 252) | def ids_to_text(self, ids: List[int], remove_special_tokens: Optional[...
    method apply_chat_template (line 271) | def apply_chat_template(self, conversation, chat_template, **kwargs):
    method vocab (line 278) | def vocab(self) -> list:
    method inv_vocab (line 284) | def inv_vocab(self) -> dict:
    method vocab_size (line 291) | def vocab_size(self) -> int:
    method pad_id (line 296) | def pad_id(self) -> int:
    method bos_id (line 303) | def bos_id(self) -> int:
    method eos_id (line 310) | def eos_id(self) -> int:
    method eod (line 315) | def eod(self) -> int:
    method sep_id (line 322) | def sep_id(self) -> int:
    method cls_id (line 329) | def cls_id(self) -> int:
    method unk_id (line 336) | def unk_id(self) -> int:
    method mask_id (line 343) | def mask_id(self) -> int:
    method save_vocabulary (line 349) | def save_vocabulary(self, save_directory: str, filename_prefix: str = ...
    method save_pretrained (line 355) | def save_pretrained(self, save_directory: str):

FILE: megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py
  class MegatronHFTokenizer (line 79) | class MegatronHFTokenizer(HuggingFaceTokenizer):
    method __init__ (line 82) | def __init__(
    method _get_vocab_file (line 102) | def _get_vocab_file(self, tokenizer_name: str, vocab_file: str = None)...
    method _get_merges_file (line 122) | def _get_merges_file(self, tokenizer_name: str, merges_file: str = Non...
    method _get_available_models_list (line 144) | def _get_available_models_list(self) -> list:
    method _download (line 149) | def _download(self, path: str, url: str):

FILE: megatron/core/tokenizers/text/libraries/null_tokenizer.py
  class NullTokenizer (line 6) | class NullTokenizer:
    method __init__ (line 14) | def __init__(self, vocab_size):
    method text_to_ids (line 19) | def text_to_ids(self, text):
    method ids_to_text (line 23) | def ids_to_text(self, ids):
    method tokens_to_ids (line 28) | def tokens_to_ids(self, tokens):
    method ids_to_tokens (line 32) | def ids_to_tokens(self, ids):
    method offsets (line 36) | def offsets(self, ids: list[int], text: str) -> list[int]:
    method unique_identifiers (line 45) | def unique_identifiers(self) -> OrderedDict:
    method vocab_size (line 50) | def vocab_size(self):
    method vocab (line 55) | def vocab(self):
    method inv_vocab (line 60) | def inv_vocab(self):
    method cls (line 65) | def cls(self):
    method sep (line 70) | def sep(self):
    method mask (line 75) | def mask(self):
    method eod (line 80) | def eod(self):
    method additional_special_tokens_ids (line 85) | def additional_special_tokens_ids(self):

FILE: megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py
  class SentencePieceTokenizer (line 22) | class SentencePieceTokenizer(MegatronTokenizerTextAbstract, MegatronToke...
    method __init__ (line 25) | def __init__(
    method text_to_tokens (line 92) | def text_to_tokens(self, text: str) -> List[str]:
    method text_to_ids (line 141) | def text_to_ids(self, text, sample_alpha=None) -> List[int]:
    method _text_to_ids (line 148) | def _text_to_ids(self, text, sample_alpha=None) -> List[int]:
    method _text_to_ids_extra_space (line 204) | def _text_to_ids_extra_space(self, text, sample_alpha=None) -> List[int]:
    method tokens_to_text (line 219) | def tokens_to_text(self, tokens: List[str]) -> str:
    method ids_to_text (line 226) | def ids_to_text(self, ids: List[int]) -> str:
    method token_to_id (line 246) | def token_to_id(self, token: str) -> int:
    method ids_to_tokens (line 253) | def ids_to_tokens(self, ids: List[int]) -> List[str]:
    method tokens_to_ids (line 263) | def tokens_to_ids(
    method add_special_tokens (line 275) | def add_special_tokens(self, special_tokens: Union[list, dict]) -> None:
    method offsets (line 313) | def offsets(self, ids: list[int], text: str) -> list[int]:
    method pad_id (line 318) | def pad_id(self) -> int:
    method bos_id (line 327) | def bos_id(self) -> int:
    method eos_id (line 336) | def eos_id(self) -> int:
    method sep_id (line 345) | def sep_id(self) -> int:
    method cls_id (line 356) | def cls_id(self) -> int:
    method mask_id (line 367) | def mask_id(self) -> int:
    method unk_id (line 378) | def unk_id(self) -> int:
    method additional_special_tokens_ids (line 383) | def additional_special_tokens_ids(self) -> list:
    method vocab (line 401) | def vocab(self) -> list:
    method inv_vocab (line 413) | def inv_vocab(self) -> dict:

FILE: megatron/core/tokenizers/text/libraries/sft_tokenizer.py
  class PromptConfig (line 27) | class PromptConfig:
  class SFTTokenizer (line 46) | class SFTTokenizer:
    method __init__ (line 49) | def __init__(self, tokenizer_path: str, prompt_format: str):
    method tokenize_conversation (line 111) | def tokenize_conversation(
    method text_to_ids (line 188) | def text_to_ids(self, text: Union[str, List[Dict]]):
    method tokens_to_ids (line 198) | def tokens_to_ids(self, tokens: List[str]):
    method ids_to_text (line 202) | def ids_to_text(self, tokens: List[int]):
    method ids_to_tokens (line 206) | def ids_to_tokens(self):
    method text_to_tokens (line 210) | def text_to_tokens(self):
    method tokens_to_text (line 214) | def tokens_to_text(self):
    method get_special_tokens (line 218) | def get_special_tokens(self):
    method add_special_tokens (line 222) | def add_special_tokens(self):
    method pad_id (line 227) | def pad_id(self):
    method bos_id (line 232) | def bos_id(self):
    method eod (line 237) | def eod(self):
    method vocab (line 242) | def vocab(self):
    method inv_vocab (line 247) | def inv_vocab(self):
    method vocab_size (line 252) | def vocab_size(self):

FILE: megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py
  function reload_mergeable_ranks (line 31) | def reload_mergeable_ranks(
  class TikTokenTokenizer (line 76) | class TikTokenTokenizer(MegatronTokenizerTextAbstract, MegatronTokenizer...
    method __init__ (line 79) | def __init__(
    method text_to_tokens (line 172) | def text_to_tokens(self, text: str) -> List[str]:
    method tokens_to_text (line 177) | def tokens_to_text(self, tokens: List[int]) -> str:
    method token_to_id (line 182) | def token_to_id(self, token: str) -> int:
    method tokens_to_ids (line 189) | def tokens_to_ids(self, tokens: List[str]) -> List[int]:
    method id_to_token (line 193) | def id_to_token(self, token_id: int) -> str:
    method ids_to_tokens (line 201) | def ids_to_tokens(self, token_ids: List[int]) -> List[str]:
    method text_to_ids (line 209) | def text_to_ids(self, text: str) -> List[int]:
    method ids_to_text (line 214) | def ids_to_text(self, tokens: List[int], remove_special_tokens: bool =...
    method add_special_tokens (line 232) | def add_special_tokens(self, special_tokens_dict: dict):
    method offsets (line 236) | def offsets(self, ids: list[int], text: str) -> list[int]:
    method additional_special_tokens_ids (line 255) | def additional_special_tokens_ids(self) -> list:
    method bos_id (line 271) | def bos_id(self) -> int:
    method eos_id (line 276) | def eos_id(self) -> int:
    method eod (line 281) | def eod(self) -> int:
    method unk_id (line 286) | def unk_id(self) -> int:
    method mask_id (line 291) | def mask_id(self) -> int:
    method pad_id (line 296) | def pad_id(self) -> int:
    method cls_id (line 301) | def cls_id(self) -> int:
    method sep_id (line 306) | def sep_id(self) -> int:
    method vocab (line 311) | def vocab(self):
    method decoder (line 316) | def decoder(self):
    method encoder (line 321) | def encoder(self):
    method vocab_size (line 326) | def vocab_size(self) -> int:
    method inv_vocab (line 331) | def inv_vocab(self) -> dict:

FILE: megatron/core/tokenizers/text/models/bert_tokenizer.py
  class BertTokenizer (line 6) | class BertTokenizer(MegatronTokenizerText):
    method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->...

FILE: megatron/core/tokenizers/text/models/default_tokenizer.py
  class DefaultTokenizerText (line 6) | class DefaultTokenizerText(MegatronTokenizerText):
    method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->...

FILE: megatron/core/tokenizers/text/models/gpt_tokenizer.py
  class GPTTokenizer (line 6) | class GPTTokenizer(MegatronTokenizerText):
    method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->...

FILE: megatron/core/tokenizers/text/models/mamba_tokenizer.py
  class MambaTokenizer (line 6) | class MambaTokenizer(MegatronTokenizerText):
    method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->...

FILE: megatron/core/tokenizers/text/models/t5_tokenizer.py
  class T5Tokenizer (line 6) | class T5Tokenizer(MegatronTokenizerText):
    method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->...

FILE: megatron/core/tokenizers/text/parsers/base_parser.py
  class BaseParser (line 5) | class BaseParser:
    method parse (line 9) | def parse(text: str, **kwargs) -> tuple[str, dict[str, Any]]:

FILE: megatron/core/tokenizers/text/parsers/deepseek_r1_reasoning_parser.py
  class DeepSeekR1ReasoningParser (line 5) | class DeepSeekR1ReasoningParser(BaseParser):
    method parse (line 9) | def parse(text: str, **kwargs) -> tuple[str, dict[str, str]]:

FILE: megatron/core/tokenizers/text/parsers/qwen3_coder_tool_parser.py
  class _Qwen3CoderToolParser (line 25) | class _Qwen3CoderToolParser:
    method _generate_tool_call_id (line 40) | def _generate_tool_call_id(self) -> str:
    method _get_arguments_config (line 44) | def _get_arguments_config(
    method _convert_param_value (line 69) | def _convert_param_value(
    method _parse_xml_function_call (line 172) | def _parse_xml_function_call(
    method _get_function_calls (line 202) | def _get_function_calls(self, model_output: str) -> list[str]:
    method extract_tool_calls (line 218) | def extract_tool_calls(
  class Qwen3CoderToolParser (line 259) | class Qwen3CoderToolParser(BaseParser):
    method parse (line 263) | def parse(text: str, **kwargs) -> tuple[str, dict[str, list[dict]]]:

FILE: megatron/core/tokenizers/text/text_tokenizer.py
  class MegatronTokenizerText (line 22) | class MegatronTokenizerText(MegatronTokenizerBase):
    method __init__ (line 25) | def __init__(self, path: str, config: dict, **kwargs) -> None:
    method _restore_model (line 53) | def _restore_model(self, **kwargs) -> MegatronTokenizerTextAbstract:
    method tokenize (line 65) | def tokenize(self, text: str) -> List[int]:
    method detokenize (line 78) | def detokenize(self, ids: List[int]) -> str:
    method apply_chat_template (line 91) | def apply_chat_template(
    method tokenize_conversation (line 116) | def tokenize_conversation(
    method save_pretrained (line 142) | def save_pretrained(self, path: str) -> None:
    method add_special_tokens (line 157) | def add_special_tokens(self, special_tokens: Union[list, dict]) -> None:
    method offsets (line 171) | def offsets(self, ids: list[int], text: str) -> list[int]:
    method space_sensitive (line 176) | def space_sensitive(self):
    method additional_special_tokens_ids (line 186) | def additional_special_tokens_ids(self) -> list:
    method vocab_size (line 191) | def vocab_size(self) -> int:
    method vocab (line 196) | def vocab(self):
    method unique_identifiers (line 201) | def unique_identifiers(self) -> OrderedDict:
    method pad (line 212) | def pad(self) -> int:
    method pad_id (line 217) | def pad_id(self) -> int:
    method eod (line 222) | def eod(self) -> int:
    method bos (line 227) | def bos(self) -> int:
    method bos_id (line 232) | def bos_id(self) -> int:
    method eos_id (line 237) | def eos_id(self) -> int:
    method eos (line 242) | def eos(self) -> int:
    method unk (line 247) | def unk(self) -> int:
    method unk_id (line 252) | def unk_id(self) -> int:
    method mask (line 257) | def mask(self) -> int:
    method mask_id (line 262) | def mask_id(self) -> int:
    method cls (line 267) | def cls(self) -> int:
    method cls_id (line 272) | def cls_id(self) -> int:
    method sep (line 277) | def sep(self) -> int:
    method sep_id (line 282) | def sep_id(self) -> int:
    method vocab_file (line 287) | def vocab_file(self) -> str:
    method merges_file (line 292) | def merges_file(self) -> str:
    method inv_vocab (line 297) | def inv_vocab(self) -> dict:

FILE: megatron/core/tokenizers/utils/build_tokenizer.py
  function build_tokenizer (line 15) | def build_tokenizer(args, **kwargs):
  function vocab_size_with_padding (line 98) | def vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True):
  function _set_padded_vocab_size (line 114) | def _set_padded_vocab_size(args, tokenizer):

FILE: megatron/core/tokenizers/vision/libraries/multimodal_tokenizer.py
  class MegatronMultimodalTokenizer (line 55) | class MegatronMultimodalTokenizer:
    method __init__ (line 58) | def __init__(
    method _apply_image_tag (line 190) | def _apply_image_tag(self, text: Union[str, List[Dict]]):
    method tokenize (line 205) | def tokenize(self, text: Union[str, List[Dict]]):
    method _encode (line 213) | def _encode(self, text: str):
    method tokenize_conversation (line 218) | def tokenize_conversation(
    method convert_tokens_to_ids (line 305) | def convert_tokens_to_ids(self, tokens: List[str]):
    method detokenize (line 309) | def detokenize(self, tokens: List[int]):
    method add_special_tokens (line 313) | def add_special_tokens(self, special_tokens: List[str]):
    method get_special_tokens (line 317) | def get_special_tokens(self):
    method pad (line 322) | def pad(self):
    method eod (line 327) | def eod(self):
    method vocab_size (line 332) | def vocab_size(self):
    method vocab (line 337) | def vocab(self):

FILE: megatron/core/tokenizers/vision/libraries/null_multimodal_tokenizer.py
  class MegatronNullMultimodalTokenizer (line 4) | class MegatronNullMultimodalTokenizer:
    method __init__ (line 7) | def __init__(self, vocab_size, image_token=None, image_token_id=None):
    method tokenize (line 22) | def tokenize(self, text):
    method detokenize (line 34) | def detokenize(self, ids):
    method offsets (line 47) | def offsets(self, ids: list[int], text: str) -> list[int]:
    method convert_tokens_to_ids (line 55) | def convert_tokens_to_ids(self, tokens):
    method vocab_size (line 63) | def vocab_size(self):
    method cls (line 68) | def cls(self):
    method sep (line 73) | def sep(self):
    method mask (line 78) | def mask(self):
    method eod (line 83) | def eod(self):
    method additional_special_tokens_ids (line 88) | def additional_special_tokens_ids(self):

FILE: megatron/core/tokenizers/vision/models/default_tokenizer.py
  class DefaultTokenizerVision (line 6) | class DefaultTokenizerVision(MegatronTokenizerVision):
    method __init__ (line 9) | def __init__(self, path: str = None, config: dict = None, **kwargs) ->...

FILE: megatron/core/tokenizers/vision/vision_tokenizer.py
  class MegatronTokenizerVision (line 16) | class MegatronTokenizerVision(MegatronTokenizerBase):
    method __init__ (line 19) | def __init__(self, path: str, config: dict, **kwargs) -> None:
    method _restore_model (line 34) | def _restore_model(self, **kwargs):
    method tokenize (line 46) | def tokenize(self, text: Union[str, List[Dict]]) -> List[int]:
    method detokenize (line 59) | def detokenize(self, ids: List[int]) -> str:
    method tokenize_conversation (line 72) | def tokenize_conversation(
    method add_special_tokens (line 94) | def add_special_tokens(self, special_tokens: Union[list, dict]) -> None:
    method convert_tokens_to_ids (line 108) | def convert_tokens_to_ids(self, tokens: List[str]):
    method apply_chat_template (line 112) | def apply_chat_template(self):
    method get_special_tokens (line 116) | def get_special_tokens(self) -> list:
    method offsets (line 120) | def offsets(self, ids: list[int], text: str) -> list[int]:
    method vocab (line 125) | def vocab(self):
    method vocab_size (line 130) | def vocab_size(self) -> int:
    method pad (line 135) | def pad(self):
    method eod (line 140) | def eod(self):

FILE: megatron/core/transformer/attention.py
  class LinearQkv (line 118) | class LinearQkv(Protocol):
    method forward (line 121) | def forward(self, input: Tensor, /) -> tuple[Tensor, object]:
    method backward_dw (line 125) | def backward_dw(self) -> None:
  class LinearQkvBuilder (line 130) | class LinearQkvBuilder(Protocol):
    method __call__ (line 133) | def __call__(
  class LinearLayer (line 150) | class LinearLayer(Protocol):
    method forward (line 153) | def forward(self, input: Tensor, /) -> Tuple[Tensor, object]:
  class LinearLayerBuilder (line 158) | class LinearLayerBuilder(Protocol):
    method __call__ (line 161) | def __call__(
  class CoreAttention (line 176) | class CoreAttention(Protocol):
    method forward (line 179) | def forward(
  class CoreAttentionBuilder (line 195) | class CoreAttentionBuilder(Protocol):
    method __call__ (line 198) | def __call__(
  class SelfAttentionSubmodules (line 212) | class SelfAttentionSubmodules:
  class CrossAttentionSubmodules (line 225) | class CrossAttentionSubmodules:
  class Attention (line 236) | class Attention(MegatronModule, ABC):
    method __init__ (line 243) | def __init__(
    method _checkpointed_attention_forward (line 381) | def _checkpointed_attention_forward(
    method _allocate_memory (line 421) | def _allocate_memory(self, inference_max_sequence_length, batch_size, ...
    method _get_pp_layer_offset_for_inference (line 433) | def _get_pp_layer_offset_for_inference(self):
    method _adjust_key_value_for_inference (line 455) | def _adjust_key_value_for_inference(
    method get_query_key_value_tensors (line 635) | def get_query_key_value_tensors(
    method flash_decode (line 651) | def flash_decode(
    method _flash_attention_3_forward_wrapper (line 697) | def _flash_attention_3_forward_wrapper(
    method flash_decode_and_prefill (line 765) | def flash_decode_and_prefill(
    method forward (line 884) | def forward(
    method _apply_output_gate (line 1227) | def _apply_output_gate(self, x, gate):
    method set_for_recompute_input_layernorm (line 1235) | def set_for_recompute_input_layernorm(self):
    method clip_qk (line 1239) | def clip_qk(self):
  class SelfAttention (line 1247) | class SelfAttention(Attention):
    method __init__ (line 1254) | def __init__(
    method run_realtime_tests (line 1309) | def run_realtime_tests(self):
    method get_query_key_value_tensors (line 1380) | def get_query_key_value_tensors(
    method backward_dw (line 1512) | def backward_dw(self) -> None:
    method _backward_qkv_proj (line 1517) | def _backward_qkv_proj(self):
    method _backward_output_proj (line 1521) | def _backward_output_proj(self):
    method set_for_recompute_input_layernorm (line 1525) | def set_for_recompute_input_layernorm(self):
    method clip_qk (line 1531) | def clip_qk(self):
    method _clip_linear_qkv (line 1580) | def _clip_linear_qkv(self, weight):
  class CrossAttention (line 1629) | class CrossAttention(Attention):
    method __init__ (line 1636) | def __init__(
    method get_query_key_value_tensors (line 1681) | def get_query_key_value_tensors(

FILE: megatron/core/transformer/cuda_graphs.py
  function is_graph_capturing (line 84) | def is_graph_capturing():
  function _set_capture_start (line 89) | def _set_capture_start():
  function _set_capture_end (line 95) | def _set_capture_end():
  function is_graph_warmup (line 101) | def is_graph_warmup():
  function _set_warmup_start (line 106) | def _set_warmup_start():
  function _set_warmup_end (line 112) | def _set_warmup_end():
  class CudagraphBufferMetadata (line 119) | class CudagraphBufferMetadata:
  class ArgMetadata (line 135) | class ArgMetadata:
    method __init__ (line 138) | def __init__(self, arg):
    method zeros_like (line 152) | def zeros_like(self):
  class TensorReusePool (line 161) | class TensorReusePool:
    method insert (line 179) | def insert(self, tensor: torch.Tensor):
    method owns (line 184) | def owns(self, tensor: torch.Tensor):
    method get (line 188) | def get(self, meta: ArgMetadata):
  function tree_map (line 204) | def tree_map(func, tree):
  function _check_supported_type (line 223) | def _check_supported_type(meta):
  function _determine_if_first_last_layer_of_this_vp_chunk (line 249) | def _determine_if_first_last_layer_of_this_vp_chunk(base_module):
  function _clone_nested_tensors (line 278) | def _clone_nested_tensors(value: Any) -> Any:
  function _ensure_generator_state_is_cudagraph_safe (line 293) | def _ensure_generator_state_is_cudagraph_safe(gen: torch.Generator) -> t...
  class _CudagraphGlobalRecord (line 320) | class _CudagraphGlobalRecord:
    method record_fwd_graph (line 338) | def record_fwd_graph(cls, runner, args, kwargs, out):
    method record_bwd_graph (line 343) | def record_bwd_graph(cls, runner):
    method create_cudagraphs (line 348) | def create_cudagraphs(cls):
  function create_cudagraphs (line 478) | def create_cudagraphs():
  function delete_cuda_graphs (line 491) | def delete_cuda_graphs():
  class _GraphStatus (line 521) | class _GraphStatus(Enum):
  class _CudagraphRecordNode (line 528) | class _CudagraphRecordNode(torch.autograd.Function):
    method forward (line 533) | def forward(ctx, runner, inputs):
    method backward (line 544) | def backward(ctx, grads):
  class _CudagraphReplayNode (line 560) | class _CudagraphReplayNode(torch.autograd.Function):
    method forward (line 565) | def forward(ctx, runner, is_first_microbatch, *inputs):
    method backward (line 618) | def backward(ctx, *grads):
  class _CudaGraphRunner (line 670) | class _CudaGraphRunner(torch.nn.Module):
    method __init__ (line 675) | def __init__(
    method __str__ (line 746) | def __str__(self):
    method get_quantization_context (line 752) | def get_quantization_context(self):
    method get_connected_params (line 765) | def get_connected_params(self, outputs):
    method create_fwd_graph (line 789) | def create_fwd_graph(self, args, kwargs, outputs=None, clone_inputs=Tr...
    method create_bwd_graph (line 1023) | def create_bwd_graph(self):
    method apply_cudagraph_record_metadata (line 1170) | def apply_cudagraph_record_metadata(self, args, kwargs, outputs):
    method record_graph_capture (line 1188) | def record_graph_capture(self, args, kwargs):
    method replay_graph_capture (line 1245) | def replay_graph_capture(self, is_first_microbatch, args, kwargs):
    method get_mismatch_errors (line 1268) | def get_mismatch_errors(self, args, kwargs):
    method get_arg_metas (line 1333) | def get_arg_metas(self, args, kwargs=None):
    method get_tensors (line 1348) | def get_tensors(self, args, kwargs=None, check_types=True):
    method to_list (line 1379) | def to_list(self, x):
  class CudaGraphManager (line 1384) | class CudaGraphManager(torch.nn.Module):
    method __init__ (line 1390) | def __init__(
    method call_ddp_preforward_hook (line 1452) | def call_ddp_preforward_hook(self, module):
    method get_cudagraph_runner (line 1467) | def get_cudagraph_runner(self, megatron_module, args, kwargs, reuse_cu...
    method __call__ (line 1546) | def __call__(self, megatron_module, args, kwargs):
  function _layer_is_graphable (line 1641) | def _layer_is_graphable(layer, config):
  class TECudaGraphHelper (line 1684) | class TECudaGraphHelper:
    method __init__ (line 1693) | def __init__(self, model, config, seq_length, micro_batch_size, optimi...
    method _discover_layers (line 1729) | def _discover_layers(self):
    method capture_finished (line 1804) | def capture_finished(self):
    method graphs_created (line 1814) | def graphs_created(self):
    method _get_sample_arguments (line 1823) | def _get_sample_arguments(self, order, chunk_id_list=None):
    method _get_cuda_graph_input_data (line 2068) | def _get_cuda_graph_input_data(self):
    method _start_capturing (line 2217) | def _start_capturing(self):
    method _reset_after_capture (line 2233) | def _reset_after_capture(self):
    method _finish_capturing (line 2247) | def _finish_capturing(self, start_time):
    method create_cudagraphs (line 2268) | def create_cudagraphs(self):
    method cuda_graph_set_manual_hooks (line 2315) | def cuda_graph_set_manual_hooks(self):
    method delete_cuda_graphs (line 2325) | def delete_cuda_graphs(self):
  function convert_schedule_table_to_order (line 2356) | def convert_schedule_table_to_order(num_warmup_microbatches, num_model_c...
  function get_overlap_moe_expert_parallel_comm_order (line 2382) | def get_overlap_moe_expert_parallel_comm_order(order, num_layers_per_chu...
  function set_current_microbatch (line 2491) | def set_current_microbatch(model, microbatch_id):
  function _wrap_graph_for_vision (line 2540) | def _wrap_graph_for_vision(graph_fn):
  function get_vision_cuda_graph_seq_length (line 2562) | def get_vision_cuda_graph_seq_length(vision_config, default_seq_length: ...
  class VisionTECudaGraphHelper (line 2593) | class VisionTECudaGraphHelper(TECudaGraphHelper):
    method __init__ (line 2624) | def __init__(
    method _discover_layers (line 2639) | def _discover_layers(self):
    method _reset_after_capture (line 2693) | def _reset_after_capture(self):
    method _finish_capturing (line 2702) | def _finish_capturing(self, start_time):
    method _get_sample_arguments (line 2715) | def _get_sample_arguments(self, order, chunk_id_list=None):
    method cuda_graph_set_manual_hooks (line 2757) | def cuda_graph_set_manual_hooks(self):

FILE: megatron/core/transformer/custom_layers/batch_invariant_kernels.py
  function _matmul_launch_metadata (line 42) | def _matmul_launch_metadata(
  function _compute_pid (line 63) | def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM...
  function matmul_kernel_persistent (line 73) | def matmul_kernel_persistent(
  function get_compute_units (line 154) | def get_compute_units():
  function matmul_persistent (line 178) | def matmul_persistent(a: torch.Tensor, b: torch.Tensor, bias: torch.Tens...
  function _log_softmax_kernel (line 251) | def _log_softmax_kernel(
  function log_softmax (line 308) | def log_softmax(input: torch.Tensor, dim: int = -1) -> torch.Tensor:
  function mean_kernel (line 344) | def mean_kernel(
  function mean_dim (line 391) | def mean_dim(
  function mm_batch_invariant (line 481) | def mm_batch_invariant(a, b):
  function addmm_batch_invariant (line 486) | def addmm_batch_invariant(bias, a, b):
  function _log_softmax_batch_invariant (line 491) | def _log_softmax_batch_invariant(input, dim, _half_to_float):
  function mean_batch_invariant (line 496) | def mean_batch_invariant(input, dim, keepdim=False, dtype: torch.dtype |...
  function get_batch_invariant_attention_block_size (line 516) | def get_batch_invariant_attention_block_size() -> AttentionBlockSize:
  function _import_module_if_available (line 530) | def _import_module_if_available(name: str):
  function _te_patch_for_batch_invariant (line 537) | def _te_patch_for_batch_invariant():
  function _te_unpatch_for_batch_invariant (line 621) | def _te_unpatch_for_batch_invariant():
  function _extract_te_gemm_args (line 694) | def _extract_te_gemm_args(args: tuple, kwargs: Dict[str, Any]):
  function _is_supported_dtype_for_bik (line 709) | def _is_supported_dtype_for_bik(t: torch.dtype) -> bool:
  class BatchInvariantTEGemmFn (line 713) | class BatchInvariantTEGemmFn(torch.autograd.Function):
    method forward (line 717) | def forward(
    method backward (line 775) | def backward(ctx, grad_output: torch.Tensor):
  function _te_general_gemm_patched (line 830) | def _te_general_gemm_patched(*args, **kwargs) -> List[torch.Tensor]:
  class BatchInvariantRMSNormFn (line 877) | class BatchInvariantRMSNormFn(torch.autograd.Function):
    method forward (line 881) | def forward(ctx, x: torch.Tensor, weight: torch.Tensor, eps: float, ze...
    method backward (line 912) | def backward(ctx, grad_output: torch.Tensor):
  function rmsnorm_batch_invariant (line 938) | def rmsnorm_batch_invariant(x: torch.Tensor, weight: torch.Tensor, eps: ...
  function _te_rmsnorm_forward_patched (line 948) | def _te_rmsnorm_forward_patched(self, x: torch.Tensor) -> torch.Tensor:
  function is_batch_invariant_mode_enabled (line 960) | def is_batch_invariant_mode_enabled():
  function enable_batch_invariant_mode (line 965) | def enable_batch_invariant_mode():
  function disable_batch_invariant_mode (line 981) | def disable_batch_invariant_mode():
  function set_batch_invariant_mode (line 993) | def set_batch_invariant_mode(enabled: bool = True):

FILE: megatron/core/transformer/dot_product_attention.py
  class DotProductAttention (line 26) | class DotProductAttention(MegatronModule):
    method __init__ (line 42) | def __init__(
    method forward (line 142) | def forward(
    method sharded_state_dict (line 253) | def sharded_state_dict(

FILE: megatron/core/transformer/enums.py
  class ModelType (line 8) | class ModelType(enum.Enum):
  class LayerType (line 17) | class LayerType(enum.Enum):
  class AttnType (line 33) | class AttnType(enum.Enum):
  class AttnMaskType (line 40) | class AttnMaskType(enum.Enum):
  class AttnBackend (line 51) | class AttnBackend(enum.Enum):
  class CudaGraphScope (line 61) | class CudaGraphScope(enum.Enum):

FILE: megatron/core/transformer/experimental_attention_variant/absorbed_mla.py
  class AbsorbedMLASelfAttentionSubmodules (line 62) | class AbsorbedMLASelfAttentionSubmodules:
  class AbsorbedMLASelfAttention (line 79) | class AbsorbedMLASelfAttention(Attention):
    method __init__ (line 93) | def __init__(
    method get_query_key_value_tensors (line 339) | def get_query_key_value_tensors(
    method _checkpointed_attention_forward (line 633) | def _checkpointed_attention_forward(
    method forward (line 692) | def forward(
    method backward_dw (line 805) | def backward_dw(self) -> NoReturn:
    method _backward_kv_proj (line 811) | def _backward_kv_proj(self):
    method _backward_q_proj (line 817) | def _backward_q_proj(self):
    method _backward_output_proj (line 825) | def _backward_output_proj(self):
    method set_for_recompute_input_layernorm (line 829) | def set_for_recompute_input_layernorm(self):
    method clip_qk (line 837) | def clip_qk(self):
    method _combine_kv_weights (line 845) | def _combine_kv_weights(self, k_weight, v_weight):
    method _split_kv_weights (line 882) | def _split_kv_weights(self, combined_weight):
    method _load_from_state_dict (line 917) | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):

FILE: megatron/core/transformer/experimental_attention_variant/dsa.py
  function rotate_activation (line 30) | def rotate_activation(x: torch.Tensor) -> torch.Tensor:
  class DSAIndexerLossLoggingHelper (line 49) | class DSAIndexerLossLoggingHelper:
    method save_loss_to_tracker (line 55) | def save_loss_to_tracker(
    method clean_loss_in_tracker (line 83) | def clean_loss_in_tracker():
    method reduce_loss_in_tracker (line 92) | def reduce_loss_in_tracker():
    method track_indexer_metrics (line 116) | def track_indexer_metrics(
  function compute_dsa_indexer_loss (line 161) | def compute_dsa_indexer_loss(
  function _compute_index_scores (line 255) | def _compute_index_scores(q: torch.Tensor, weights: torch.Tensor, k: tor...
  function fused_qk_topk_naive (line 298) | def fused_qk_topk_naive(
  function fwd_fused_indexer_loss_naive (line 326) | def fwd_fused_indexer_loss_naive(
  function bwd_fused_indexer_loss_naive (line 346) | def bwd_fused_indexer_loss_naive(
  class FusedDSAIndexerLoss (line 510) | class FusedDSAIndexerLoss(torch.autograd.Function):
    method forward (line 514) | def forward(
    method backward (line 555) | def backward(ctx, grad_topk_indices, grad_loss):
  class DSAIndexerLossAutoScaler (line 579) | class DSAIndexerLossAutoScaler(torch.autograd.Function):
    method forward (line 589) | def forward(ctx, output: torch.Tensor, indexer_loss: torch.Tensor):
    method backward (line 603) | def backward(ctx, grad_output: torch.Tensor):
    method set_loss_scale (line 623) | def set_loss_scale(scale: torch.Tensor):
  class DSAIndexerSubmodules (line 636) | class DSAIndexerSubmodules:
  class DSAttentionSubmodules (line 654) | class DSAttentionSubmodules:
  class DSAIndexer (line 665) | class DSAIndexer(MegatronModule):
    method __init__ (line 676) | def __init__(
    method _apply_rope (line 779) | def _apply_rope(self, x: torch.Tensor, rotary_pos_emb: torch.Tensor, m...
    method forward_before_topk (line 798) | def forward_before_topk(
    method forward_with_scores (line 863) | def forward_with_scores(
    method forward (line 897) | def forward(
  function unfused_dsa_fn (line 920) | def unfused_dsa_fn(query, key, value, topk_indices, softmax_scale):
  class DSAttention (line 973) | class DSAttention(MegatronModule):
    method __init__ (line 982) | def __init__(
    method forward (line 1010) | def forward(

FILE: megatron/core/transformer/fsdp_dtensor_checkpoint.py
  function get_ep_layer_offset (line 51) | def get_ep_layer_offset(num_experts: int | None = None) -> int:
  function get_expert_index_from_key (line 69) | def get_expert_index_from_key(key):
  function handle_experts_in_state_dict (line 93) | def handle_experts_in_state_dict(state_dict, num_experts: int | None = N...
  function expert_param_local_key (line 151) | def expert_param_local_key(key: str, num_experts: int | None = None) -> ...
  function handle_swiglu_in_state_dict (line 180) | def handle_swiglu_in_state_dict(model, model_state_dict, optimizer_state...
  function handle_fp8_extra_state_case (line 332) | def handle_fp8_extra_state_case(model_state_dict):
  function flatten_state_dict (line 343) | def flatten_state_dict(obj, parent_key="", sep="."):
  function print_diff_in_state_dicts (line 361) | def print_diff_in_state_dicts(state_dict_metadata, load_state_dict, limi...
  function validate_loaded_state_dict (line 394) | def validate_loaded_state_dict(state_dict, checkpoint_path):
  function get_global_unique_param_name (line 443) | def get_global_unique_param_name(model_chunks, param):

FILE: megatron/core/transformer/heterogeneous/heterogeneous_config.py
  class AttentionConfig (line 11) | class AttentionConfig:
    method build_config_from_dict (line 25) | def build_config_from_dict(
  class MLPConfig (line 60) | class MLPConfig:
    method build_config_from_dict (line 74) | def build_config_from_dict(cls, block_config_dict: dict, hidden_size: ...
    method ffn_mult_to_intermediate_size (line 101) | def ffn_mult_to_intermediate_size(ffn_mult: float, hidden_size: int) -...
    method find_multiple (line 117) | def find_multiple(n: int, k: int) -> int:
  class TransformerBlockConfig (line 134) | class TransformerBlockConfig:
  class HeterogeneousTransformerConfig (line 147) | class HeterogeneousTransformerConfig(TransformerConfig):
    method __post_init__ (line 201) | def __post_init__(self):
    method get_config_for_layer (line 229) | def get_config_for_layer(self, layer_number: int) -> TransformerConfig:

FILE: megatron/core/transformer/heterogeneous/linear_replacements.py
  function _gather_from_tensor_parallel_region (line 25) | def _gather_from_tensor_parallel_region(x: Tensor, config: TransformerCo...
  class TELayerNormColumnParallelLinearGathered (line 49) | class TELayerNormColumnParallelLinearGathered(TELayerNormColumnParallelL...
    method __init__ (line 55) | def __init__(self, config: TransformerConfig, tp_comm_buffer_name: str...
    method forward (line 68) | def forward(self, x, **kwargs):
  class ColumnParallelLinearGathered (line 78) | class ColumnParallelLinearGathered(ColumnParallelLinear):
    method __init__ (line 84) | def __init__(self, config: TransformerConfig, *args, **kwargs):
    method forward (line 96) | def forward(

FILE: megatron/core/transformer/identity_op.py
  class IdentityOp (line 9) | class IdentityOp(torch.nn.Module):
    method __init__ (line 14) | def __init__(self, *args: object, **kwargs: object):
    method forward (line 17) | def forward(self, x: T, *args: object, **kwargs: object) -> T:
  class IdentityFuncOp (line 25) | class IdentityFuncOp(IdentityOp):
    method __init__ (line 32) | def __init__(self, *args: object, **kwargs: object):
    method forward (line 35) | def forward(self, *args: object, **kwargs: object):

FILE: megatron/core/transformer/mlp.py
  class LinearFc1Interface (line 48) | class LinearFc1Interface(Protocol):
    method forward (line 51) | def forward(self, hidden_states: torch.Tensor, /) -> tuple[torch.Tenso...
    method backward_dw (line 55) | def backward_dw(self) -> None:
  class LinearFc1Builder (line 60) | class LinearFc1Builder(Protocol):
    method __call__ (line 63) | def __call__(
  class TEActivationFunctionInterface (line 83) | class TEActivationFunctionInterface(Protocol):
    method forward (line 86) | def forward(self, input_: torch.Tensor, /) -> torch.Tensor:
  class TEActivationFunctionBuilder (line 91) | class TEActivationFunctionBuilder(Protocol):
    method __call__ (line 94) | def __call__(self, *, config: TransformerConfig) -> TEActivationFuncti...
  class LinearFc2Interface (line 99) | class LinearFc2Interface(Protocol):
    method forward (line 102) | def forward(self, hidden_states: torch.Tensor, /) -> tuple[torch.Tenso...
    method backward_dw (line 106) | def backward_dw(self) -> None:
  class LinearFc2Builder (line 111) | class LinearFc2Builder(Protocol):
    method __call__ (line 114) | def __call__(
  class MLPSubmodules (line 134) | class MLPSubmodules:
  class MLP (line 150) | class MLP(MegatronModule):
    method __init__ (line 167) | def __init__(
    method forward (line 246) | def forward(
    method sharded_state_dict (line 348) | def sharded_state_dict(
    method backward_dw (line 365) | def backward_dw(self):
  function apply_swiglu_sharded_factory (line 371) | def apply_swiglu_sharded_factory(

FILE: megatron/core/transformer/module.py
  function param_is_not_shared (line 26) | def param_is_not_shared(param):  # pylint: disable=missing-function-docs...
  class MegatronModule (line 30) | class MegatronModule(torch.nn.Module):
    method __init__ (line 41) | def __init__(self, config: TransformerConfig):
    method state_dict_for_save_checkpoint (line 45) | def state_dict_for_save_checkpoint(self, prefix: str = '', keep_vars: ...
    method sharded_state_dict (line 59) | def sharded_state_dict(
    method set_is_first_microbatch (line 106) | def set_is_first_microbatch(self):
    method set_symmetric_ar (line 124) | def set_symmetric_ar(self, set_to: Optional[str] = None) -> None:
  class GraphableMegatronModule (line 158) | class GraphableMegatronModule(MegatronModule):
    method __init__ (line 166) | def __init__(self, config: TransformerConfig, vp_stage: Optional[int] ...
    method init_backward_dw_wrapper (line 201) | def init_backward_dw_wrapper(self):
    method set_te_cuda_graph_backward_dw_wrapper (line 211) | def set_te_cuda_graph_backward_dw_wrapper(self):
    method _te_cuda_graph_backward_dw_graph (line 220) | def _te_cuda_graph_backward_dw_graph(self, microbatch_idx):
    method get_layer_static_inputs (line 229) | def get_layer_static_inputs(self, seq_length, micro_batch_size):
    method setup_manual_hooks (line 257) | def setup_manual_hooks(self, make_hook_func):
    method _get_submodules_under_cudagraphs (line 276) | def _get_submodules_under_cudagraphs(self):
    method _te_cuda_graph_capture (line 283) | def _te_cuda_graph_capture(self, *args, **kwargs):
    method _te_cuda_graph_replay (line 290) | def _te_cuda_graph_replay(self, *args, **kwargs):
    method _get_te_cuda_graph_replay_args (line 311) | def _get_te_cuda_graph_replay_args(self, *args, **kwargs):
    method _should_call_local_cudagraph (line 327) | def _should_call_local_cudagraph(self, *args, **kwargs):
    method _should_call_te_cudagraph (line 333) | def _should_call_te_cudagraph(self, *args, **kwargs):
    method __call__ (line 345) | def __call__(self, *args, **kwargs):
  function conversion_helper (line 359) | def conversion_helper(val, conversion):
  function fp32_to_float16 (line 379) | def fp32_to_float16(val, float16_convertor):
  function float16_to_fp32 (line 398) | def float16_to_fp32(val):
  class Float16Module (line 416) | class Float16Module(MegatronModule):
    method __init__ (line 428) | def __init__(self, config: TransformerConfig, module: torch.nn.Module):
    method set_input_tensor (line 454) | def set_input_tensor(self, input_tensor):  # pylint: disable=missing-f...
    method forward (line 457) | def forward(self, *inputs, fp32_output=True, **kwargs):
    method state_dict (line 502) | def state_dict(
    method state_dict_for_save_checkpoint (line 507) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method sharded_state_dict (line 511) | def sharded_state_dict(self, prefix='', *args, **kwargs):
    method load_state_dict (line 515) | def load_state_dict(

FILE: megatron/core/transformer/moe/experts.py
  class GroupedLinearFc1Interface (line 67) | class GroupedLinearFc1Interface(Protocol):
    method forward (line 70) | def forward(
    method backward_dw (line 76) | def backward_dw(self) -> None:
  class GroupedLinearFc1Builder (line 81) | class GroupedLinearFc1Builder(Protocol):
    method __call__ (line 84) | def __call__(
  class GroupedLinearFc2Interface (line 103) | class GroupedLinearFc2Interface(Protocol):
    method forward (line 106) | def forward(
    method backward_dw (line 112) | def backward_dw(self) -> None:
  class GroupedLinearFc2Builder (line 117) | class GroupedLinearFc2Builder(Protocol):
    method __call__ (line 120) | def __call__(
  class TEGroupedMLPSubmodules (line 140) | class TEGroupedMLPSubmodules:
  class TEGroupedMLP (line 156) | class TEGroupedMLP(MegatronModule):
    method __init__ (line 163) | def __init__(
    method _apply_bias (line 251) | def _apply_bias(intermediate_parallel, bias_parallel, tokens_per_exper...
    method bias_act_func (line 270) | def bias_act_func(self, intermediate_parallel, bias_parallel, permuted...
    method forward (line 327) | def forward(
    method sharded_state_dict (line 408) | def sharded_state_dict(
    method backward_dw (line 448) | def backward_dw(self):
  class InferenceGroupedMLP (line 460) | class InferenceGroupedMLP(TEGroupedMLP):
    method __init__ (line 470) | def __init__(
    method _resolve_flashinfer_activation_type (line 497) | def _resolve_flashinfer_activation_type(self):
    method _resolve_mcore_activation_type (line 513) | def _resolve_mcore_activation_type(self):
    method set_inference_cuda_graphed_iteration (line 520) | def set_inference_cuda_graphed_iteration(self):
    method unset_inference_cuda_graphed_iteration (line 524) | def unset_inference_cuda_graphed_iteration(self):
    method _build_concatenated_mxfp8_weights (line 528) | def _build_concatenated_mxfp8_weights(self):
    method _build_concatenated_weights (line 571) | def _build_concatenated_weights(self):
    method _flashinfer_forward (line 614) | def _flashinfer_forward(self, hidden_states, routing_map, probs):
    method _mcore_fused_moe_forward (line 632) | def _mcore_fused_moe_forward(
    method forward (line 652) | def forward(
  class SequentialMLP (line 716) | class SequentialMLP(MegatronModule):
    method __init__ (line 723) | def __init__(
    method _pad_tensor_for_quantization (line 758) | def _pad_tensor_for_quantization(self, hidden, probs):
    method forward (line 772) | def forward(
    method backward_dw (line 826) | def backward_dw(self):
    method sharded_state_dict (line 831) | def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=N...

FILE: megatron/core/transformer/moe/fused_a2a.py
  function get_hidden_bytes (line 21) | def get_hidden_bytes(x: torch.Tensor) -> int:
  function get_buffer (line 33) | def get_buffer(group: torch.distributed.ProcessGroup, hidden_bytes: int):
  class FusedDispatch (line 69) | class FusedDispatch(torch.autograd.Function):
    method forward (line 73) | def forward(
    method backward (line 140) | def backward(
  class FusedCombine (line 163) | class FusedCombine(torch.autograd.Function):
    method forward (line 167) | def forward(ctx, x, group, handle, async_finish=False, allocate_on_com...
    method backward (line 191) | def backward(ctx, grad_output, previous_event=None):
  function fused_dispatch (line 212) | def fused_dispatch(
  function fused_combine (line 244) | def fused_combine(x, group, handle, async_finish=False, allocate_on_comm...
  function set_deepep_num_sms (line 258) | def set_deepep_num_sms(num_sms):
  function init_hybrid_ep_buffer (line 278) | def init_hybrid_ep_buffer(
  function reset_hybrid_ep_buffer (line 324) | def reset_hybrid_ep_buffer():
  class HybridEPDispatch (line 332) | class HybridEPDispatch(torch.autograd.Function):
    method forward (line 338) | def forward(
    method backward (line 397) | def backward(ctx, grad_x, grad_probs, grad_scaling_factor, grad_tokens...
  class HybridEPCombine (line 409) | class HybridEPCombine(torch.autograd.Function):
    method forward (line 415) | def forward(ctx, x, handle, num_permuted_tokens=None, pad_multiple=None):
    method backward (line 428) | def backward(ctx, grad_x):
  function hybrid_ep_dispatch (line 446) | def hybrid_ep_dispatch(
  function hybrid_ep_combine (line 497) | def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple):

FILE: megatron/core/transformer/moe/moe_layer.py
  class RouterInterface (line 58) | class RouterInterface(Protocol):
    method forward (line 61) | def forward(self, input: torch.Tensor, /) -> tuple[torch.Tensor, torch...
    method set_layer_number (line 69) | def set_layer_number(self, layer_number: int) -> None:
  class RouterBuilder (line 77) | class RouterBuilder(Protocol):
    method __call__ (line 80) | def __call__(
  class MoESubmodules (line 86) | class MoESubmodules:
  class BaseMoELayer (line 94) | class BaseMoELayer(MegatronModule, ABC):
    method __init__ (line 101) | def __init__(
    method forward (line 137) | def forward(self, hidden_states):
    method set_layer_number (line 141) | def set_layer_number(self, layer_number: int):
  class MoELayer (line 147) | class MoELayer(BaseMoELayer):
    method __init__ (line 155) | def __init__(
    method _setup_inference_mode (line 293) | def _setup_inference_mode(self, pg_collection):
    method set_inference_cuda_graphed_iteration (line 313) | def set_inference_cuda_graphed_iteration(self):
    method unset_inference_cuda_graphed_iteration (line 331) | def unset_inference_cuda_graphed_iteration(self):
    method route (line 347) | def route(self, hidden_states: torch.Tensor, padding_mask: Optional[to...
    method preprocess (line 357) | def preprocess(
    method dispatch (line 376) | def dispatch(self, hidden_states: torch.Tensor, probs: torch.Tensor):
    method shared_experts_compute (line 386) | def shared_experts_compute(self, hidden_states: torch.Tensor):
    method routed_experts_compute (line 414) | def routed_experts_compute(self, hidden_states: torch.Tensor, probs: t...
    method combine (line 441) | def combine(self, output: torch.Tensor):
    method postprocess (line 450) | def postprocess(self, output: torch.Tensor, shared_expert_output: Opti...
    method router_and_preprocess (line 462) | def router_and_preprocess(self, hidden_states: torch.Tensor):
    method forward (line 469) | def forward(
    method backward_dw (line 564) | def backward_dw(self, routed_experts: bool = True, shared_experts: boo...
    method set_for_recompute_pre_mlp_layernorm (line 586) | def set_for_recompute_pre_mlp_layernorm(self):

FILE: megatron/core/transformer/moe/moe_utils.py
  function switch_load_balancing_loss_func (line 55) | def switch_load_balancing_loss_func(
  function z_loss_func (line 145) | def z_loss_func(
  function sinkhorn (line 177) | def sinkhorn(cost: torch.Tensor, tol: float = 0.0001) -> torch.Tensor:
  function get_capacity (line 202) | def get_capacity(
  function get_tokens_per_expert_and_token_count (line 223) | def get_tokens_per_expert_and_token_count(
  class MoEAuxLossAutoScaler (line 245) | class MoEAuxLossAutoScaler(torch.autograd.Function):
    method forward (line 251) | def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor) -> torc...
    method backward (line 265) | def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, to...
    method set_loss_scale (line 285) | def set_loss_scale(scale: torch.Tensor) -> None:
  function permute (line 298) | def permute(
  function unpermute (line 427) | def unpermute(
  function sort_chunks_by_idxs (line 529) | def sort_chunks_by_idxs(
  function group_limited_topk (line 574) | def group_limited_topk(
  function pad_routing_map (line 632) | def pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) -> tor...
  function topk_routing_with_score_function (line 667) | def topk_routing_with_score_function(
  function compute_routing_scores_for_aux_loss (line 822) | def compute_routing_scores_for_aux_loss(
  function apply_router_token_dropping (line 873) | def apply_router_token_dropping(
  function save_to_aux_losses_tracker (line 939) | def save_to_aux_losses_tracker(
  function clear_aux_losses_tracker (line 976) | def clear_aux_losses_tracker() -> None:
  function reduce_aux_losses_tracker_across_ranks (line 983) | def reduce_aux_losses_tracker_across_ranks(
  function track_moe_metrics (line 1028) | def track_moe_metrics(
  function get_updated_expert_bias (line 1132) | def get_updated_expert_bias(
  function maybe_move_tensor_to_cpu (line 1158) | def maybe_move_tensor_to_cpu(
  function get_moe_layer_wise_logging_tracker (line 1183) | def get_moe_layer_wise_logging_tracker() -> dict:
  class RandomSTE (line 1190) | class RandomSTE(torch.autograd.Function):
    method forward (line 1199) | def forward(ctx, logits: torch.Tensor) -> torch.Tensor:
    method backward (line 1214) | def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
  function apply_random_logits (line 1227) | def apply_random_logits(logits: torch.Tensor) -> torch.Tensor:
  class RouterGatingLinearFunction (line 1240) | class RouterGatingLinearFunction(torch.autograd.Function):
    method forward (line 1246) | def forward(
    method backward (line 1286) | def backward(
  function router_gating_linear (line 1323) | def router_gating_linear(
  function get_align_size_for_quantization (line 1343) | def get_align_size_for_quantization(config: TransformerConfig) -> int:
  function get_default_pg_collection (line 1361) | def get_default_pg_collection() -> ProcessGroupCollection:
  class MoECudaGraphPartialCaptureSignal (line 1381) | class MoECudaGraphPartialCaptureSignal(Exception):
    method __init__ (line 1389) | def __init__(self, moe_layer, return_step: str, **kwargs):
    method get_early_return_outputs (line 1394) | def get_early_return_outputs(
  class MoECudaGraphTensorStore (line 1447) | class MoECudaGraphTensorStore:
    method is_empty (line 1467) | def is_empty(self) -> bool:
    method set (line 1478) | def set(self, **kwargs):
    method clear (line 1493) | def clear(self):
  function maybe_skip_or_early_return_by_cudagraph (line 1499) | def maybe_skip_or_early_return_by_cudagraph(step_condition):

FILE: megatron/core/transformer/moe/router.py
  class Router (line 28) | class Router(ABC, MegatronModule):
    method __init__ (line 31) | def __init__(
    method reset_parameters (line 72) | def reset_parameters(self):
    method gating (line 84) | def gating(self, input: torch.Tensor):
    method routing (line 109) | def routing(self, logits: torch.Tensor):
    method forward (line 122) | def forward(self, input: torch.Tensor):
    method set_layer_number (line 131) | def set_layer_number(self, layer_number: int):
  class TopKRouter (line 136) | class TopKRouter(Router):
    method __init__ (line 152) | def __init__(
    method _maintain_float32_expert_bias (line 218) | def _maintain_float32_expert_bias(self):
    method sinkhorn_load_balancing (line 229) | def sinkhorn_load_balancing(self, logits: torch.Tensor):
    method get_aux_loss_coeff (line 262) | def get_aux_loss_coeff(self, aux_loss_type: str) -> float:
    method is_aux_loss_enabled (line 277) | def is_aux_loss_enabled(self) -> bool:
    method _apply_aux_loss (line 284) | def _apply_aux_loss(
    method _apply_seq_aux_loss (line 324) | def _apply_seq_aux_loss(
    method _apply_global_aux_loss (line 379) | def _apply_global_aux_loss(
    method attach_and_log_load_balancing_loss (line 425) | def attach_and_log_load_balancing_loss(
    method apply_z_loss (line 496) | def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = ...
    method apply_input_jitter (line 551) | def apply_input_jitter(self, input: torch.Tensor):
    method _apply_expert_bias (line 573) | def _apply_expert_bias(
    method routing (line 586) | def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.T...
    method reset_global_aux_loss_tracker (line 674) | def reset_global_aux_loss_tracker(self):
    method forward (line 680) | def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Te...
    method _load_from_state_dict (line 704) | def _load_from_state_dict(self, *args, **kwargs):
    method _save_to_state_dict (line 709) | def _save_to_state_dict(self, *args, **kwargs):
  class InferenceTopKRouter (line 715) | class InferenceTopKRouter(TopKRouter):
    method __init__ (line 727) | def __init__(
    method set_inference_cuda_graphed_iteration (line 753) | def set_inference_cuda_graphed_iteration(self):
    method unset_inference_cuda_graphed_iteration (line 757) | def unset_inference_cuda_graphed_iteration(self):
    method _compiled_topk_routing (line 763) | def _compiled_topk_routing(
    method _forward (line 790) | def _forward(self, input: torch.Tensor, padding_mask: Optional[torch.T...
    method forward (line 808) | def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Te...

FILE: megatron/core/transformer/moe/router_replay.py
  class RouterReplayAction (line 8) | class RouterReplayAction(Enum):
  class RouterReplay (line 18) | class RouterReplay:
    method set_replay_data (line 29) | def set_replay_data(all_layers_topk_indices: List[torch.Tensor]):
    method get_recorded_data (line 45) | def get_recorded_data() -> List[torch.Tensor]:
    method clear_global_indices (line 55) | def clear_global_indices():
    method set_global_router_replay_action (line 61) | def set_global_router_replay_action(router_replay_action: RouterReplay...
    method clear_global_router_replay_action (line 67) | def clear_global_router_replay_action():
    method clear_global_router_replay_instances (line 73) | def clear_global_router_replay_instances():
    method set_global_static_buffers (line 78) | def set_global_static_buffers(static_buffer: torch.Tensor):
    method clear_global_static_buffers (line 95) | def clear_global_static_buffers():
    method __init__ (line 100) | def __init__(self):
    method set_target_indices (line 113) | def set_target_indices(self, topk_indices: torch.Tensor):
    method get_recorded_indices (line 118) | def get_recorded_indices(self) -> Optional[torch.Tensor]:
    method clear_indices (line 122) | def clear_indices(self):
    method set_router_replay_action (line 128) | def set_router_replay_action(self, router_replay_action: RouterReplayA...
    method clear_router_replay_action (line 132) | def clear_router_replay_action(self):
    method get_replay_topk (line 136) | def get_replay_topk(
    method set_static_buffer (line 183) | def set_static_buffer(self, buffer: torch.Tensor):
    method clear_static_buffer (line 191) | def clear_static_buffer(self):
    method record_indices (line 195) | def record_indices(self, topk_indices: torch.Tensor):

FILE: megatron/core/transformer/moe/shared_experts.py
  class SharedExpertMLP (line 37) | class SharedExpertMLP(MLP):
    method __init__ (line 46) | def __init__(
    method forward (line 121) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    method sharded_state_dict (line 130) | def sharded_state_dict(
    method pre_forward_comm (line 150) | def pre_forward_comm(self, input):
    method linear_fc1_forward_and_act (line 171) | def linear_fc1_forward_and_act(self, overlapped_comm_output=None):
    method linear_fc2_forward (line 224) | def linear_fc2_forward(self, overlapped_comm_output=None):
    method post_forward_comm (line 239) | def post_forward_comm(self):
    method get_output (line 259) | def get_output(self):
  function set_tensor_grad_fn_sequence_sr (line 279) | def set_tensor_grad_fn_sequence_sr(tensor, value):

FILE: megatron/core/transformer/moe/token_dispatcher.py
  class MoETokenDispatcher (line 53) | class MoETokenDispatcher:
    method __init__ (line 58) | def __init__(
    method dispatch_preprocess (line 86) | def dispatch_preprocess(
    method token_dispatch (line 110) | def token_dispatch(self, hidden_states: torch.Tensor, probs: torch.Ten...
    method dispatch_postprocess (line 126) | def dispatch_postprocess(self, hidden_states: torch.Tensor, probs: tor...
    method combine_preprocess (line 148) | def combine_preprocess(self, hidden_states):
    method token_combine (line 168) | def token_combine(self, hidden_states):
    method combine_postprocess (line 183) | def combine_postprocess(self, hidden_states):
    method set_shared_experts (line 202) | def set_shared_experts(self, shared_experts):
  class MoEAllGatherTokenDispatcher (line 208) | class MoEAllGatherTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 214) | def __init__(
    method dispatch_preprocess (line 246) | def dispatch_preprocess(
    method token_dispatch (line 256) | def token_dispatch(self, hidden_states, probs):
    method dispatch_postprocess (line 280) | def dispatch_postprocess(self, hidden_states, probs):
    method combine_preprocess (line 312) | def combine_preprocess(self, hidden_states):
    method token_combine (line 330) | def token_combine(self, hidden_states):
    method combine_postprocess (line 345) | def combine_postprocess(self, hidden_states):
  class MoEAlltoAllTokenDispatcher (line 350) | class MoEAlltoAllTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 367) | def __init__(
    method set_shared_experts (line 464) | def set_shared_experts(self, shared_experts):
    method preprocess (line 471) | def preprocess(self, routing_map: torch.Tensor) -> torch.Tensor:
    method dispatch_preprocess (line 594) | def dispatch_preprocess(
    method token_dispatch (line 651) | def token_dispatch(self, permutated_local_input_tokens, permuted_probs):
    method dispatch_postprocess (line 680) | def dispatch_postprocess(self, global_input_tokens, global_probs):
    method combine_preprocess (line 751) | def combine_preprocess(self, hidden_states):
    method token_combine (line 792) | def token_combine(
    method combine_postprocess (line 820) | def combine_postprocess(self, permutated_local_input_tokens):
    method _maybe_update_cuda_sync_point (line 856) | def _maybe_update_cuda_sync_point(self, point: str):
    method _maybe_dtoh_and_synchronize (line 867) | def _maybe_dtoh_and_synchronize(
  class _DispatchManager (line 909) | class _DispatchManager(ABC):
    method setup_metadata (line 922) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor):
    method dispatch (line 927) | def dispatch(self, hidden_states: torch.Tensor) -> torch.Tensor:
    method combine (line 932) | def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
    method get_permuted_hidden_states_by_experts (line 937) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T...
    method get_restored_hidden_states_by_experts (line 942) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T...
  class _HybridEPManager (line 947) | class _HybridEPManager(_DispatchManager):
    method __init__ (line 962) | def __init__(
    method setup_metadata (line 1007) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor):
    method dispatch (line 1027) | def dispatch(
    method combine (line 1063) | def combine(
    method get_permuted_hidden_states_by_experts (line 1083) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T...
    method get_restored_hidden_states_by_experts (line 1086) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T...
    method get_number_of_tokens_per_expert (line 1089) | def get_number_of_tokens_per_expert(self) -> torch.Tensor:
  class _DeepepManager (line 1096) | class _DeepepManager(_DispatchManager):
    method __init__ (line 1118) | def __init__(
    method setup_metadata (line 1160) | def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor):
    method dispatch (line 1172) | def dispatch(
    method _indices_to_multihot (line 1203) | def _indices_to_multihot(self, indices, probs):
    method get_number_of_tokens_per_expert (line 1233) | def get_number_of_tokens_per_expert(self) -> torch.Tensor:
    method combine (line 1239) | def combine(
    method _pad_routing_map (line 1256) | def _pad_routing_map(
    method get_permuted_hidden_states_by_experts (line 1287) | def get_permuted_hidden_states_by_experts(self, hidden_states: torch.T...
    method get_restored_hidden_states_by_experts (line 1322) | def get_restored_hidden_states_by_experts(self, hidden_states: torch.T...
  class MoEFlexTokenDispatcher (line 1334) | class MoEFlexTokenDispatcher(MoETokenDispatcher):
    method __init__ (line 1340) | def __init__(
    method set_shared_experts (line 1385) | def set_shared_experts(self, shared_experts):
    method _initialize_metadata (line 1390) | def _initialize_metadata(self, routing_map: torch.Tensor, probs: torch...
    method dispatch_preprocess (line 1418) | def dispatch_preprocess(
    method token_dispatch (line 1444) | def token_dispatch(
    method dispatch_postprocess (line 1473) | def dispatch_postprocess(self, hidden_states: torch.Tensor, probs: tor...
    method combine_preprocess (line 1492) | def combine_preprocess(self, hidden_states: torch.Tensor):
    method token_combine (line 1501) | def token_combine(
    method combine_postprocess (line 1521) | def combine_postprocess(self, hidden_states: torch.Tensor):

FILE: megatron/core/transformer/moe/token_dispatcher_inference.py
  class InferenceCUDAGraphTokenDispatcher (line 33) | class InferenceCUDAGraphTokenDispatcher(MoEAllGatherTokenDispatcher):
    method __init__ (line 47) | def __init__(
    method _maybe_allocate_ag_buffers (line 73) | def _maybe_allocate_ag_buffers(
    method _maybe_allocate_rs_buffer (line 141) | def _maybe_allocate_rs_buffer(self, x: torch.Tensor) -> dict:
    method token_dispatch (line 160) | def token_dispatch(self, hidden_states, probs):
    method dispatch_postprocess (line 246) | def dispatch_postprocess(self, hidden_states, probs):
    method combine_preprocess (line 266) | def combine_preprocess(self, expert_output):
    method token_combine (line 281) | def token_combine(self, hidden_states):

FILE: megatron/core/transformer/moe/upcycling_utils.py
  function _get_keys_endswith (line 15) | def _get_keys_endswith(model, suffix):
  function _find_submodule (line 22) | def _find_submodule(model, submodule_name):
  function _get_config (line 32) | def _get_config(moe_model, dense_model):
  function _convert_to_moe_state_dict (line 108) | def _convert_to_moe_state_dict(moe_model, dense_model):
  function upcycle_state_dict (line 301) | def upcycle_state_dict(moe_model, dense_model):
  function load_and_upcycle_model (line 329) | def load_and_upcycle_model(

FILE: megatron/core/transformer/multi_latent_attention.py
  class MLASelfAttentionSubmodules (line 81) | class MLASelfAttentionSubmodules:
  class MultiLatentAttention (line 98) | class MultiLatentAttention(Attention):
    method __init__ (line 105) | def __init__(
    method forward (line 216) | def forward(
  class MLASelfAttention (line 377) | class MLASelfAttention(MultiLatentAttention):
    method __init__ (line 384) | def __init__(
    method _qkv_down_projection (line 525) | def _qkv_down_projection(self, hidden_states):
    method get_query_key_value_tensors (line 553) | def get_query_key_value_tensors(
    method uncompress_kv_from_cache (line 877) | def uncompress_kv_from_cache(self, kv_cached):
    method prepare_for_absorption (line 903) | def prepare_for_absorption(self):
    method backward_dw (line 964) | def backward_dw(self) -> NoReturn:
    method _backward_kv_proj (line 970) | def _backward_kv_proj(self):
    method _backward_q_proj (line 975) | def _backward_q_proj(self):
    method _backward_output_proj (line 983) | def _backward_output_proj(self):
    method set_for_recompute_input_layernorm (line 987) | def set_for_recompute_input_layernorm(self):
    method clip_qk (line 993) | def clip_qk(self):
    method _clip_q_proj_weight (line 1060) | def _clip_q_proj_weight(self, weight):
    method _clip_kv_proj_weight (line 1087) | def _clip_kv_proj_weight(self, weight):
  class FusedMLASelfAttention (line 1114) | class FusedMLASelfAttention(MLASelfAttention):
    method __init__ (line 1117) | def __init__(
    method _qkv_down_projection (line 1216) | def _qkv_down_projection(self, hidden_states):
    method sharded_state_dict (line 1226) | def sharded_state_dict(self, prefix: str = "", sharded_offsets: tuple ...
    method _load_from_state_dict (line 1307) | def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):

FILE: megatron/core/transformer/multi_token_prediction.py
  function tie_word_embeddings_state_dict (line 60) | def tie_word_embeddings_state_dict(
  function tie_output_layer_state_dict (line 95) | def tie_output_layer_state_dict(
  function roll_tensor (line 130) | def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None, packed_seq_pa...
  function _roll_tensor_packed_seq (line 228) | def _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_...
  class MTPLossLoggingHelper (line 332) | class MTPLossLoggingHelper:
    method save_loss_to_tracker (line 338) | def save_loss_to_tracker(
    method clean_loss_in_tracker (line 364) | def clean_loss_in_tracker():
    method reduce_loss_in_tracker (line 371) | def reduce_loss_in_tracker():
    method track_mtp_metrics (line 385) | def track_mtp_metrics(loss_scale, iteration, writer, wandb_writer=None...
  class MultiTokenPredictionLayerSubmodules (line 410) | class MultiTokenPredictionLayerSubmodules:
  function get_mtp_layer_spec (line 433) | def get_mtp_layer_spec(
  function get_mtp_layer_spec_for_backend (line 447) | def get_mtp_layer_spec_for_backend(
  function mtp_on_this_rank (line 470) | def mtp_on_this_rank(
  function get_mtp_ranks (line 514) | def get_mtp_ranks(pp_ranks: List[int], config: TransformerConfig) -> Lis...
  function get_mtp_layer_offset (line 530) | def get_mtp_layer_offset(config: TransformerConfig, vp_stage: Optional[i...
  function get_mtp_num_layers_to_build (line 544) | def get_mtp_num_layers_to_build(
  class MTPLossAutoScaler (line 566) | class MTPLossAutoScaler(torch.autograd.Function):
    method forward (line 572) | def forward(ctx, output: torch.Tensor, mtp_loss: torch.Tensor):
    method backward (line 586) | def backward(ctx, grad_output: torch.Tensor):
    method set_loss_scale (line 602) | def set_loss_scale(scale: torch.Tensor):
  function process_mtp_loss (line 612) | def process_mtp_loss(
  class MultiTokenPredictionLayer (line 712) | class MultiTokenPredictionLayer(MegatronModule):
    method __init__ (line 732) | def __init__(
    method _get_embeddings (line 850) | def _get_embeddings(
    method _concat_embeddings (line 895) | def _concat_embeddings(self, hidden_states: torch.Tensor, decoder_inpu...
    method _proj_and_transformer_layer (line 918) | def _proj_and_transformer_layer(
    method _postprocess (line 987) | def _postprocess(self, hidden_states: torch.Tensor):
    method forward_single_position (line 1001) | def forward_single_position(
    method _checkpointed_forward (line 1048) | def _checkpointed_forward(self, forward_func, *args, **kwargs):
    method forward (line 1086) | def forward(
    method sharded_state_dict (line 1168) | def sharded_state_dict(
  class MultiTokenPredictionBlockSubmodules (line 1198) | class MultiTokenPredictionBlockSubmodules:
  function _get_mtp_block_submodules (line 1215) | def _get_mtp_block_submodules(
  class MultiTokenPredictionBlock (line 1243) | class MultiTokenPredictionBlock(MegatronModule):
    method __init__ (line 1266) | def __init__(
    method _build_layers (line 1309) | def _build_layers(self, pg_collection):
    method forward (line 1392) | def forward(
    method sharded_state_dict (line 1451) | def sharded_state_dict(

FILE: megatron/core/transformer/pipeline_parallel_layer_layout.py
  class PipelineParallelLayerLayout (line 15) | class PipelineParallelLayerLayout:
    method __repr__ (line 18) | def __repr__(self) -> str:
    method __init__ (line 24) | def __init__(self, layout: str | list, pipeline_model_parallel_size: i...
    method validate_layer_layout (line 85) | def validate_layer_layout(self, num_layers: int, mtp_num_layers: int):
    method get_num_layers_to_build (line 152) | def get_num_layers_to_build(
    method get_layer_offset (line 170) | def get_layer_offset(
    method get_layer_id_list (line 194) | def get_layer_id_list(
    method pretty_repr (line 207) | def pretty_repr(self):
    method from_str (line 263) | def from_str(layout, pipeline_model_parallel_size):
    method get_num_stages_from_str (line 277) | def get_num_stages_from_str(layout: str):
    method parse_str_to_list (line 283) | def parse_str_to_list(layout_str: str):

FILE: megatron/core/transformer/spec_utils.py
  class ModuleSpec (line 12) | class ModuleSpec:
    method __call__ (line 33) | def __call__(self, *args: Any, **kwargs: Any) -> Any:
  function import_module (line 43) | def import_module(module_path: Tuple[str]):
  function get_module (line 59) | def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwa...
  function build_module (line 73) | def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs):

FILE: megatron/core/transformer/torch_norm.py
  class LayerNormInterface (line 11) | class LayerNormInterface(Protocol):
    method forward (line 14) | def forward(self, x: torch.Tensor, /) -> torch.Tensor:
  class LayerNormBuilder (line 19) | class LayerNormBuilder(Protocol):
    method __call__ (line 22) | def __call__(
  class WrappedTorchNorm (line 27) | class WrappedTorchNorm:
    method __new__ (line 33) | def __new__(
  class L2Norm (line 72) | class L2Norm(torch.nn.Module, LayerNormInterface):
    method __init__ (line 85) | def __init__(self, hidden_size: int, eps: float = 1e-6, **kwargs):
    method _norm (line 91) | def _norm(self, x: torch.Tensor) -> torch.Tensor:
    method forward (line 104) | def forward(self, x: torch.Tensor) -> torch.Tensor:

FILE: megatron/core/transformer/transformer_block.py
  function get_num_layers_to_build (line 71) | def get_num_layers_to_build(
  class TransformerBlockSubmodules (line 202) | class TransformerBlockSubmodules:
  function _get_block_submodules (line 221) | def _get_block_submodules(
  class TransformerBlock (line 262) | class TransformerBlock(GraphableMegatronModule, MegatronModule):
    method __init__ (line 265) | def __init__(
    method _build_layers (line 325) | def _build_layers(self):
    method has_final_layernorm_in_this_stage (line 386) | def has_final_layernorm_in_this_stage(self):
    method _setup_fused_tp_communication (line 415) | def _setup_fused_tp_communication(self):
    method _get_layer (line 439) | def _get_layer(self, layer_number: int):
    method _checkpointed_forward (line 442) | def _checkpointed_forward(
    method set_input_tensor (line 604) | def set_input_tensor(self, input_tensor: Tensor):
    method _should_call_local_cudagraph (line 614) | def _should_call_local_cudagraph(self, *args, **kwargs):
    method __call__ (line 637) | def __call__(self, *args, **kwargs):
    method forward (line 647) | def forward(
    method sharded_state_dict (line 871) | def sharded_state_dict(

FILE: megatron/core/transformer/transformer_config.py
  class TransformerConfig (line 40) | class TransformerConfig(ModelParallelConfig):
    method __post_init__ (line 1007) | def __post_init__(self):
  class MLATransformerConfig (line 2229) | class MLATransformerConfig(TransformerConfig):
    method __post_init__ (line 2294) | def __post_init__(self):

FILE: megatron/core/transformer/transformer_layer.py
  function get_transformer_layer_offset (line 45) | def get_transformer_layer_offset(
  class TransformerLayerSubmodules (line 203) | class TransformerLayerSubmodules:
  class BaseTransformerLayer (line 246) | class BaseTransformerLayer(ABC):
    method __init__ (line 258) | def __init__(self):
  class TransformerLayer (line 262) | class TransformerLayer(GraphableMegatronModule, BaseTransformerLayer):
    method __init__ (line 269) | def __init__(
    method create_mcore_cudagraph_manager (line 493) | def create_mcore_cudagraph_manager(self, config):
    method _get_layer_offset (line 515) | def _get_layer_offset(config: TransformerConfig):
    method _forward_attention (line 528) | def _forward_attention(
    method forward (line 696) | def forward(self, *args, **kwargs):
    method _forward_pre_mlp_layernorm (line 711) | def _forward_pre_mlp_layernorm(self, hidden_states: Tensor):
    method _forward_mlp (line 728) | def _forward_mlp(
    method _forward_post_mlp (line 840) | def _forward_post_mlp(
    method sharded_state_dict (line 901) | def sharded_state_dict(
    method configure_fused_tp_inference (line 924) | def configure_fused_tp_inference(
    method _set_proj_next_layer_norm_weights (line 947) | def _set_proj_next_layer_norm_weights(self, weights: Tensor):
    method _set_fc2_next_layer_norm_weights (line 951) | def _set_fc2_next_layer_norm_weights(self, weights: Optional[Tensor]):
    method _set_proj_residual (line 958) | def _set_proj_residual(self, residual: Tensor):
    method _set_fc2_residual (line 962) | def _set_fc2_residual(self, residual: Tensor):
    method get_mlp_layer_norm_weights (line 966) | def get_mlp_layer_norm_weights(self) -> Tensor:
    method get_qkv_layer_norm_weights (line 975) | def get_qkv_layer_norm_weights(self) -> Tensor:
    method get_layer_static_inputs (line 984) | def get_layer_static_inputs(self, seq_length, micro_batch_size):
    method _get_submodules_under_cudagraphs (line 1006) | def _get_submodules_under_cudagraphs(self):
    method _te_cuda_graph_capture (line 1034) | def _te_cuda_graph_capture(self, *args, **kwargs):
    method _te_cuda_graph_replay (line 1071) | def _te_cuda_graph_replay(self, *args, **kwargs):
    method _get_te_cuda_graph_replay_args (line 1196) | def _get_te_cuda_graph_replay_args(self, *args, **kwargs):
    method _should_call_local_cudagraph (line 1251) | def _should_call_local_cudagraph(self, *args, **kwargs):
    method get_layer_norm_weights (line 1283) | def get_layer_norm_weights(self):
  class MoETransformerLayer (line 1292) | class MoETransformerLayer(TransformerLayer):
    method __init__ (line 1302) | def __init__(self, *args, **kwargs):
    method _should_call_local_cudagraph (line 1310) | def _should_call_local_cudagraph(self, *args, **kwargs):
    method transition_cudagraph_scope (line 1330) | def transition_cudagraph_scope(self, mode):
    method create_mcore_cudagraph_manager (line 1366) | def create_mcore_cudagraph_manager(self, config):
    method _forward_mlp_router (line 1386) | def _forward_mlp_router(self, hidden_states, padding_mask=None):
    method _forward_mlp_expert_compute (line 1427) | def _forward_mlp_expert_compute(self, hidden_states, probs):
    method _forward_mlp_postprocess (line 1446) | def _forward_mlp_postprocess(self, residual, output, shared_expert_out...
    method _forward_mlp (line 1466) | def _forward_mlp(self, hidden_states, inference_context=None, padding_...

FILE: megatron/core/transformer/utils.py
  function get_linear_layer (line 23) | def get_linear_layer(rows, columns, init_method, perform_initialization=...
  function get_default_causal_mask (line 33) | def get_default_causal_mask(sq: int) -> torch.Tensor:
  function get_sliding_window_causal_mask (line 38) | def get_sliding_window_causal_mask(sq, skv, window_size):
  function attention_mask_func (line 49) | def attention_mask_func(attention_scores, attention_mask):
  function gelu_impl (line 55) | def gelu_impl(x):
  function openai_gelu (line 61) | def openai_gelu(x):
  function erf_gelu (line 69) | def erf_gelu(x):
  function make_sharded_tensors_for_checkpoint (line 75) | def make_sharded_tensors_for_checkpoint(
  function make_sharded_object_for_checkpoint (line 151) | def make_sharded_object_for_checkpoint(
  function _get_extra_state_offsets (line 178) | def _get_extra_state_offsets(
  function ensure_metadata_has_dp_cp_group (line 194) | def ensure_metadata_has_dp_cp_group(metadata: Optional[dict]) -> dict:
  function sharded_state_dict_default (line 209) | def sharded_state_dict_default(
  function _init_sequence_parallel_cache (line 261) | def _init_sequence_parallel_cache(model, exclude_modules):
  function set_model_to_sequence_parallel (line 310) | def set_model_to_sequence_parallel(model, set_to=False, exclude_modules=...
  function init_cuda_graph_cache (line 337) | def init_cuda_graph_cache(model):
  function toggle_cuda_graphs (line 394) | def toggle_cuda_graphs(model, set_to="none"):
  function transition_moe_cudagraphs (line 437) | def transition_moe_cudagraphs(model, scope: str):
  function is_layer_window_attention (line 453) | def is_layer_window_attention(

FILE: megatron/core/typed_torch.py
  class _Module (line 16) | class _Module(Generic[P, R_co], Protocol):
    method forward (line 19) | def forward(self, *args: P.args, **kwargs: P.kwargs) -> R_co:
  function apply_module (line 24) | def apply_module(m: _Module[P, R_co], *, check_subclass: bool = True) ->...
  function not_none (line 40) | def not_none(value: T | None) -> T:
  function copy_signature (line 62) | def copy_signature(
  function copy_signature (line 73) | def copy_signature(
  function copy_signature (line 84) | def copy_signature(
  function copy_signature (line 95) | def copy_signature(
  function copy_signature (line 106) | def copy_signature(
  function copy_signature (line 119) | def copy_signature(
  function copy_signature (line 132) | def copy_signature(
  function copy_signature (line 145) | def copy_signature(
  function copy_signature (line 157) | def copy_signature(

FILE: megatron/core/utils.py
  function null_decorator (line 87) | def null_decorator(*args, **kwargs):
  class ExperimentalNotEnabledError (line 101) | class ExperimentalNotEnabledError(Exception):
  function experimental_fn (line 105) | def experimental_fn(introduced_with_version: str):
  function experimental_cls (line 171) | def experimental_cls(introduced_with_version: str):
  function get_te_version (line 299) | def get_te_version():
  function is_te_min_version (line 330) | def is_te_min_version(version, check_equality=True):
  function get_torch_version (line 342) | def get_torch_version():
  function is_torch_min_version (line 349) | def is_torch_min_version(version, check_equality=True):
  function get_fa_version (line 360) | def get_fa_version():
  function is_fa_min_version (line 381) | def is_fa_min_version(version, check_equality=True):
  function get_mamba_version (line 392) | def get_mamba_version():
  function is_mamba_min_version (line 413) | def is_mamba_min_version(version, check_equality=True):
  function get_causal_conv1d_version (line 424) | def get_causal_conv1d_version():
  function is_causal_conv1d_min_version (line 445) | def is_causal_conv1d_min_version(version, check_equality=True):
  function get_flashinfer_version (line 456) | def get_flashinfer_version():
  function is_flashinfer_min_version (line 481) | def is_flashinfer_min_version(version, check_equality=True):
  function ensure_divisibility (line 494) | def ensure_divisibility(numerator, denominator):
  function divide (line 499) | def divide(numerator, denominator):
  function get_tensor_model_parallel_group_if_none (line 506) | def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, c...
  function get_pg_size (line 535) | def get_pg_size(group=None):
  function get_pg_rank (line 549) | def get_pg_rank(group=None):
  function get_pg_src_rank (line 563) | def get_pg_src_rank(group=None):
  function get_attr_wrapped_model (line 580) | def get_attr_wrapped_model(model, attr, allow_none=True, return_model_ob...
  function get_model_type (line 608) | def get_model_type(model):
  function get_model_xattn (line 613) | def get_model_xattn(model):
  function get_model_config (line 621) | def get_model_config(model):
  class GlobalMemoryBuffer (line 626) | class GlobalMemoryBuffer:
    method __init__ (line 631) | def __init__(self):
    method get_tensor (line 634) | def get_tensor(self, tensor_shape, dtype, name, mem_alloc_context: Opt...
  function _kernel_make_viewless_tensor (line 655) | def _kernel_make_viewless_tensor(inp, requires_grad):
  class WrappedTensor (line 669) | class WrappedTensor:
    method __init__ (line 676) | def __init__(self, tensor: torch.Tensor):
    method unwrap (line 679) | def unwrap(self):
  class MakeViewlessTensor (line 689) | class MakeViewlessTensor(torch.autograd.Function):
    method forward (line 700) | def forward(ctx, inp, requires_grad):
    method backward (line 705) | def backward(ctx, grad_output):
  function make_viewless_tensor (line 710) | def make_viewless_tensor(inp, requires_grad, keep_graph):
  function assert_viewless_tensor (line 731) | def assert_viewless_tensor(tensor, extra_msg=None):
  function safely_set_viewless_tensor_data (line 747) | def safely_set_viewless_tensor_data(tensor, new_data_tensor):
  function init_method_normal (line 761) | def init_method_normal(sigma):
  function scaled_init_method_normal (line 766) | def scaled_init_method_normal(sigma, num_layers, multiplier=2.0):
  function mup_scaled_init_method_normal (line 773) | def mup_scaled_init_method_normal(sigma, num_layers, width_mult, multipl...
  function log_on_each_pipeline_stage (line 793) | def log_on_each_pipeline_stage(
  function check_param_hashes_across_dp_replicas (line 824) | def check_param_hashes_across_dp_replicas(
  function make_tp_sharded_tensor_for_checkpoint (line 903) | def make_tp_sharded_tensor_for_checkpoint(
  function make_sharded_tensor_for_checkpoint (line 972) | def make_sharded_tensor_for_checkpoint(tensor, key, prepend_offsets=(), ...
  function get_full_tensor_if_necessary (line 1025) | def get_full_tensor_if_necessary(tensor):
  function to_local_if_dtensor (line 1041) | def to_local_if_dtensor(tensor: Union[torch.Tensor, "DTensor"]) -> torch...
  function get_data_parallel_group_if_dtensor (line 1047) | def get_data_parallel_group_if_dtensor(
  function prepare_input_tensors_for_wgrad_compute (line 1058) | def prepare_input_tensors_for_wgrad_compute(grad_output, all_gathered_in...
  function drain_embedding_wgrad_compute (line 1087) | def drain_embedding_wgrad_compute(
  function local_multi_tensor_applier (line 1171) | def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args):
  function local_multi_tensor_l2_norm (line 1178) | def local_multi_tensor_l2_norm(chunk_size, noop_flag, tensor_lists, per_...
  function local_multi_tensor_scale (line 1190) | def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale):
  class _ValueWithRank (line 1196) | class _ValueWithRank:
    method __init__ (line 1205) | def __init__(self, value: float, rank: int, unit: str = "") -> None:
    method __lt__ (line 1217) | def __lt__(self, other) -> bool:
    method __gt__ (line 1228) | def __gt__(self, other) -> bool:
    method __call__ (line 1239) | def __call__(self) -> Tuple[float, int, str]:
    method __str__ (line 1247) | def __str__(self) -> str:
  class _StragglerData (line 1258) | class _StragglerData:
  class StragglerDetector (line 1298) | class StragglerDetector:
    method __new__ (line 1341) | def __new__(cls: Type["StragglerDetector"]) -> "StragglerDetector":
    method __init__ (line 1356) | def __init__(self) -> None:
    method configure (line 1386) | def configure(
    method reset (line 1461) | def reset(self) -> None:
    method start_method (line 1485) | def start_method(self) -> None:
    method stop_method (line 1519) | def stop_method(self) -> None:
    method elapsed (line 1538) | def elapsed(self) -> Tuple[float, float, int, int, int, int]:
    method report (line 1591) | def report(self, total_flops: float = 0.0, log_interval: int = 0) -> b...
    method _check_toggle (line 1658) | def _check_toggle(self) -> None:
    method _handler (line 1688) | def _handler(self) -> None:
    method _controller (line 1719) | def _controller(self):
    method _min_max (line 1739) | def _min_max(
    method enabled (line 1860) | def enabled(self) -> bool:
    method configured (line 1871) | def configured(self) -> bool:
    method my_rank (line 1880) | def my_rank(self):
    method world_size (line 1889) | def world_size(self) -> int:
    method null_method (line 1897) | def null_method(self) -> None:
    method __enter__ (line 1901) | def __enter__(self) -> "StragglerDetector":
    method __call__ (line 1910) | def __call__(self, bdata: bool = False) -> "StragglerDetector":
    method __exit__ (line 1924) | def __exit__(
  function is_submodule (line 1954) | def is_submodule(module, parent_module, strict=True):
  function get_batch_on_this_cp_rank (line 1972) | def get_batch_on_this_cp_rank(
  function get_thd_batch_on_this_cp_rank (line 2019) | def get_thd_batch_on_this_cp_rank(
  function get_batch_on_this_hybrid_cp_rank (line 2064) | def get_batch_on_this_hybrid_cp_rank(
  function configure_nvtx_profiling (line 2121) | def configure_nvtx_profiling(enabled: bool) -> None:
  function _nvtx_range_get_func_path (line 2131) | def _nvtx_range_get_func_path():
  function nvtx_range_push (line 2145) | def nvtx_range_push(msg=None, suffix=None) -> None:
  function nvtx_range_pop (line 2167) | def nvtx_range_pop(msg=None, suffix=None) -> None:
  function _nvtx_decorator_get_func_path (line 2197) | def _nvtx_decorator_get_func_path(func):
  function nvtx_decorator (line 2212) | def nvtx_decorator(message: Optional[str] = None, color: Optional[str] =...
  function unwrap_model (line 2242) | def unwrap_model(model, module_instances=None):
  function get_asyncio_loop (line 2271) | def get_asyncio_loop(loop: asyncio.AbstractEventLoop | None = None) -> a...
  function is_using_quantization_scales (line 2286) | def is_using_quantization_scales(config):
  function trace_async_exceptions (line 2294) | def trace_async_exceptions(func: Optional[Callable] = None, *, verbose: ...
  function deprecated (line 2370) | def deprecated(
  function internal_api (line 2435) | def internal_api(func: Callable) -> Callable:
  function experimental_api (line 2468) | def experimental_api(func: Callable) -> Callable:
  function deprecate_args (line 2501) | def deprecate_args(
  function deprecate_inference_params (line 2530) | def deprecate_inference_params(inference_context, inference_params):

FILE: megatron/inference/utils.py
  function get_model_for_inference (line 39) | def get_model_for_inference() -> MegatronModule:
  function add_inference_args (line 86) | def add_inference_args(parser: ArgumentParser) -> ArgumentParser:
  function get_inference_config_from_model_and_args (line 279) | def get_inference_config_from_model_and_args(model: MegatronModule, args):
  function get_dynamic_inference_engine (line 369) | def get_dynamic_inference_engine(model: Optional[MegatronModule] = None)...

FILE: megatron/legacy/fp16_deprecated/loss_scaler.py
  class LossScaler (line 5) | class LossScaler:
    method __init__ (line 6) | def __init__(self, scale=1):
  class DynamicLossScaler (line 9) | class DynamicLossScaler:
    method __init__ (line 10) | def __init__(self,

FILE: megatron/legacy/fused_kernels/__init__.py
  function load (line 17) | def load(args):
  function _get_cuda_bare_metal_version (line 57) | def _get_cuda_bare_metal_version(cuda_dir):
  function _create_build_dir (line 70) | def _create_build_dir(buildpath):

FILE: megatron/legacy/fused_kernels/tests/test_fused_kernels.py
  function test_load_fused_kernels (line 13) | def test_load_fused_kernels():
  function test_fused_softmax (line 25) | def test_fused_softmax():
  function test_fused_upper_triangle_mask_softmax (line 123) | def test_fused_upper_triangle_mask_softmax():
  function test_layer_norm (line 223) | def test_layer_norm():
  function attention_mask_func (line 282) | def attention_mask_func(attention_scores, attention_mask):
  function forward_torch_softmax (line 287) | def forward_torch_softmax(input, mask, scale):
  function test_masked_softmax_forward (line 294) | def test_masked_softmax_forward():
  function test_masked_softmax_backward (line 309) | def test_masked_softmax_backward():
  function test_allmasked_softmax_forward (line 330) | def test_allmasked_softmax_forward():
  function test_allmasked_softmax_backward (line 346) | def test_allmasked_softmax_backward():

FILE: megatron/legacy/model/bert_model.py
  function bert_extended_attention_mask (line 20) | def bert_extended_attention_mask(attention_mask):
  function bert_position_ids (line 36) | def bert_position_ids(token_ids):
  class BertLMHead (line 46) | class BertLMHead(MegatronModule):
    method __init__ (line 55) | def __init__(self, mpu_vocab_size, config, parallel_output):
    method forward (line 74) | def forward(self, hidden_states, word_embeddings_weight):
    method load_state_dict (line 84) | def load_state_dict(self, state_dict, strict=True):
  function post_language_model_processing (line 96) | def post_language_model_processing(lm_output, pooled_output,
  class BertModel (line 127) | class BertModel(MegatronModule):
    method __init__ (line 130) | def __init__(self,
    method set_input_tensor (line 171) | def set_input_tensor(self, input_tensor):
    method forward (line 175) | def forward(self, bert_model_input, attention_mask,
    method state_dict_for_save_checkpoint (line 222) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 243) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/biencoder_model.py
  function get_model_provider (line 19) | def get_model_provider(only_query_model=False, only_context_model=False,
  function biencoder_model_provider (line 37) | def biencoder_model_provider(only_query_model=False,
  class BiEncoderModel (line 65) | class BiEncoderModel(MegatronModule):
    method __init__ (line 68) | def __init__(self,
    method set_input_tensor (line 107) | def set_input_tensor(self, input_tensor):
    method forward (line 114) | def forward(self, query_tokens, query_attention_mask, query_types,
    method embed_text (line 136) | def embed_text(model, tokens, attention_mask, token_types):
    method state_dict_for_save_checkpoint (line 143) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 163) | def load_state_dict(self, state_dict, strict=True):
    method init_state_dict_from_bert (line 180) | def init_state_dict_from_bert(self):
  class PretrainedBertModel (line 246) | class PretrainedBertModel(MegatronModule):
    method __init__ (line 250) | def __init__(self, num_tokentypes=2,
    method forward (line 280) | def forward(self, input_ids, attention_mask, tokentype_ids=None):
    method state_dict_for_save_checkpoint (line 304) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 320) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/classification.py
  class Classification (line 17) | class Classification(MegatronModule):
    method __init__ (line 19) | def __init__(self,
    method set_input_tensor (line 48) | def set_input_tensor(self, input_tensor):
    method forward (line 52) | def forward(self, model_input, attention_mask, tokentype_ids=None):
    method state_dict_for_save_checkpoint (line 76) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 89) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/enums.py
  class LayerType (line 5) | class LayerType(enum.Enum):
  class AttnType (line 9) | class AttnType(enum.Enum):
  class AttnMaskType (line 13) | class AttnMaskType(enum.Enum):

FILE: megatron/legacy/model/fused_bias_gelu.py
  function bias_gelu (line 16) | def bias_gelu(bias, y):
  function bias_gelu_back (line 24) | def bias_gelu_back(g, bias, y):
  class GeLUFunction (line 31) | class GeLUFunction(torch.autograd.Function):
    method forward (line 34) | def forward(ctx, input, bias):
    method backward (line 39) | def backward(ctx, grad_output):

FILE: megatron/legacy/model/fused_layer_norm.py
  class MixedFusedLayerNorm (line 31) | class MixedFusedLayerNorm(torch.nn.Module):
    method __init__ (line 33) | def __init__(self, normalized_shape, eps=1e-5,
    method reset_parameters (line 69) | def reset_parameters(self):
    method forward (line 78) | def forward(self, input):

FILE: megatron/legacy/model/fused_softmax.py
  class ScaledUpperTriangMaskedSoftmax (line 9) | class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    method forward (line 18) | def forward(ctx, inputs, scale):
    method backward (line 33) | def backward(ctx, output_grads):
  class ScaledMaskedSoftmax (line 47) | class ScaledMaskedSoftmax(torch.autograd.Function):
    method forward (line 56) | def forward(ctx, inputs, mask, scale):
    method backward (line 69) | def backward(ctx, output_grads):
  class ScaledSoftmax (line 83) | class ScaledSoftmax(torch.autograd.Function):
    method forward (line 91) | def forward(ctx, inputs, scale):
    method backward (line 106) | def backward(ctx, output_grads):
  class FusedScaleMaskSoftmax (line 120) | class FusedScaleMaskSoftmax(nn.Module):
    method __init__ (line 134) | def __init__(
    method forward (line 161) | def forward(self, input, mask):
    method is_kernel_available (line 170) | def is_kernel_available(self, mask, b, np, sq, sk):
    method forward_fused_softmax (line 192) | def forward_fused_softmax(self, input, mask):
    method forward_torch_softmax (line 210) | def forward_torch_softmax(self, input, mask):
    method get_batch_per_block (line 228) | def get_batch_per_block(sq, sk, b, np):

FILE: megatron/legacy/model/gpt_model.py
  function post_language_model_processing (line 18) | def post_language_model_processing(lm_output, labels, logit_weights,
  class GPTModel (line 45) | class GPTModel(MegatronModule):
    method __init__ (line 48) | def __init__(self,
    method set_input_tensor (line 74) | def set_input_tensor(self, input_tensor):
    method forward (line 78) | def forward(self, input_ids, position_ids, attention_mask,
    method state_dict_for_save_checkpoint (line 98) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 111) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/language_model.py
  function parallel_lm_logits (line 22) | def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, ...
  function get_language_model (line 52) | def get_language_model(
  class Pooler (line 91) | class Pooler(MegatronModule):
    method __init__ (line 103) | def __init__(self, hidden_size, init_method):
    method forward (line 109) | def forward(self, hidden_states, sequence_index=0):
  class Embedding (line 126) | class Embedding(MegatronModule):
    method __init__ (line 140) | def __init__(
    method zero_parameters (line 192) | def zero_parameters(self):
    method add_tokentype_embeddings (line 203) | def add_tokentype_embeddings(self, num_tokentypes):
    method forward (line 218) | def forward(self, input_ids, position_ids, tokentype_ids=None):
    method state_dict_for_save_checkpoint (line 255) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 273) | def load_state_dict(self, state_dict, strict=True):
  class TransformerLanguageModel (line 319) | class TransformerLanguageModel(MegatronModule):
    method __init__ (line 332) | def __init__(
    method set_input_tensor (line 441) | def set_input_tensor(self, input_tensor):
    method forward (line 471) | def forward(
    method state_dict_for_save_checkpoint (line 555) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 584) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/module.py
  function param_is_not_shared (line 18) | def param_is_not_shared(param):
  class MegatronModule (line 22) | class MegatronModule(torch.nn.Module):
    method __init__ (line 26) | def __init__(self, config=None, share_embeddings_and_output_weights=Tr...
    method state_dict_for_save_checkpoint (line 31) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method shared_embedding_or_output_weight (line 36) | def shared_embedding_or_output_weight(self):
    method initialize_word_embeddings (line 47) | def initialize_word_embeddings(self):
  function conversion_helper (line 127) | def conversion_helper(val, conversion):
  function fp32_to_float16 (line 138) | def fp32_to_float16(val, float16_convertor):
  function float16_to_fp32 (line 152) | def float16_to_fp32(val):

FILE: megatron/legacy/model/multiple_choice.py
  class MultipleChoice (line 17) | class MultipleChoice(MegatronModule):
    method __init__ (line 19) | def __init__(self,
    method set_input_tensor (line 45) | def set_input_tensor(self, input_tensor):
    method forward (line 49) | def forward(self, model_input, attention_mask, tokentype_ids=None):
    method state_dict_for_save_checkpoint (line 87) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 100) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/realm_model.py
  function general_ict_model_provider (line 18) | def general_ict_model_provider(only_query_model=False, only_block_model=...
  class ICTBertModel (line 39) | class ICTBertModel(MegatronModule):
    method __init__ (line 41) | def __init__(self,
    method forward (line 67) | def forward(self, query_tokens, query_attention_mask, block_tokens, bl...
    method embed_query (line 73) | def embed_query(self, query_tokens, query_attention_mask):
    method embed_block (line 82) | def embed_block(self, block_tokens, block_attention_mask):
    method state_dict_for_save_checkpoint (line 91) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 106) | def load_state_dict(self, state_dict, strict=True):
    method init_state_dict_from_bert (line 118) | def init_state_dict_from_bert(self):
  class IREncoderBertModel (line 148) | class IREncoderBertModel(MegatronModule):
    method __init__ (line 150) | def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=Tr...
    method forward (line 170) | def forward(self, input_ids, attention_mask, tokentype_ids=None):
    method state_dict_for_save_checkpoint (line 185) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 198) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/rms_norm.py
  class RMSNorm (line 6) | class RMSNorm(torch.nn.Module):
    method __init__ (line 8) | def __init__(self,
    method _norm (line 27) | def _norm(self, x):
    method forward (line 30) | def forward(self, x):

FILE: megatron/legacy/model/t5_model.py
  function t5_extended_attention_mask (line 19) | def t5_extended_attention_mask(attention_mask_list):
  function t5_position_ids (line 29) | def t5_position_ids(token_ids):
  class T5LMHead (line 39) | class T5LMHead(MegatronModule):
    method __init__ (line 47) | def __init__(self, mpu_vocab_size, parallel_output):
    method forward (line 56) | def forward(self, hidden_states, word_embeddings_weight):
  class T5Model (line 64) | class T5Model(MegatronModule):
    method __init__ (line 67) | def __init__(self,
    method set_input_tensor (line 112) | def set_input_tensor(self, input_tensor):
    method forward (line 116) | def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_m...
    method state_dict_for_save_checkpoint (line 165) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 184) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/transformer.py
  class DropPath (line 76) | class DropPath(MegatronModule):
    method __init__ (line 81) | def __init__(self, drop_prob=0.):
    method forward (line 85) | def forward(self, hidden_state):
  class ParallelMLP (line 98) | class ParallelMLP(MegatronModule):
    method __init__ (line 106) | def __init__(self, config, is_expert=False):
    method forward (line 161) | def forward(self, hidden_states):
  function sinkhorn (line 179) | def sinkhorn(cost, tol=0.0001):
  function get_router_linear_layer (line 195) | def get_router_linear_layer(config):
  class SwitchMLP (line 204) | class SwitchMLP(MegatronModule):
    method __init__ (line 208) | def __init__(self, config):
    method gather_indices (line 227) | def gather_indices(self, local_indices):
    method forward (line 245) | def forward(self, hidden_states):
  class CoreAttention (line 317) | class CoreAttention(MegatronModule):
    method __init__ (line 319) | def __init__(self, layer_number, config,
    method forward (line 363) | def forward(self, query_layer, key_layer,
  class FlashSelfAttention (line 452) | class FlashSelfAttention(torch.nn.Module):
    method __init__ (line 462) | def __init__(self, causal=False, softmax_scale=None, attention_dropout...
    method forward (line 472) | def forward(self, q, k, v):
  class ParallelAttention (line 514) | class ParallelAttention(MegatronModule):
    method __init__ (line 521) | def __init__(self, config, layer_number,
    method _checkpointed_attention_forward (line 625) | def _checkpointed_attention_forward(self, query_layer, key_layer,
    method _allocate_memory (line 648) | def _allocate_memory(self, inference_max_sequence_len, batch_size, num...
    method forward (line 657) | def forward(self, hidden_states, attention_mask,
  function bias_dropout_add (line 844) | def bias_dropout_add(x, bias, residual, prob, training):
  function get_bias_dropout_add (line 853) | def get_bias_dropout_add(training):
  function bias_dropout_add_fused_train (line 860) | def bias_dropout_add_fused_train(x: torch.Tensor,
  function bias_dropout_add_fused_inference (line 868) | def bias_dropout_add_fused_inference(x: torch.Tensor,
  class ParallelTransformerLayer (line 875) | class ParallelTransformerLayer(MegatronModule):
    method __init__ (line 882) | def __init__(self, config,
    method default_decoder_cross_attention (line 927) | def default_decoder_cross_attention(self,
    method forward (line 963) | def forward(self, hidden_states, attention_mask,
  class NoopTransformerLayer (line 1076) | class NoopTransformerLayer(MegatronModule):
    method __init__ (line 1092) | def __init__(self, layer_number):
    method forward (line 1096) | def forward(self, hidden_states, attention_mask,
  function _get_num_layers (line 1102) | def _get_num_layers(args, model_type, is_decoder=False):
  class ParallelTransformer (line 1127) | class ParallelTransformer(MegatronModule):
    method __init__ (line 1130) | def __init__(self, config,
    method _get_layer (line 1308) | def _get_layer(self, layer_number):
    method _checkpointed_forward (line 1311) | def _checkpointed_forward(self, hidden_states, attention_mask,
    method set_input_tensor (line 1390) | def set_input_tensor(self, input_tensor):
    method forward (line 1400) | def forward(self, hidden_states, attention_mask,
    method load_state_dict (line 1505) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/legacy/model/utils.py
  function init_method_normal (line 13) | def init_method_normal(sigma):
  function scaled_init_method_normal (line 21) | def scaled_init_method_normal(sigma, num_layers):
  function attention_mask_func (line 31) | def attention_mask_func(attention_scores, attention_mask):
  function get_linear_layer (line 36) | def get_linear_layer(rows, columns, init_method):
  function gelu_impl (line 47) | def gelu_impl(x):
  function openai_gelu (line 52) | def openai_gelu(x):
  function erf_gelu (line 58) | def erf_gelu(x):
  function get_norm (line 62) | def get_norm(config):

FILE: megatron/legacy/model/vision/classification.py
  class VitClassificationModel (line 13) | class VitClassificationModel(MegatronModule):
    method __init__ (line 16) | def __init__(self, config, num_classes, finetune=False,
    method set_input_tensor (line 44) | def set_input_tensor(self, input_tensor):
    method forward (line 48) | def forward(self, input):
  class MitClassificationModel (line 57) | class MitClassificationModel(MegatronModule):
    method __init__ (line 60) | def __init__(self, num_classes,
    method _init_weights (line 72) | def _init_weights(self, m):
    method set_input_tensor (line 78) | def set_input_tensor(self, input_tensor):
    method forward (line 82) | def forward(self, input):

FILE: megatron/legacy/model/vision/dino.py
  class DINOLoss (line 23) | class DINOLoss(torch.nn.Module):
    method __init__ (line 24) | def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp,
    method forward (line 41) | def forward(self, student_output, teacher_output, iteration):
    method update_center (line 73) | def update_center(self, teacher_output):
  class DINOHead (line 82) | class DINOHead(torch.nn.Module):
    method __init__ (line 83) | def __init__(self, in_dim, out_dim, norm_last_layer=True, nlayers=3):
    method _init_weights (line 105) | def _init_weights(self, m):
    method forward (line 111) | def forward(self, x):
  class MultiCropWrapper (line 118) | class MultiCropWrapper(MegatronModule):
    method __init__ (line 128) | def __init__(self, backbone, head):
    method forward (line 135) | def forward(self, x):
  function cosine_scheduler (line 159) | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep,
  function get_student_backbone_and_num_features (line 176) | def get_student_backbone_and_num_features(config, pre_process=True, post...
  function get_teacher_backbone_and_num_features (line 198) | def get_teacher_backbone_and_num_features(config, pre_process=True, post...
  class DINOPretrainModel (line 219) | class DINOPretrainModel(MegatronModule):
    method __init__ (line 220) | def __init__(self, config, pre_process=True, post_process=True):
    method set_input_tensor (line 266) | def set_input_tensor(self, tensor):
    method forward (line 269) | def forward(self, input):
    method cancel_gradients_last_layer (line 278) | def cancel_gradients_last_layer(self, iteration):
    method update_momentum (line 286) | def update_momentum(self, iteration):

FILE: megatron/legacy/model/vision/esvit_swin_backbone.py
  class Mlp (line 25) | class Mlp(nn.Module):
    method __init__ (line 26) | def __init__(self, in_features, hidden_features=None,
    method forward (line 36) | def forward(self, x):
  function window_partition (line 45) | def window_partition(x, window_size):
  function window_reverse (line 59) | def window_reverse(windows, window_size, H, W):
  class WindowAttention (line 75) | class WindowAttention(nn.Module):
    method __init__ (line 88) | def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scal...
    method forward (line 122) | def forward(self, x, mask=None):
    method extra_repr (line 156) | def extra_repr(self) -> str:
    method flops (line 159) | def flops(self, N):
    method compute_macs (line 173) | def compute_macs(module, input, output):
  class SwinTransformerBlock (line 179) | class SwinTransformerBlock(nn.Module):
    method __init__ (line 197) | def __init__(self, dim, input_resolution, num_heads, window_size=7, sh...
    method create_attn_mask (line 229) | def create_attn_mask(self, H, W):
    method forward (line 255) | def forward(self, x):
    method extra_repr (line 313) | def extra_repr(self) -> str:
    method flops (line 317) | def flops(self):
  class PatchMerging (line 332) | class PatchMerging(nn.Module):
    method __init__ (line 340) | def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
    method forward (line 347) | def forward(self, x):
    method extra_repr (line 377) | def extra_repr(self) -> str:
    method flops (line 380) | def flops(self):
  class BasicLayer (line 387) | class BasicLayer(nn.Module):
    method __init__ (line 405) | def __init__(self, dim, input_resolution, depth, num_heads, window_size,
    method forward (line 429) | def forward(self, x):
    method forward_with_features (line 436) | def forward_with_features(self, x):
    method forward_with_attention (line 445) | def forward_with_attention(self, x):
    method extra_repr (line 455) | def extra_repr(self) -> str:
    method flops (line 458) | def flops(self):
  class PatchEmbed (line 467) | class PatchEmbed(nn.Module):
    method __init__ (line 471) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
    method forward (line 490) | def forward(self, x):
    method flops (line 499) | def flops(self):
  class SwinTransformer (line 506) | class SwinTransformer(nn.Module):
    method __init__ (line 530) | def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes...
    method _init_weights (line 580) | def _init_weights(self, m):
    method no_weight_decay (line 590) | def no_weight_decay(self):
    method no_weight_decay_keywords (line 594) | def no_weight_decay_keywords(self):
    method forward (line 598) | def forward(self, x):
    method forward_feature_maps (line 614) | def forward_feature_maps(self, x):
    method forward_selfattention (line 630) | def forward_selfattention(self, x, n=1):
    method forward_last_selfattention (line 644) | def forward_last_selfattention(self, x):
    method forward_all_selfattention (line 653) | def forward_all_selfattention(self, x):
    method forward_return_n_last_blocks (line 663) | def forward_return_n_last_blocks(self, x, n=1, return_patch_avgpool=Fa...
    method flops (line 705) | def flops(self):
    method init_weights (line 716) | def init_weights(self, pretrained='', pretrained_layers=[], verbose=Tr...
    method freeze_pretrained_layers (line 783) | def freeze_pretrained_layers(self, frozen_layers=[]):
  function get_swin (line 810) | def get_swin(is_teacher=False):

FILE: megatron/legacy/model/vision/inpainting.py
  class VitInpaintingModel (line 19) | class VitInpaintingModel(MegatronModule):
    method __init__ (line 21) | def __init__(self, config, pre_process=True, post_process=True):
    method set_input_tensor (line 48) | def set_input_tensor(self, input_tensor):
    method forward (line 51) | def forward(self, input):
  class MLP (line 70) | class MLP(torch.nn.Module):
    method __init__ (line 74) | def __init__(self, input_dim=2048, embed_dim=768):
    method forward (line 78) | def forward(self, x):
  class MitInpaintingModel (line 84) | class MitInpaintingModel(MegatronModule):
    method __init__ (line 87) | def __init__(self, pre_process=True, post_process=True):
    method set_input_tensor (line 115) | def set_input_tensor(self, input_tensor):
    method forward (line 119) | def forward(self, input):

FILE: megatron/legacy/model/vision/knn_monitor.py
  function build_data_loader (line 12) | def build_data_loader(dataset, drop_last=True, shuffle=False):
  function compute_feature_bank (line 38) | def compute_feature_bank(model):
  function get_feature_bank (line 96) | def get_feature_bank():
  function knn_predict (line 105) | def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, k...

FILE: megatron/legacy/model/vision/mit_backbone.py
  class Mlp (line 13) | class Mlp(nn.Module):
    method __init__ (line 14) | def __init__(self,
    method _init_weights (line 31) | def _init_weights(self, m):
    method forward (line 46) | def forward(self, x, H, W):
  class Attention (line 56) | class Attention(nn.Module):
    method __init__ (line 57) | def __init__(self,
    method _init_weights (line 86) | def _init_weights(self, m):
    method forward (line 101) | def forward(self, x, H, W):
  class Block (line 125) | class Block(nn.Module):
    method __init__ (line 127) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method _init_weights (line 143) | def _init_weights(self, m):
    method forward (line 158) | def forward(self, x, H, W):
  class OverlapPatchEmbed (line 165) | class OverlapPatchEmbed(nn.Module):
    method __init__ (line 169) | def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, e...
    method _init_weights (line 180) | def _init_weights(self, m):
    method forward (line 195) | def forward(self, x):
  class MixVisionTransformer (line 204) | class MixVisionTransformer(nn.Module):
    method __init__ (line 205) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe...
    method _init_weights (line 260) | def _init_weights(self, m):
    method reset_drop_path (line 275) | def reset_drop_path(self, drop_path_rate):
    method freeze_patch_emb (line 293) | def freeze_patch_emb(self):
    method forward_features (line 296) | def forward_features(self, x):
    method forward (line 335) | def forward(self, x):
  class DWConv (line 344) | class DWConv(nn.Module):
    method __init__ (line 345) | def __init__(self, dim=768):
    method forward (line 349) | def forward(self, x, H, W):
  class mit_b0 (line 357) | class mit_b0(MixVisionTransformer):
    method __init__ (line 358) | def __init__(self, **kwargs):
  class mit_b1 (line 365) | class mit_b1(MixVisionTransformer):
    method __init__ (line 366) | def __init__(self, **kwargs):
  class mit_b2 (line 373) | class mit_b2(MixVisionTransformer):
    method __init__ (line 374) | def __init__(self, **kwargs):
  class mit_b3 (line 381) | class mit_b3(MixVisionTransformer):
    method __init__ (line 382) | def __init__(self, **kwargs):
  class mit_b3_avg (line 388) | class mit_b3_avg(MixVisionTransformer):
    method __init__ (line 389) | def __init__(self, drop_path_rate=0.1, **kwargs):
  class mit_b4 (line 395) | class mit_b4(MixVisionTransformer):
    method __init__ (line 396) | def __init__(self, **kwargs):
  class mit_b5 (line 402) | class mit_b5(MixVisionTransformer):
    method __init__ (line 403) | def __init__(self, **kwargs):
  class mit_b5_avg (line 409) | class mit_b5_avg(MixVisionTransformer):
    method __init__ (line 410) | def __init__(self, drop_path_rate=0.1, **kwargs):

FILE: megatron/legacy/model/vision/swin_backbone.py
  class Mlp (line 19) | class Mlp(nn.Module):
    method __init__ (line 20) | def __init__(self, in_features, hidden_features=None,
    method forward (line 30) | def forward(self, x):
  function window_partition (line 39) | def window_partition(x, window_size):
  function window_reverse (line 54) | def window_reverse(windows, window_size, H, W):
  class WindowAttention (line 71) | class WindowAttention(nn.Module):
    method __init__ (line 85) | def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scal...
    method forward (line 119) | def forward(self, x, mask=None):
    method extra_repr (line 152) | def extra_repr(self) -> str:
    method flops (line 155) | def flops(self, N):
  class SwinTransformerBlock (line 169) | class SwinTransformerBlock(nn.Module):
    method __init__ (line 188) | def __init__(self, dim, input_resolution, num_heads, window_size=7, sh...
    method create_attn_mask (line 219) | def create_attn_mask(self, H, W):
    method forward (line 245) | def forward(self, x):
    method extra_repr (line 284) | def extra_repr(self) -> str:
    method flops (line 288) | def flops(self):
  class PatchMerging (line 303) | class PatchMerging(nn.Module):
    method __init__ (line 312) | def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
    method forward (line 319) | def forward(self, x):
    method extra_repr (line 342) | def extra_repr(self) -> str:
    method flops (line 345) | def flops(self):
  class BasicLayer (line 352) | class BasicLayer(nn.Module):
    method __init__ (line 372) | def __init__(self, dim, input_resolution, depth, num_heads, window_size,
    method forward (line 400) | def forward(self, x):
    method extra_repr (line 411) | def extra_repr(self) -> str:
    method flops (line 414) | def flops(self):
  class PatchEmbed (line 423) | class PatchEmbed(nn.Module):
    method __init__ (line 434) | def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=9...
    method forward (line 453) | def forward(self, x):
    method flops (line 463) | def flops(self):
  class SwinTransformer (line 471) | class SwinTransformer(nn.Module):
    method __init__ (line 496) | def __init__(self, img_size=224, patch_size=4, in_chans=3,
    method _init_weights (line 552) | def _init_weights(self, m):
    method no_weight_decay (line 562) | def no_weight_decay(self):
    method no_weight_decay_keywords (line 566) | def no_weight_decay_keywords(self):
    method forward (line 569) | def forward(self, x):
    method flops (line 595) | def flops(self):
  function get_swin (line 605) | def get_swin(drop_path_rate=0.3, output_avg=False):

FILE: megatron/legacy/model/vision/utils.py
  function resize (line 7) | def resize(input,

FILE: megatron/legacy/model/vision/vit_backbone.py
  class VitMlpHead (line 21) | class VitMlpHead(MegatronModule):
    method __init__ (line 33) | def __init__(self, config, hidden_size, num_classes):
    method forward (line 41) | def forward(self, hidden_states):
  function isPerfectSquare (line 50) | def isPerfectSquare(x):
  function twod_interpolate_position_embeddings_hook (line 57) | def twod_interpolate_position_embeddings_hook(
  class VitBackbone (line 130) | class VitBackbone(MegatronModule):
    method __init__ (line 133) | def __init__(self,
    method set_input_tensor (line 208) | def set_input_tensor(self, input_tensor):
    method forward (line 212) | def forward(self, input):

FILE: megatron/post_training/arguments.py
  function add_modelopt_args (line 4) | def add_modelopt_args(parser):

FILE: megatron/post_training/checkpointing.py
  function has_modelopt_state (line 23) | def has_modelopt_state(checkpoint_path: str) -> bool:
  function get_sharded_load_dir (line 55) | def get_sharded_load_dir(load_dir: str) -> Tuple[Union[Path, None], str]:
  function load_modelopt_state (line 92) | def load_modelopt_state(model: nn.Module, load_dir: Optional[str] = None...
  function load_modelopt_checkpoint (line 129) | def load_modelopt_checkpoint(

FILE: megatron/post_training/generate.py
  function simple_generate (line 15) | def simple_generate(
  function simple_speculative_generate (line 106) | def simple_speculative_generate(

FILE: megatron/post_training/loss_func.py
  function _mask_loss (line 13) | def _mask_loss(output_tensor, loss_mask):
  function loss_func (line 39) | def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, mode...

FILE: megatron/post_training/model_builder.py
  function count_parameters_in_layer (line 31) | def count_parameters_in_layer(model, layer_name):
  function _add_load_convert_hooks (line 40) | def _add_load_convert_hooks(model: MCoreGPTModel):
  function _load_teacher_model_config (line 48) | def _load_teacher_model_config(checkpoint_path: str) -> Namespace:
  function _load_teacher_model (line 117) | def _load_teacher_model(config, config_raw: Namespace, model_kwargs: Dic...
  function modelopt_gpt_mamba_builder (line 161) | def modelopt_gpt_mamba_builder(

FILE: megatron/post_training/non_loss_data_func.py
  function report_draft_acceptance_length (line 11) | def report_draft_acceptance_length(model, osl: int = 64, draft_steps: in...

FILE: megatron/post_training/utils.py
  function modelopt_version_higher_than (line 16) | def modelopt_version_higher_than(target_version: str):
  function modelopt_version_at_least (line 27) | def modelopt_version_at_least(target_version: str):
  function function_has_parameter (line 39) | def function_has_parameter(function, argument_name: str) -> bool:
  function get_current_memory_info (line 44) | def get_current_memory_info():
  function report_current_memory_info (line 57) | def report_current_memory_info():
  function get_mtbench_chat_data (line 63) | def get_mtbench_chat_data():
  function to_empty_if_meta (line 81) | def to_empty_if_meta(module: torch.nn.Module, *, device: torch.device, r...
  function print_distributed_quant_summary (line 101) | def print_distributed_quant_summary(model, msg=""):

FILE: megatron/rl/__init__.py
  function import_class (line 16) | def import_class(class_path: str) -> Type:
  class TypeLookupable (line 38) | class TypeLookupable(BaseModel, extra='allow'):
    method unwrap (line 43) | def unwrap(self) -> Self:
    method register_subclass (line 48) | def register_subclass(cls, register_type: Type[Self]) -> Type[Self]:
  class GenericGenerationArgs (line 60) | class GenericGenerationArgs(BaseModel):
    method add (line 69) | def add(self, generation_args: 'GenericGenerationArgs') -> 'GenericGen...
  class Request (line 75) | class Request(BaseModel):

FILE: megatron/rl/agent/api.py
  class AgentBaseModel (line 23) | class AgentBaseModel(BaseModel, extra='allow'):
  class RolloutRequest (line 27) | class RolloutRequest(Request):
  class GroupedRolloutRequest (line 35) | class GroupedRolloutRequest(Request):
  class Rollout (line 47) | class Rollout(AgentBaseModel):
  class TokenRollout (line 60) | class TokenRollout(AgentBaseModel):
  class RolloutGroup (line 77) | class RolloutGroup(AgentBaseModel):
    method __iter__ (line 84) | def __iter__(self):
    method __len__ (line 87) | def __len__(self):
    method __getitem__ (line 90) | def __getitem__(self, idx):
  class ContrastiveRollout (line 97) | class ContrastiveRollout(AgentBaseModel):
  class Head2HeadRolloutRequest (line 104) | class Head2HeadRolloutRequest(Request):
  class EvaluationRequest (line 110) | class EvaluationRequest(Request):
  class EvaluationResult (line 121) | class EvaluationResult(AgentBaseModel):
  class RewardEvaluationResult (line 126) | class RewardEvaluationResult(EvaluationResult):
  class EvaluationResponse (line 134) | class EvaluationResponse(AgentBaseModel, TypeLookupable, Generic[T]):
    method metrics (line 138) | def metrics(self):
  class Agent (line 142) | class Agent(ABC, AgentBaseModel):
  class RolloutGenerator (line 146) | class RolloutGenerator(Agent, ABC):
    method rollout (line 150) | async def rollout(self, request: RolloutRequest) -> Rollout: ...
    method get_reward_rollouts (line 152) | async def get_reward_rollouts(self, request: RolloutRequest) -> list[R...
  class ContrastiveRolloutGenerator (line 162) | class ContrastiveRolloutGenerator(Agent, ABC):
    method get_contrastive_rollouts (line 166) | async def get_contrastive_rollouts(
  class TokenizedRolloutGenerator (line 171) | class TokenizedRolloutGenerator(Agent, ABC):
    method rollout (line 179) | async def rollout(self, request: RolloutRequest) -> TokenRollout: ...
    method get_reward_rollouts (line 181) | async def get_reward_rollouts(self, request: RolloutRequest) -> list[T...
  class GroupedRolloutGenerator (line 191) | class GroupedRolloutGenerator(Agent, ABC):
    method __init__ (line 197) | def __init__(self, *, parallel_generation_tasks: int | None = None, **...
    method group_rollout (line 203) | async def group_rollout(self, request: GroupedRolloutRequest) -> list[...
    method get_grouped_rollouts (line 205) | async def get_grouped_rollouts(self, request: GroupedRolloutRequest):
  class EvaluationAgent (line 302) | class EvaluationAgent(Agent, ABC):
    method run_evaluation (line 306) | async def run_evaluation(self, request: EvaluationRequest) -> Evaluati...

FILE: megatron/rl/agent/huggingface_dataset_agent.py
  class HFDatasetAgent (line 7) | class HFDatasetAgent(BaseModel):
    method __init__ (line 22) | def __init__(self, **data):
    method load_hf_dataset (line 26) | def load_hf_dataset(self):

FILE: megatron/rl/agent/pass_at_evaluation_agent.py
  function pass_at_k (line 14) | def pass_at_k(n_samples: int, n_correct: int, k: int) -> float:
  class PassAtEvaluationResult (line 26) | class PassAtEvaluationResult(RewardEvaluationResult):
  class PassAtEvaluationResponse (line 34) | class PassAtEvaluationResponse(EvaluationResponse[PassAtEvaluationResult]):
    method metrics (line 37) | def metrics(self):
  class PassAtEvaluationAgent (line 47) | class PassAtEvaluationAgent(EvaluationAgent, ABC):
    method __init__ (line 49) | def __init__(self, max_k=32, **kwargs):
    method _evaluation (line 54) | async def _evaluation(
    method evaluation (line 58) | async def evaluation(

FILE: megatron/rl/agent/remote_agent.py
  class RemoteAgent (line 7) | class RemoteAgent(FastAPIEnvServer, RolloutGenerator, GroupedRolloutGene...

FILE: megatron/rl/agent/reward_only_agent.py
  class RewardOnlyEvaluationResponse (line 30) | class RewardOnlyEvaluationResponse(EvaluationResponse[RewardEvaluationRe...
    method metrics (line 33) | def metrics(self):
  class RewardOnlyAgent (line 37) | class RewardOnlyAgent(RolloutGenerator, GroupedRolloutGenerator, PassAtE...
    method get_dataset (line 42) | def get_dataset(self, validation: bool = False):
    method get_reward (line 46) | async def get_reward(self, response: str, golden: Any) -> float:
    method get_prompt (line 50) | async def get_prompt(self, validation: bool) -> tuple[str, Any]:
    method evaluation_prompts (line 54) | async def evaluation_prompts(
    method _get_rank_subset (line 60) | def _get_rank_subset(
    method rollout_from_response (line 84) | async def rollout_from_response(
    method rollout (line 124) | async def rollout(self, request: RolloutRequest) -> Rollout:
    method group_rollout (line 136) | async def group_rollout(self, request: GroupedRolloutRequest) -> list[...
    method _evaluation (line 147) | async def _evaluation(
    method run_evaluation (line 168) | async def run_evaluation(self, request: EvaluationRequest):

FILE: megatron/rl/agent/weighted_multi_task.py
  class AgentConfig (line 24) | class AgentConfig(AgentBaseModel):
    method __init__ (line 32) | def __init__(self, **data):
  class WeightedMultiTask (line 38) | class WeightedMultiTask(
    method __init__ (line 43) | def __init__(self, agent_configs: list[AgentConfig]):
    method from_config (line 69) | def from_config(
    method _distribute_counts (line 106) | def _distribute_counts(self, total_count: int, distribute_remainder: b...
    method group_rollout (line 156) | async def group_rollout(self, request: GroupedRolloutRequest) -> list[...
    method rollout (line 161) | async def rollout(self, request: RolloutRequest) -> Rollout:
    method get_reward_rollouts (line 166) | async def get_reward_rollouts(self, request: RolloutRequest) -> list[R...
    method get_grouped_rollouts (line 186) | async def get_grouped_rollouts(self, request: GroupedRolloutRequest):
    method get_contrastive_rollouts (line 246) | async def get_contrastive_rollouts(self, request: RolloutRequest) -> l...
    method run_evaluation (line 271) | async def run_evaluation(self, request: EvaluationRequest) -> list[Eva...

FILE: megatron/rl/inference/api.py
  class LLMChatMessage (line 8) | class LLMChatMessage(BaseModel):
  class InferenceRequest (line 13) | class InferenceRequest(Request):
  class InferenceResponse (line 18) | class InferenceResponse(BaseModel):

FILE: megatron/rl/inference/inference_interface.py
  class InferenceInterface (line 15) | class InferenceInterface(BaseModel):
    class Config (line 18) | class Config:
    method prepare_request (line 21) | def prepare_request(
    method base_generate (line 27) | async def base_generate(self, request: InferenceRequest) -> InferenceR...
    method agenerate (line 30) | async def agenerate(
    method generate (line 35) | def generate(
  class ReturnsRaw (line 45) | class ReturnsRaw(InferenceInterface):
  class ReturnsTokens (line 51) | class ReturnsTokens(InferenceInterface):
  class ReturnsLogProbs (line 57) | class ReturnsLogProbs(ReturnsTokens):

FILE: megatron/rl/inference/megatron.py
  class MegatronLocal (line 36) | class MegatronLocal(InferenceServer, ReturnsTokens, ReturnsRaw):
    method base_generate (line 47) | async def base_generate(self, request: InferenceRequest) -> InferenceR...
    method launch (line 85) | async def launch(cls, model: GPTModel, **kwargs):
    method kill (line 147) | async def kill(self):
    method set_generation_epoch (line 168) | def set_generation_epoch(self, generation_epoch: int):
    method suspend (line 172) | async def suspend(self):
    method resume (line 181) | async def resume(self):

FILE: megatron/rl/logging.py
  function log (line 18) | def log(message):

FILE: megatron/rl/parallel_utils.py
  function build_inference_pg_collection (line 16) | def build_inference_pg_collection(

FILE: megatron/rl/rl_utils.py
  function _torch_saver_swap_inference_model (line 112) | def _torch_saver_swap_inference_model(*, to_cpu: bool) -> None:
  function _maybe_prefetch_separate_inference_model_weights (line 145) | def _maybe_prefetch_separate_inference_model_weights(model_core, *, to_c...
  function verify_model_weights_swap (line 179) | def verify_model_weights_swap(
  class RolloutStats (line 273) | class RolloutStats:
  class RLRuntimeState (line 296) | class RLRuntimeState:
    method __init__ (line 299) | def __init__(self):
    method reset_iteration_counters (line 305) | def reset_iteration_counters(self, iteration):
    method increment_sequences (line 310) | def increment_sequences(self, count):
  function get_rl_runtime_state (line 320) | def get_rl_runtime_state():
  function update_inference_logprobs_group_stats (line 325) | def update_inference_logprobs_group_stats(
  function align_unpacked_inference_logprobs (line 359) | def align_unpacked_inference_logprobs(
  function get_agent (line 424) | def get_agent(args, parallel_generation_tasks: int | None = None):
  function get_inference_interface (line 442) | def get_inference_interface(args, loop, model):
  function get_rollout_generator (line 458) | def get_rollout_generator(args, inference_interface, n_prompts, samples_...
  function get_environment_rollouts (line 480) | def get_environment_rollouts(
  function selective_log_softmax (line 596) | def selective_log_softmax(logits, index):
  function get_logprobs (line 637) | def get_logprobs(model, tokens, position_ids, no_grad=False, sequence_pa...
  function calculate_grpo_advantages (line 717) | def calculate_grpo_advantages(rewards: list[list[float]], num_turns: lis...
  function compute_group_stats (line 744) | def compute_group_stats(
  function prep_wandb_metrics (line 845) | def prep_wandb_metrics(
  function maybe_log_training_metrics (line 956) | def maybe_log_training_metrics(
  function prepare_trajectories (line 1037) | def prepare_trajectories(
  function logprobs_forward_step (line 1164) | def logprobs_forward_step(data_iterator, model, is_correction, packing_c...
  function compute_logprobs_batch (line 1195) | def compute_logprobs_batch(
  function prepare_data_for_update (line 1245) | def prepare_data_for_update(
  function get_grpo_data_iterator (line 1518) | def get_grpo_data_iterator(
  function evaluate_and_print_results_rl (line 1590) | def evaluate_and_print_results_rl(
  function calculate_grpo_loss (line 1703) | def calculate_grpo_loss(
  function megatron_rl_inference_mode (line 1798) | def megatron_rl_inference_mode(
  function rl_inference_interface_shutdown (line 1920) | def rl_inference_interface_shutdown():
  function get_iteration_sequence_count (line 1943) | def get_iteration_sequence_count(args):
  function _pad_nonnull_with_zeros (line 1953) | def _pad_nonnull_with_zeros(data: list[Optional[torch.Tensor]], max_len:...

FILE: megatron/rl/sequence_packing_utils.py
  class PackingInfo (line 25) | class PackingInfo:
  class PackingContext (line 43) | class PackingContext:
  function load_packed_data_by_index (line 74) | def load_packed_data_by_index(bin_idx: int, packing_context: PackingCont...
  function log_packing_efficiency (line 140) | def log_packing_efficiency(packing_context: PackingContext):
  function get_actual_sequence_lengths (line 282) | def get_actual_sequence_lengths(sequences: torch.Tensor, pad_token: int)...
  function create_empty_bins (line 311) | def create_empty_bins(
  function get_default_packed_seq_params (line 398) | def get_default_packed_seq_params(seq_length: int, max_sequences_per_bin...
  function create_packed_seq_params (line 434) | def create_packed_seq_params(packing_context: PackingContext):
  function create_packed_seq_params_for_bin (line 451) | def create_packed_seq_params_for_bin(
  function pack_inference_logprobs (line 511) | def pack_inference_logprobs(
  function compute_packed_inference_logprobs_stats (line 582) | def compute_packed_inference_logprobs_stats(
  class SequencePacker (line 623) | class SequencePacker:
    method __init__ (line 626) | def __init__(self, bin_size: int, pad_token: int, max_sequences_per_bi...
    method pack_sequences (line 631) | def pack_sequences(
  function distribute_packed_bins (line 800) | def distribute_packed_bins(
  function pack_all_trajectories (line 973) | def pack_all_trajectories(trajs, generation_masks, inference_logprobs, g...
  function update_microbatch_calculator (line 1067) | def update_microbatch_calculator(
  function get_microbatch_dataloader (line 1154) | def get_microbatch_dataloader(num_bins_this_rank, micro_batch_size):
  function get_sequence_packing_log_info (line 1159) | def get_sequence_packing_log_info(args):
  function get_sequence_packing_tensorboard_metrics (line 1166) | def get_sequence_packing_tensorboard_metrics(args):

FILE: megatron/rl/server/agent/fastapi_env_server.py
  class FastAPIEnvServer (line 43) | class FastAPIEnvServer(EnvironmentServer):
    method launch (line 49) | async def launch(cls, env_cls: type[Agent], cls_args: dict, port: int,...
    method kill (line 101) | def kill(self):
    method get_contrastive_rollouts (line 104) | async def get_contrastive_rollouts(self, request: RolloutRequest) -> l...
    method group_rollout (line 119) | async def group_rollout(self, request: GroupedRolloutRequest):
    method get_grouped_rollouts (line 124) | async def get_grouped_rollouts(
    method rollout (line 141) | async def rollout(self, request: RolloutRequest) -> TokenRollout:
    method get_reward_rollouts (line 146) | async def get_reward_rollouts(self, request: RolloutRequest) -> list[T...
    method run_evaluation (line 159) | async def run_evaluation(self, request: EvaluationRequest) -> Evaluati...
  function run (line 173) | def run(agent_cls: type[Agent], cls_args: dict, port: int):

FILE: megatron/rl/server/api.py
  class Server (line 11) | class Server(TypeLookupable):
    method launch (line 15) | async def launch(cls) -> Self:
    method suspend (line 18) | async def suspend(self):
    method resume (line 21) | async def resume(self):
    method kill (line 24) | async def kill(self):
  class InferenceServer (line 28) | class InferenceServer(Server, InferenceInterface):
  class EnvironmentServer (line 34) | class EnvironmentServer(Server):
  class RemoteRolloutRequest (line 40) | class RemoteRolloutRequest(RolloutRequest):
  class RemoteGroupedRolloutRequest (line 44) | class RemoteGroupedRolloutRequest(GroupedRolloutRequest):
  class RemoteEvaluationRequest (line 48) | class RemoteEvaluationRequest(EvaluationRequest):

FILE: megatron/rl/server/inference/inference_interface_server.py
  class InferenceInterfaceClient (line 25) | class InferenceInterfaceClient(InferenceServer):
    method base_generate (line 30) | async def base_generate(self, request: InferenceRequest) -> InferenceR...
  class InferenceInterfaceServer (line 39) | class InferenceInterfaceServer(InferenceInterfaceClient, ReturnsRaw, Ret...
    method launch (line 47) | async def launch(cls, interface_cls: type[InferenceInterface], **kwarg...
    method kill (line 82) | async def kill(self):
    method suspend (line 89) | async def suspend(self):
    method resume (line 93) | async def resume(self):

FILE: megatron/training/argument_utils.py
  class TypeInferenceError (line 17) | class TypeInferenceError(Exception):
  class ArgumentGroupFactory (line 21) | class ArgumentGroupFactory:
    method __init__ (line 66) | def __init__(self, src_cfg_class: type, exclude: Optional[list[str]] =...
    method _format_arg_name (line 71) | def _format_arg_name(self, config_attr_name: str, prefix: Optional[str...
    method _get_enum_kwargs (line 85) | def _get_enum_kwargs(self, config_type: enum.EnumMeta) -> dict[str, Any]:
    method _extract_type (line 96) | def _extract_type(self, config_type: type) -> dict[str, Any]:
    method _build_argparse_kwargs_from_field (line 136) | def _build_argparse_kwargs_from_field(self, attribute: Field) -> dict[...
    method build_group (line 190) | def build_group(self, parser: ArgumentParser, title: Optional[str] = N...
    method _get_field_docstrings (line 209) | def _get_field_docstrings(self, src_cfg_class: type) -> dict[str, str]:

FILE: megatron/training/arguments.py
  function add_megatron_arguments (line 49) | def add_megatron_arguments(parser: argparse.ArgumentParser):
  function parse_args (line 88) | def parse_args(extra_args_provider=None, ignore_unknown_args=False):
  function validate_model_config_args_from_heterogeneous_config (line 126) | def validate_model_config_args_from_heterogeneous_config(args):
  function _eval_pattern (line 199) | def _eval_pattern(pattern):
  function no_rope_freq_type (line 209) | def no_rope_freq_type(x):
  function moe_freq_type (line 232) | def moe_freq_type(x):
  function la_freq_type (line 257) | def la_freq_type(x):
  function tuple_type (line 282) | def tuple_type(x):
  function validate_args (line 294) | def validate_args(args, defaults={}):
  function _print_args (line 1623) | def _print_args(title, args):
  function _check_arg_is_not_none (line 1637) | def _check_arg_is_not_none(args, arg):
  function core_transformer_config_from_args (line 1641) | def core_transformer_config_from_args(args, config_class=None):
  function _add_transformer_engine_args (line 1729) | def _add_transformer_engine_args(parser):
  function _add_inference_args (line 1743) | def _add_inference_args(parser):
  function _add_network_size_args (line 1922) | def _add_network_size_args(parser):
  function _add_straggler_detector_args (line 2087) | def _add_straggler_detector_args(parser):
  function _add_workload_inspector_server_args (line 2095) | def _add_workload_inspector_server_args(parser):
  function _add_inprocess_restart_args (line 2101) | def _add_inprocess_restart_args(parser):
  function _add_one_logger_args (line 2150) | def _add_one_logger_args(parser):
  function _add_ft_package_args (line 2179) | def _add_ft_package_args(parser):
  function _add_logging_args (line 2195) | def _add_logging_args(parser):
  function _add_regularization_args (line 2204) | def _add_regularization_args(parser):
  function _add_rl_args (line 2264) | def _add_rl_args(parser):
  function _add_training_args (line 2412) | def _add_training_args(parser):
  function _add_rerun_machine_args (line 2512) | def _add_rerun_machine_args(parser):
  function _add_initialization_args (line 2521) | def _add_initialization_args(parser):
  function _add_learning_rate_args (line 2533) | def _add_learning_rate_args(parser):
  function _add_checkpointing_args (line 2558) | def _add_checkpointing_args(parser):
  function _add_mixed_precision_args (line 2588) | def _add_mixed_precision_args(parser):
  function _add_distributed_args (line 2619) | def _add_distributed_args(parser):
  function _add_validation_args (line 2738) | def _add_validation_args(parser):
  function _add_tokenizer_args (line 2747) | def _add_tokenizer_args(parser):
  function _add_data_args (line 2806) | def _add_data_args(parser):
  function _add_autoresume_args (line 2930) | def _add_autoresume_args(parser):
  function _add_biencoder_args (line 2942) | def _add_biencoder_args(parser):
  function _add_vision_args (line 2999) | def _add_vision_args(parser):
  function _add_moe_args (line 3066) | def _add_moe_args(parser):
  function _add_mla_args (line 3097) | def _add_mla_args(parser):
  function _add_experimental_attention_variant_args (line 3127) | def _add_experimental_attention_variant_args(parser):
  function _add_heterogeneous_args (line 3141) | def _add_heterogeneous_args(parser):
  function _add_experimental_args (line 3191) | def _add_experimental_args(parser):
  function _add_msc_args (line 3254) | def _add_msc_args(parser):
  function _add_kitchen_quantization_arguments (line 3260) | def _add_kitchen_quantization_arguments(parser: argparse.ArgumentParser):
  function _add_sft_args (line 3290) | def _add_sft_args(parser):

FILE: megatron/training/async_utils.py
  function init_persistent_async_worker (line 25) | def init_persistent_async_worker(rank: int, mp_mode: str = 'spawn'):
  function schedule_async_save (line 47) | def schedule_async_save(async_request: AsyncRequest):
  function maybe_finalize_async_save (line 56) | def maybe_finalize_async_save(blocking: bool = False, terminate=False):
  function is_empty_async_queue (line 84) | def is_empty_async_queue() -> bool:
  function reset_persistent_async_worker (line 93) | def reset_persistent_async_worker():

FILE: megatron/training/checkpointing.py
  function finalize_deletion_processes (line 78) | def finalize_deletion_processes(blocking=False):
  function set_checkpoint_version (line 101) | def set_checkpoint_version(value):
  function get_checkpoint_version (line 109) | def get_checkpoint_version():
  function set_loaded_iteration (line 114) | def set_loaded_iteration(value):
  function get_loaded_iteration (line 124) | def get_loaded_iteration():
  function check_checkpoint_args (line 130) | def check_checkpoint_args(checkpoint_args):
  function isfile (line 172) | def isfile(filename) -> bool:
  function ensure_directory_exists (line 180) | def ensure_directory_exists(filename, check_parent=True):
  function get_checkpoint_name (line 190) | def get_checkpoint_name(checkpoints_path, iteration, release=False,
  function get_load_checkpoint_path_by_args (line 232) | def get_load_checkpoint_path_by_args(args, load_arg="load"):
  function get_distributed_optimizer_checkpoint_name (line 249) | def get_distributed_optimizer_checkpoint_name(model_checkpoint_name):
  function find_checkpoint_rank_0 (line 254) | def find_checkpoint_rank_0(checkpoints_path, iteration, release=False):
  function get_checkpoint_tracker_filename (line 305) | def get_checkpoint_tracker_filename(checkpoints_path):
  function checkpoint_exists (line 312) | def checkpoint_exists(checkpoints_path):
  function read_metadata (line 319) | def read_metadata(tracker_filename):
  function get_rng_state (line 364) | def get_rng_state(ckpt_format: str, tp_group: torch.distributed.ProcessG...
  class CheckpointType (line 402) | class CheckpointType(Enum):
  function _build_sharded_state_dict_metadata (line 410) | def _build_sharded_state_dict_metadata(args: Namespace, dp_cp_group: Opt...
  function save_grads (line 446) | def save_grads(save_dir, state_dict, iteration, grad_label):
  function save_checkpoint (line 480) | def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, nu...
  function _async_delete_checkpoint_impl (line 826) | def _async_delete_checkpoint_impl(save_path, iteration_to_delete, log_pr...
  function cleanup_old_non_persistent_checkpoint (line 861) | def cleanup_old_non_persistent_checkpoint(save_dir, leave_ckpt_num=1, do...
  function maybe_save_dataloader_state (line 884) | def maybe_save_dataloader_state(train_iterator, iteration, dataloader_sa...
  function generate_state_dict (line 933) | def generate_state_dict(
  function preprocess_fsdp_dtensor_state_dict (line 1011) | def preprocess_fsdp_dtensor_state_dict(args, raw_state_dict, model):
  function _transpose_first_dim (line 1033) | def _transpose_first_dim(t, num_splits, num_splits_first, model):
  function fix_query_key_value_ordering (line 1072) | def fix_query_key_value_ordering(model, checkpoint_version):
  function _get_non_persistent_iteration (line 1103) | def _get_non_persistent_iteration(non_persistent_global_dir, args, check...
  function _load_non_persistent_base_checkpoint (line 1124) | def _load_non_persistent_base_checkpoint(
  function _load_global_dist_base_checkpoint (line 1159) | def _load_global_dist_base_checkpoint(
  function _get_checkpoint_format (line 1199) | def _get_checkpoint_format(checkpoint_name, args):
  function _load_base_checkpoint (line 1225) | def _load_base_checkpoint(
  function load_args_from_checkpoint (line 1396) | def load_args_from_checkpoint(
  function load_checkpoint (line 1561) | def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg=...
  function _to_dtensor (line 2013) | def _to_dtensor(wrapped_model, model_state_dict):
  function load_biencoder_checkpoint (line 2027) | def load_biencoder_checkpoint(model, only_query_model=False,

FILE: megatron/training/config/common_config.py
  class RNGConfig (line 7) | class RNGConfig:
  class ProfilingConfig (line 25) | class ProfilingConfig:
  class DistributedInitConfig (line 71) | class DistributedInitConfig:

FILE: megatron/training/config/resilience_config.py
  class RerunStateMachineConfig (line 6) | class RerunStateMachineConfig:
  class StragglerDetectionConfig (line 28) | class StragglerDetectionConfig:

FILE: megatron/training/config/training_config.py
  class TrainingConfig (line 7) | class TrainingConfig:
  class ValidationConfig (line 91) | class ValidationConfig:
  class SchedulerConfig (line 120) | class SchedulerConfig:
  class LoggerConfig (line 199) | class LoggerConfig:
  class CheckpointConfig (line 326) | class CheckpointConfig:

FILE: megatron/training/datasets/data_samplers.py
  function build_pretraining_data_loader (line 19) | def build_pretraining_data_loader(dataset, consumed_samples):
  class MegatronPretrainingSampler (line 112) | class MegatronPretrainingSampler:
    method __init__ (line 121) | def __init__(
    method __len__ (line 151) | def __len__(self):
    method get_start_end_idx (line 154) | def get_start_end_idx(self):
    method __iter__ (line 166) | def __iter__(self):
  class HybridCPMegatronPretrainingSampler (line 181) | class HybridCPMegatronPretrainingSampler(MegatronPretrainingSampler):
    method __init__ (line 189) | def __init__(self, total_samples, consumed_samples, micro_batch_size, ...
    method __len__ (line 196) | def __len__(self):
    method get_start_end_idx_global_batch (line 199) | def get_start_end_idx_global_batch(self):
    method __iter__ (line 204) | def __iter__(self):
  class RandomSeedDataset (line 225) | class RandomSeedDataset(Dataset):
    method __init__ (line 242) | def __init__(self, dataset, seed):
    method __len__ (line 247) | def __len__(self):
    method set_epoch (line 250) | def set_epoch(self, epoch):
    method __getitem__ (line 259) | def __getitem__(self, idx):
  class MegatronPretrainingRandomSampler (line 267) | class MegatronPretrainingRandomSampler:
    method __init__ (line 275) | def __init__(
    method __len__ (line 306) | def __len__(self):
    method __iter__ (line 309) | def __iter__(self):

FILE: megatron/training/datasets/fim_dataset.py
  class GPTFIMDatasetConfig (line 16) | class GPTFIMDatasetConfig(GPTDatasetConfig):
  class GPTFIMDataset (line 38) | class GPTFIMDataset(GPTDataset):
    method __init__ (line 54) | def __init__(
    method _query_document_sample_shuffle_indices (line 104) | def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[np...
    method _fim_permute_sequence (line 184) | def _fim_permute_sequence(self, sequence, rate):
    method _fim_split_and_permute_sequence (line 198) | def _fim_split_and_permute_sequence(self, sequence):
    method _permute (line 233) | def _permute(

FILE: megatron/training/datasets/sft_dataset.py
  class SFTLowLevelDataset (line 17) | class SFTLowLevelDataset:
    method __init__ (line 35) | def __init__(self, dataset_path: str) -> None:
    method __len__ (line 44) | def __len__(self) -> int:
    method __getitem__ (line 47) | def __getitem__(self, idx: int) -> list:
  class SFTDataset (line 51) | class SFTDataset(MegatronDataset):
    method __init__ (line 54) | def __init__(
    method numel_low_level_dataset (line 66) | def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int:
    method build_low_level_dataset (line 70) | def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfi...
    method __len__ (line 73) | def __len__(self) -> int:
    method _split_conversations (line 76) | def _split_conversations(self, merged_conversations):
    method __getitem__ (line 91) | def __getitem__(self, idx: int) -> Dict[str, Any]:

FILE: megatron/training/dgrad_logging.py
  function _get_linear_types (line 15) | def _get_linear_types():
  class DataGradLogger (line 50) | class DataGradLogger:
    method __init__ (line 56) | def __init__(self, save_dir: str):
    method _make_hook (line 61) | def _make_hook(self, model_chunk_name: str, module_name: str):
    method save (line 74) | def save(self, iteration: int):
    method register_hooks (line 81) | def register_hooks(self, model: torch.nn.Module):
    method remove_hooks (line 94) | def remove_hooks(self):
  function enable_dgrad_logging (line 104) | def enable_dgrad_logging(model: torch.nn.Module, save_dir: str):
  function disable_dgrad_logging (line 112) | def disable_dgrad_logging():
  function save_dgrads (line 119) | def save_dgrads(iteration: int):

FILE: megatron/training/dist_signal_handler.py
  function get_world_size (line 6) | def get_world_size():
  function get_device (line 14) | def get_device(local_rank=None):
  function all_gather_item (line 28) | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=...
  class DistributedSignalHandler (line 50) | class DistributedSignalHandler:
    method __init__ (line 51) | def __init__(self, sig: signal.Signals = signal.SIGTERM):
    method signals_received (line 54) | def signals_received(self):
    method __enter__ (line 60) | def __enter__(self):
    method __exit__ (line 72) | def __exit__(self, type, value, tb):
    method release (line 75) | def release(self):

FILE: megatron/training/ft_integration.py
  function get_rank_monitor_client (line 67) | def get_rank_monitor_client() -> Optional[Any]:
  function setup (line 76) | def setup() -> None:
  function on_training_step_start (line 121) | def on_training_step_start() -> None:
  function on_training_step_end (line 136) | def on_training_step_end() -> None:
  function on_eval_step_start (line 146) | def on_eval_step_start() -> None:
  function on_eval_step_end (line 159) | def on_eval_step_end() -> None:
  function on_checkpointing_start (line 169) | def on_checkpointing_start() -> None:
  function on_checkpointing_end (line 176) | def on_checkpointing_end(is_async_finalization: bool) -> None:
  function on_checkpoint_loaded (line 193) | def on_checkpoint_loaded(is_local_chkpt: bool) -> None:
  function shutdown (line 207) | def shutdown() -> None:
  function _load_state_if_exists (line 219) | def _load_state_if_exists():
  function _update_timeouts (line 228) | def _update_timeouts(selected_sections, calc_out_of_section):
  function _maybe_update_timeouts (line 244) | def _maybe_update_timeouts(is_closing_ft=False):
  function maybe_setup_simulated_fault (line 300) | def maybe_setup_simulated_fault() -> None:

FILE: megatron/training/global_vars.py
  function get_args (line 27) | def get_args():
  function get_tokenizer (line 33) | def get_tokenizer():
  function get_tensorboard_writer (line 39) | def get_tensorboard_writer():
  function get_wandb_writer (line 45) | def get_wandb_writer():
  function get_one_logger (line 51) | def get_one_logger():
  function get_adlr_autoresume (line 56) | def get_adlr_autoresume():
  function get_timers (line 62) | def get_timers():
  function get_energy_monitor (line 67) | def get_energy_monitor():
  function get_signal_handler (line 72) | def get_signal_handler():
  function _set_signal_handler (line 77) | def _set_signal_handler(exit_signal):
  function set_global_variables (line 85) | def set_global_variables(args, build_tokenizer=True):
  function unset_global_variables (line 120) | def unset_global_variables():
  function set_args (line 151) | def set_args(args):
  function _build_tokenizer (line 156) | def _build_tokenizer(args):
  function rebuild_tokenizer (line 164) | def rebuild_tokenizer(args):
  function _set_tensorboard_writer (line 170) | def _set_tensorboard_writer(args):
  function _set_wandb_writer (line 190) | def _set_wandb_writer(args):
  function _set_one_logger (line 222) | def _set_one_logger(args):
  function _set_adlr_autoresume (line 246) | def _set_adlr_autoresume(args):
  function _set_timers (line 264) | def _set_timers(args):
  function _set_energy_monitor (line 270) | def _set_energy_monitor(args):
  function _ensure_var_is_initialized (line 277) | def _ensure_var_is_initialized(var, name):
  function _ensure_var_is_not_initialized (line 282) | def _ensure_var_is_not_initialized(var, name):
  function destroy_global_vars (line 286) | def destroy_global_vars():

FILE: megatron/training/initialize.py
  function initialize_megatron (line 41) | def initialize_megatron(
  function _compile_dependencies (line 179) | def _compile_dependencies():
  function _initialize_tp_communicators (line 242) | def _initialize_tp_communicators():
  function _initialize_distributed (line 315) | def _initialize_distributed(get_embedding_ranks, get_position_embedding_...
  function _init_autoresume (line 431) | def _init_autoresume():
  function _set_random_seed (line 440) | def _set_random_seed(
  function write_args_to_tensorboard (line 465) | def write_args_to_tensorboard():
  function set_jit_fusion_options (line 474) | def set_jit_fusion_options():
  function _warmup_jit_function (line 498) | def _warmup_jit_function():
  function setup_logging (line 561) | def setup_logging() -> None:

FILE: megatron/training/inprocess_restart.py
  function destroy_state (line 25) | def destroy_state():
  function inprocess_restart (line 30) | def inprocess_restart(train, args):
  function maybe_wrap_for_inprocess_restart (line 126) | def maybe_wrap_for_inprocess_restart(pretrain):
  function maybe_force_nccl_backend_init (line 148) | def maybe_force_nccl_backend_init(device_id):

FILE: megatron/training/log_handler.py
  class CustomHandler (line 9) | class CustomHandler(StreamHandler):
    method __init__ (line 15) | def __init__(self):
    method filter (line 18) | def filter(self, record: LogRecord) -> bool:

FILE: megatron/training/one_logger_utils.py
  function get_timestamp_in_ms (line 9) | def get_timestamp_in_ms():
  function on_train_start (line 18) | def on_train_start(iteration, consumed_train_samples, train_samples, seq...
  function _produce_e2e_metrics (line 76) | def _produce_e2e_metrics(log_throughput=False, throughput=None):
  function track_e2e_metrics (line 209) | def track_e2e_metrics(log_throughput=False, throughput=None):
  function on_save_checkpoint_start (line 226) | def on_save_checkpoint_start(async_save):
  function on_pretrain_start (line 265) | def on_pretrain_start():
  function track_config_flags (line 300) | def track_config_flags(train_iters, skip_train, do_train, do_valid, do_t...
  function on_save_checkpoint_success (line 322) | def on_save_checkpoint_success(productive_metrics, async_save):
  function on_save_checkpoint_end (line 366) | def on_save_checkpoint_end(save_checkpoint_duration, current_iteration, ...
  function track_app_tag (line 437) | def track_app_tag(batch_size, world_size, seq_length):
  function finish (line 456) | def finish():

FILE: megatron/training/theoretical_memory_usage.py
  function compute_weight_and_optimizer_memory (line 12) | def compute_weight_and_optimizer_memory(args, verbose=False):
  function compute_activation_memory (line 192) | def compute_activation_memory(args, num_microbatches, verbose=False):
  function compute_activation_memory_without_sp (line 266) | def compute_activation_memory_without_sp(args, num_microbatches, verbose...
  function report_theoretical_memory (line 340) | def report_theoretical_memory(args, num_microbatches=None, verbose=False):

FILE: megatron/training/training.py
  function set_startup_timestamps (line 16) | def set_startup_timestamps(program_start=None, main_entry=None):
  function destroy_global_state (line 209) | def destroy_global_state():
  function print_datetime (line 218) | def print_datetime(string, override_timestamp=None):
  function num_floating_point_operations (line 228) | def num_floating_point_operations(args, batch_size):
  function get_start_time_from_progress_log (line 630) | def get_start_time_from_progress_log():
  function preprocess_common_state_dict (line 673) | def preprocess_common_state_dict(common_state_dict):
  function pretrain (line 718) | def pretrain(
  function update_train_iters (line 1178) | def update_train_iters(args):
  function get_model (line 1211) | def get_model(model_provider_func, model_type=ModelType.encoder_or_decod...
  function get_optimizer_param_scheduler (line 1404) | def get_optimizer_param_scheduler(optimizer):
  function get_megatron_optimizer_config (line 1460) | def get_megatron_optimizer_config(args: Any) -> OptimizerConfig:
  function setup_model_and_optimizer (line 1488) | def setup_model_and_optimizer(
  function dummy_train_step (line 1661) | def dummy_train_step(data_iterator):
  function train_step (line 1672) | def train_step(forward_step_func, data_iterator, model, optimizer, opt_p...
  function training_log (line 1842) | def training_log(
  function compute_throughputs_and_append_to_progress_log (line 2189) | def compute_throughputs_and_append_to_progress_log(iteration, num_floati...
  function enable_forward_pre_hook (line 2222) | def enable_forward_pre_hook(model_chunks):
  function disable_forward_pre_hook (line 2228) | def disable_forward_pre_hook(model_chunks, param_sync=True):
  function force_param_sync (line 2234) | def force_param_sync(model_chunks: list[DDP]) -> None:
  function save_checkpoint_and_time (line 2243) | def save_checkpoint_and_time(
  function post_training_step_callbacks (line 2325) | def post_training_step_callbacks(
  function checkpoint_and_decide_exit (line 2392) | def checkpoint_and_decide_exit(
  function train (line 2504) | def train(
  function evaluate (line 3163) | def evaluate(
  function evaluate_and_print_results (line 3328) | def evaluate_and_print_results(
  function cyclic_iter (line 3420) | def cyclic_iter(iter):
  function get_train_valid_test_num_samples (line 3426) | def get_train_valid_test_num_samples():
  function build_train_valid_test_datasets (line 3460) | def build_train_valid_test_datasets(build_train_valid_test_datasets_prov...
  function build_train_valid_test_data_loaders (line 3471) | def build_train_valid_test_data_loaders(build_train_valid_test_datasets_...
  function build_train_valid_test_data_iterators (line 3555) | def build_train_valid_test_data_iterators(build_train_valid_test_dataset...
  function should_disable_forward_pre_hook (line 3629) | def should_disable_forward_pre_hook(args):

FILE: megatron/training/utils.py
  function calc_params_l2_norm (line 48) | def calc_params_l2_norm(model, force_create_fp32_copy=False):
  function calc_dtensor_params_l2_norm (line 197) | def calc_dtensor_params_l2_norm(params):
  function average_losses_across_data_parallel_group (line 234) | def average_losses_across_data_parallel_group(losses):
  function reduce_max_stat_across_model_parallel_group (line 243) | def reduce_max_stat_across_model_parallel_group(stat: float) -> float | ...
  function logical_and_across_model_parallel_group (line 264) | def logical_and_across_model_parallel_group(input: bool) -> bool:
  function report_memory (line 279) | def report_memory(name):
  function print_params_min_max_norm (line 294) | def print_params_min_max_norm(optimizer, iteration):
  function check_adlr_autoresume_termination (line 313) | def check_adlr_autoresume_termination(iteration, model, optimizer, opt_p...
  function get_ltor_masks_and_position_ids (line 331) | def get_ltor_masks_and_position_ids(data,
  function print_rank_0 (line 394) | def print_rank_0(message, rank=None):
  function warn_rank_0 (line 404) | def warn_rank_0(message, rank=None):
  function is_rank0 (line 414) | def is_rank0():
  function is_last_rank (line 419) | def is_last_rank():
  function print_rank_last (line 425) | def print_rank_last(message):
  function is_hybrid_model (line 434) | def is_hybrid_model(args):
  function is_first_or_last_pipeline_stage (line 439) | def is_first_or_last_pipeline_stage(vp_stage):
  function get_device_arch_version (line 451) | def get_device_arch_version():
  function append_to_progress_log (line 456) | def append_to_progress_log(string, barrier=True):
  function get_blend_and_blend_per_split (line 474) | def get_blend_and_blend_per_split(args):
  function get_batch_on_this_tp_rank (line 522) | def get_batch_on_this_tp_rank(data_iterator, mtp_on_this_rank: bool = Fa...
  function update_use_dist_ckpt (line 724) | def update_use_dist_ckpt(args):
  function to_empty_if_meta_device (line 728) | def to_empty_if_meta_device(module: torch.nn.Module, *, device: torch.de...
  function get_nvtx_range (line 756) | def get_nvtx_range():

FILE: megatron/training/wandb_utils.py
  function _get_wandb_artifact_tracker_filename (line 10) | def _get_wandb_artifact_tracker_filename(save_dir: str) -> Path:
  function _get_artifact_name_and_version (line 15) | def _get_artifact_name_and_version(save_dir: Path, checkpoint_path: Path...
  function on_save_checkpoint_success (line 19) | def on_save_checkpoint_success(checkpoint_path: str, tracker_filename: s...
  function on_load_checkpoint_success (line 44) | def on_load_checkpoint_success(checkpoint_path: str, load_dir: str) -> N...

FILE: megatron/training/yaml_arguments.py
  function env_constructor (line 25) | def env_constructor(loader, node):
  function validate_yaml (line 41) | def validate_yaml(args, defaults={}):
  function _print_args (line 339) | def _print_args(title, args):
  function core_config_from_args (line 353) | def core_config_from_args(args, dataclass=TransformerConfig):
  function _check_arg_is_not_none (line 374) | def _check_arg_is_not_none(args, arg):
  function core_transformer_config_from_yaml (line 377) | def core_transformer_config_from_yaml(args, transfomer_key = "language_m...
  function load_yaml (line 416) | def load_yaml(yaml_path):

FILE: model_provider.py
  function model_provider (line 24) | def model_provider(
  function count_parameters_in_layer (line 66) | def count_parameters_in_layer(model, layer_name):

FILE: pretrain_bert.py
  function model_provider (line 29) | def model_provider(pre_process=True, post_process=True, vp_stage=None, c...
  function get_batch (line 72) | def get_batch(data_iterator):
  function loss_func (line 98) | def loss_func(loss_mask, sentence_order, output_tensor):
  function forward_step (line 123) | def forward_step(data_iterator, model):
  function train_valid_test_datasets_provider (line 144) | def train_valid_test_datasets_provider(train_val_test_num_samples, vp_st...

FILE: pretrain_gpt.py
  function get_batch (line 65) | def get_batch(data_iterator, vp_stage: Optional[int] = None):
  function loss_func (line 170) | def loss_func(
  function forward_step (line 232) | def forward_step(data_iterator, model: GPTModel, return_schedule_plan: b...
  function is_dataset_built_on_rank (line 271) | def is_dataset_built_on_rank(vp_stage=None, is_packed_sequence=False):
  function core_gpt_dataset_config_from_args (line 284) | def core_gpt_dataset_config_from_args(args):
  function train_valid_test_datasets_provider (line 349) | def train_valid_test_datasets_provider(train_val_test_num_samples, vp_st...
  function get_embedding_ranks (line 384) | def get_embedding_ranks(pp_ranks: List[int]):

FILE: pretrain_mamba.py
  function get_batch (line 74) | def get_batch(data_iterator, vp_stage=None):
  function loss_func (line 152) | def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, mode...
  function forward_step (line 210) | def forward_step(data_iterator, model: MambaModel):
  function is_dataset_built_on_rank (line 267) | def is_dataset_built_on_rank(vp_stage=None, is_packed_sequence=False):
  function core_gpt_dataset_config_from_args (line 276) | def core_gpt_dataset_config_from_args(args):
  function train_valid_test_datasets_provider (line 313) | def train_valid_test_datasets_provider(train_val_test_num_samples, vp_st...

FILE: pretrain_t5.py
  function model_provider (line 67) | def model_provider(
  function get_batch (line 149) | def get_batch(data_iterator, use_local):
  function forward_step (line 179) | def forward_step(data_iterator, model: T5Model):
  function train_valid_test_datasets_provider (line 206) | def train_valid_test_datasets_provider(train_val_test_num_samples: int):
  function t5_embedding_ranks (line 255) | def t5_embedding_ranks(pp_ranks):
  function t5_position_embedding_ranks (line 269) | def t5_position_embedding_ranks(pp_ranks):

FILE: pretrain_vlm.py
  function model_provider (line 45) | def model_provider(
  function train_valid_test_datasets_provider (line 213) | def train_valid_test_datasets_provider(train_val_test_num_samples):
  function _preprocess_data_for_llava (line 253) | def _preprocess_data_for_llava(data):
  function get_batch (line 289) | def get_batch(data_iterator):
  function forward_step (line 364) | def forward_step(data_iterator, model: LLaVAModel):
  function add_vlm_extra_args (line 389) | def add_vlm_extra_args(parser):
  function llava_embedding_ranks (line 417) | def llava_embedding_ranks(pp_ranks):
  function llava_position_embedding_ranks (line 431) | def llava_position_embedding_ranks(pp_ranks):

FILE: scripts/check_api_backwards_compatibility.py
  function has_exempt_decorator (line 66) | def has_exempt_decorator(obj: Object) -> bool:
  function get_filtered_paths (line 87) | def get_filtered_paths(package: Object, package_name: str) -> set:
  function strip_ansi_codes (line 139) | def strip_ansi_codes(text):
  function get_object_path (line 158) | def get_object_path(change) -> str:
  function should_skip_change (line 224) | def should_skip_change(change, filtered_paths: set) -> bool:
  function main (line 275) | def main():

FILE: tasks/data_utils.py
  function clean_text (line 9) | def clean_text(text):
  function build_sample (line 20) | def build_sample(ids, types, paddings, label, unique_id):
  function build_tokens_types_paddings_from_text (line 35) | def build_tokens_types_paddings_from_text(text_a, text_b,
  function build_tokens_types_paddings_from_ids (line 49) | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq...

FILE: tasks/eval_utils.py
  function accuracy_func_provider (line 19) | def accuracy_func_provider(single_dataset_provider):
  function calculate_correct_answers (line 65) | def calculate_correct_answers(name, model, dataloader,

FILE: tasks/finetune_utils.py
  function process_batch (line 26) | def process_batch(batch):
  function cross_entropy_loss_func (line 40) | def cross_entropy_loss_func(labels, output_tensor):
  function _cross_entropy_forward_step (line 53) | def _cross_entropy_forward_step(batch, model):
  function build_data_loader (line 72) | def build_data_loader(dataset, micro_batch_size, num_workers, drop_last,
  function _build_infinite_size_dataloader (line 95) | def _build_infinite_size_dataloader(dataloader):
  function _build_train_valid_dataloaders (line 106) | def _build_train_valid_dataloaders(train_dataset, valid_dataset,
  function _train (line 145) | def _train(model, optimizer, opt_param_scheduler, forward_step,
  function finetune (line 239) | def finetune(train_valid_datasets_provider, model_provider,

FILE: tests/functional_tests/python_test_utils/common.py
  class TypeOfTestResult (line 22) | class TypeOfTestResult(enum.Enum):
  class Test (line 27) | class Test(pydantic.BaseModel):
  class NotApproximateError (line 31) | class NotApproximateError(Exception):
  class NotDeterminsticError (line 35) | class NotDeterminsticError(Exception):
  class ApproximateTest (line 39) | class ApproximateTest(Test):
    method type_of_test_result (line 44) | def type_of_test_result(self) -> TypeOfTestResult:
    method error_message (line 47) | def error_message(self, metric_name: str) -> NotApproximateError:
  class DeterministicTest (line 51) | class DeterministicTest(Test):
    method rtol (line 53) | def rtol(self) -> float:
    method atol (line 57) | def atol(self) -> Union[int, float]:
    method type_of_test_result (line 61) | def type_of_test_result(self) -> TypeOfTestResult:
    method error_message (line 64) | def error_message(self, metric_name: str) -> NotDeterminsticError:
  class GoldenValueMetric (line 68) | class GoldenValueMetric(pydantic.BaseModel):
    method __repr__ (line 74) | def __repr__(self):
  class GoldenValues (line 78) | class GoldenValues(pydantic.RootModel):
  class MissingTensorboardLogsError (line 82) | class MissingTensorboardLogsError(Exception):
  class UndefinedMetricError (line 86) | class UndefinedMetricError(Exception):
  class SkipMetricError (line 90) | class SkipMetricError(Exception):
  function read_tb_logs_as_list (line 94) | def read_tb_logs_as_list(
  function read_golden_values_from_json (line 161) | def read_golden_values_from_json(
  function _filter_checks (line 172) | def _filter_checks(
  function pipeline (line 178) | def pipeline(

FILE: tests/functional_tests/python_test_utils/compute_golden_statistics.py
  function find_result_json_files (line 46) | def find_result_json_files(results_dir: str, workspace_root: Optional[st...
  function _extract_result_path_from_log (line 92) | def _extract_result_path_from_log(out_file: Path, workspace_root: str) -...
  function _find_json_files_directly (line 151) | def _find_json_files_directly(results_dir: str) -> List[str]:
  function load_result_file (line 178) | def load_result_file(filepath: str) -> Optional[Dict[str, Any]]:
  function _detect_result_format (line 196) | def _detect_result_format(data: Dict[str, Any]) -> str:
  function _is_valid_numeric (line 221) | def _is_valid_numeric(value) -> bool:
  function _to_float (line 235) | def _to_float(value) -> Optional[float]:
  function _aggregate_training_results (line 251) | def _aggregate_training_results(
  function _aggregate_inference_results (line 296) | def _aggregate_inference_results(
  function aggregate_results (line 375) | def aggregate_results(result_files: List[str]) -> Dict[str, Dict[str, Li...
  function compute_statistics (line 413) | def compute_statistics(aggregated: Dict[str, Dict[str, List[float]]]) ->...
  function compute_recommended_tolerances (line 467) | def compute_recommended_tolerances(
  function format_summary (line 617) | def format_summary(stats: Dict[str, Any], tolerances: Dict[str, Dict[str...
  function main (line 659) | def main():

FILE: tests/functional_tests/python_test_utils/conftest.py
  function pytest_addoption (line 6) | def pytest_addoption(parser):
  function compare_approximate_results (line 32) | def compare_approximate_results(request) -> bool:
  function golden_values_path (line 38) | def golden_values_path(request):
  function golden_values (line 44) | def golden_values(request):
  function actual_values (line 50) | def actual_values(request):
  function actual_values_first_run (line 56) | def actual_values_first_run(request):
  function actual_values_second_run (line 64) | def actual_values_second_run(request):
  function scope (line 72) | def scope(request):
  function train_iters (line 78) | def train_iters(request):
  function tensorboard_logs (line 84) | def tensorboard_logs(request, train_iters):
  function test_values_path (line 92) | def test_values_path(request):
  function tensorboard_path (line 97) | def tensorboard_path(request):
  function model_config_path (line 103) | def model_config_path(request):

FILE: tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py
  function collect_train_test_metrics (line 32) | def collect_train_test_metrics(

FILE: tests/functional_tests/python_test_utils/test_grpo_training_loop.py
  function validate_with_tolerance (line 31) | def validate_with_tolerance(
  function test_grpo_training_loop (line 81) | def test_grpo_training_loop(

FILE: tests/functional_tests/python_test_utils/test_inference_regular_pipeline.py
  function _median_as_float (line 24) | def _median_as_float(value):
  function _bytes_to_gib (line 39) | def _bytes_to_gib(num_bytes: float) -> float:
  function test_inference_pipeline (line 43) | def test_inference_pipeline(

FILE: tests/functional_tests/python_test_utils/test_optimizer_grads_match.py
  function _as_iter (line 13) | def _as_iter(x: TensorLike):
  function _fro_norm (line 17) | def _fro_norm(x: TensorLike) -> torch.Tensor:
  function machine_epsilon_for_dtype (line 26) | def machine_epsilon_for_dtype(dtype: torch.dtype) -> float:
  function relative_grad_diff (line 41) | def relative_grad_diff(g_hat: TensorLike, g_ref: TensorLike, eps_den: fl...
  function expected_rel_bound (line 59) | def expected_rel_bound(
  function check_gradient (line 80) | def check_gradient(
  function _filter_optimizer_tensors (line 104) | def _filter_optimizer_tensors(plain_tensors: Dict[str, torch.Tensor]) ->...
  function assert_grads_close (line 111) | def assert_grads_close(left: torch.Tensor, right: torch.Tensor):
  function unshard_row_parallel_state (line 144) | def unshard_row_parallel_state(saved_state, out_features, in_features, tp):
  function _assert_optimizer_tensors_equal (line 154) | def _assert_optimizer_tensors_equal(
  function load_dist_checkpoint_pt (line 252) | def load_dist_checkpoint_pt(
  function test_optimizer_states_match (line 291) | def test_optimizer_states_match(checkpoint_dirs):
  function main (line 327) | def main():

FILE: tests/functional_tests/python_test_utils/test_pretraining_regular_pipeline.py
  function test_regular_pipeline (line 25) | def test_regular_pipeline(

FILE: tests/functional_tests/python_test_utils/test_pretraining_resume_checkpoint_pipeline.py
  function test_resume_checkpoint_pipeline (line 14) | def test_resume_checkpoint_pipeline(

FILE: tests/functional_tests/test_cases/common/ckpt_converter/__main__.py
  function is_model_parallel_rank_0 (line 39) | def is_model_parallel_rank_0():
  function broadcast (line 46) | def broadcast(item):
  class TempSharedDir (line 56) | class TempSharedDir:
    method __enter__ (line 59) | def __enter__(self):
    method __exit__ (line 67) | def __exit__(self, exc_type, exc_value, exc_tb):
  class ModelParallelState (line 78) | class ModelParallelState(_ModelParallelState):
    method __new__ (line 81) | def __new__(cls, tp=1, pp=1, ep=1):
    method __str__ (line 84) | def __str__(self):
  class ModelMeta (line 88) | class ModelMeta:
    method __init__ (line 97) | def __init__(self, format: str, mp: ModelParallelState, transformer_im...
    method __str__ (line 112) | def __str__(self):
  class Pipeline (line 116) | class Pipeline:
    method __init__ (line 131) | def __init__(self, src: ModelMeta, dst: ModelMeta):
    method __str__ (line 138) | def __str__(self):
    method get_model_argv (line 141) | def get_model_argv(self):
    method get_converter_model_type (line 145) | def get_converter_model_type(self):
    method get_meta (line 149) | def get_meta(self, key):
    method init_args_and_model (line 154) | def init_args_and_model(self, key):
    method build_model (line 224) | def build_model():
    method get_input_ids (line 234) | def get_input_ids():
    method get_batch (line 256) | def get_batch(input_ids):
    method forward_step (line 316) | def forward_step(cls, orig_input_ids: T.Iterator, model: torch.nn.Modu...
    method forward_model (line 346) | def forward_model(cls, models, orig_input_ids):
    method rand_init_model_params (line 376) | def rand_init_model_params(self, key, models):
    method save_checkpoint (line 399) | def save_checkpoint(self):
    method load_checkpoint (line 422) | def load_checkpoint(self, orig_input_ids):
    method convert_checkpoint (line 444) | def convert_checkpoint(self):
    method run (line 488) | def run(self):
  class GPTPipeline (line 544) | class GPTPipeline(Pipeline):
    method __init__ (line 553) | def __init__(self, src: ModelMeta, dst: ModelMeta, num_moe_experts: T....
    method __str__ (line 558) | def __str__(self):
    method get_model_argv (line 564) | def get_model_argv(self):
    method get_converter_model_type (line 590) | def get_converter_model_type(self):
  class LLaVAPipeline (line 594) | class LLaVAPipeline(Pipeline):
    method __init__ (line 595) | def __init__(
    method __str__ (line 603) | def __str__(self):
    method get_model_argv (line 610) | def get_model_argv(self):
    method get_test_image (line 650) | def get_test_image():
    method get_input_ids (line 656) | def get_input_ids():
    method forward_step (line 681) | def forward_step(cls, orig_input_ids: T.Iterator, model: torch.nn.Modu...
    method forward_model (line 711) | def forward_model(cls, models, orig_input_ids):
    method build_model (line 743) | def build_model():
    method get_converter_model_type (line 753) | def get_converter_model_type(self):
    method init_args_and_model (line 756) | def init_args_and_model(self, key):
  function get_gpt_pipelines (line 828) | def get_gpt_pipelines():
  function get_moe_pipelines (line 843) | def get_moe_pipelines():
  function get_llava_pipelines (line 851) | def get_llava_pipelines():
  function test_all_pipelines (line 874) | def test_all_pipelines():

FILE: tests/functional_tests/test_cases/common/moe_perf/__main__.py
  function _build_transformer_config (line 43) | def _build_transformer_config(case: MoEPerformanceCase) -> TransformerCo...
  function _resolve_moe_submodules (line 89) | def _resolve_moe_submodules(case: MoEPerformanceCase):
  function _load_baselines (line 95) | def _load_baselines() -> Dict[str, Dict[str, float]]:
  function _persist_baselines (line 102) | def _persist_baselines(data: Dict[str, Dict[str, float]]) -> None:
  function _serialize_metrics (line 109) | def _serialize_metrics(metrics: Dict[str, float]) -> Dict[str, float]:
  function _assert_within_baseline (line 120) | def _assert_within_baseline(
  function _benchmark_moe_layer (line 179) | def _benchmark_moe_layer(layer: MoELayer, case: MoEPerformanceCase):
  function _maybe_update_baseline (line 271) | def _maybe_update_baseline(
  function _prepare_moe_layer (line 292) | def _prepare_moe_layer(case: MoEPerformanceCase) -> MoELayer:
  function _check_env (line 301) | def _check_env():
  function _check_dependencies (line 309) | def _check_dependencies(case: MoEPerformanceCase):
  function test_moe_layer_performance (line 325) | def test_moe_layer_performance(perf_case: MoEPerformanceCase, debug_mode...

FILE: tests/functional_tests/test_cases/common/moe_perf/test_cases.py
  class MoEModelConfig (line 9) | class MoEModelConfig:
  class MoEPerformanceCase (line 29) | class MoEPerformanceCase:
    method input_dtype (line 62) | def input_dtype(self) -> torch.dtype:
    method is_current_platform (line 65) | def is_current_platform(self) -> bool: