Full Code of NVIDIA/Megatron-LM for AI

main f456199700bc cached

2310 files

32.9 MB

4.1M tokens

5871 symbols

1 request

Copy disabled (too large) Download .txt

Showing preview only (16,329K chars total). Download the full file to get everything.

Repository: NVIDIA/Megatron-LM
Branch: main
Commit: f456199700bc
Files: 2310
Total size: 32.9 MB

Directory structure:
gitextract_32wjwf3g/

├── .coderabbit.yaml
├── .flake8
├── .github/
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   ├── feature_request.md
│   │   ├── question.md
│   │   └── regression.md
│   ├── actions/
│   │   ├── action.yml
│   │   └── check-nvidia-sso-membership/
│   │       └── action.yml
│   ├── copy-pr-bot.yaml
│   ├── oncall_schedule.json
│   ├── pull_request_template.md
│   ├── scripts/
│   │   ├── oncall_manager.py
│   │   ├── readme.sh
│   │   └── sync_team_usergroups.py
│   └── workflows/
│       ├── _build_test_publish_wheel.yml
│       ├── _release_library.yml
│       ├── _update_dependencies.yml
│       ├── auto-assign-milestone.yml
│       ├── auto-reminder-bot.yml
│       ├── auto-swap-labels.yml
│       ├── auto-update-copy-pr-bot.yml
│       ├── build-docs.yml
│       ├── build-test-publish-wheel.yml
│       ├── cherry-pick-release-commit.yml
│       ├── cicd-approve-test-queue.yml
│       ├── cicd-main.yml
│       ├── claude-complexity-label.yml
│       ├── claude_review.yml
│       ├── close-inactive-issue-pr.yml
│       ├── community-bot.yml
│       ├── config/
│       │   └── changelog-config.json
│       ├── copyright-check.yml
│       ├── dependabot.yml
│       ├── force-draft-pr.yml
│       ├── install-test.yml
│       ├── multi-approval-bot.yml
│       ├── oncall-assign.yml
│       ├── oncall-rotation.yml
│       ├── release-docs.yml
│       ├── release-freeze.yml
│       ├── release-nightly-docs.yml
│       ├── release.yaml
│       ├── review-trigger.yml
│       ├── sync-team-usergroups.yml
│       └── trigger-mbridge-tests.yml
├── .gitignore
├── .gitlab/
│   ├── labeler-config.yml
│   ├── scripts/
│   │   ├── build.sh
│   │   ├── check_imports.py
│   │   └── fetch-legacy-suite.sh
│   └── stages/
│       ├── 00.pre.yml
│       ├── 01.build.yml
│       ├── 02.test.yml
│       ├── 03.integration-tests.yml
│       ├── 04.functional-tests.yml
│       └── 05.publish.yml
├── .gitlab-ci.yml
├── .pre-commit-config.yaml
├── .pylintrc
├── .python-version
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── codecov.yml
├── docker/
│   ├── .ngc_version.dev
│   ├── .ngc_version.lts
│   ├── Dockerfile.ci.dev
│   ├── Dockerfile.ci.nemo
│   ├── Dockerfile.linting
│   ├── common/
│   │   ├── install.sh
│   │   └── install_source_wheels.sh
│   └── patches/
│       └── deepep.patch
├── docs/
│   ├── add_copyright_header.py
│   ├── advanced/
│   │   └── index.md
│   ├── api-backwards-compatibility-check.md
│   ├── api-guide/
│   │   ├── core/
│   │   │   ├── datasets.md
│   │   │   ├── dist_checkpointing.md
│   │   │   ├── dist_checkpointing.strategies.md
│   │   │   ├── distributed.md
│   │   │   ├── fusions.md
│   │   │   ├── index.md
│   │   │   ├── pipeline_parallel.md
│   │   │   ├── tensor_parallel.md
│   │   │   └── transformer.md
│   │   ├── index.md
│   │   ├── internal/
│   │   │   ├── index.md
│   │   │   ├── num_microbatches_calculator.md
│   │   │   └── optimizer_param_scheduler.md
│   │   ├── models/
│   │   │   ├── index.md
│   │   │   ├── models.bert.md
│   │   │   ├── models.gpt.md
│   │   │   ├── models.md
│   │   │   └── models.t5.md
│   │   └── router_replay.md
│   ├── autodoc2_docstrings_parser.py
│   ├── broken_links_false_positives.json
│   ├── conf.py
│   ├── developer/
│   │   ├── contribute.md
│   │   ├── generate_docs.md
│   │   ├── oncall.md
│   │   └── submit.md
│   ├── discussions/
│   │   ├── README.md
│   │   └── megatron-fsdp-user-guide/
│   │       ├── example-scripts/
│   │       │   ├── sbatch_checkpoint_convert.sh
│   │       │   └── sbatch_mfsdp_deepseek_v3.sh
│   │       └── megatron-fsdp-user-guide.md
│   ├── documentation.md
│   ├── get-started/
│   │   ├── install.md
│   │   ├── overview.md
│   │   ├── quickstart.md
│   │   └── releasenotes.md
│   ├── index.md
│   ├── llama_mistral.md
│   ├── models/
│   │   ├── index.md
│   │   ├── llms.md
│   │   └── multimodal.md
│   ├── project.json
│   ├── user-guide/
│   │   ├── data-preparation.md
│   │   ├── features/
│   │   │   ├── context_parallel.md
│   │   │   ├── custom_fsdp.md
│   │   │   ├── dist_optimizer.md
│   │   │   ├── fine_grained_activation_offloading.md
│   │   │   ├── index.md
│   │   │   ├── megatron_energon.md
│   │   │   ├── megatron_rl.md
│   │   │   ├── moe.md
│   │   │   ├── multi_latent_attention.md
│   │   │   ├── multi_token_prediction.md
│   │   │   ├── optimizer_cpu_offload.md
│   │   │   ├── pipeline_parallel_layout.md
│   │   │   └── tokenizers.md
│   │   ├── index.md
│   │   ├── msc_integration.md
│   │   ├── parallelism-guide.md
│   │   └── training-examples.md
│   └── versions1.json
├── examples/
│   ├── __init__.py
│   ├── academic_paper_scripts/
│   │   ├── detxoify_lm/
│   │   │   ├── README.md
│   │   │   ├── annotations/
│   │   │   │   ├── filter-selfgeneration.py
│   │   │   │   ├── perspective_api_annotate.py
│   │   │   │   └── preprocess.sh
│   │   │   ├── finetune_gpt.py
│   │   │   ├── finetune_gpt_distributed-1.3b.sh
│   │   │   ├── generate-1.3b.sh
│   │   │   ├── generate_samples_gpt.py
│   │   │   ├── perspective_api.py
│   │   │   └── self_generation/
│   │   │       └── selfgenerate-1.3b-unconditional.sh
│   │   ├── msdp/
│   │   │   ├── README.md
│   │   │   ├── data_processing.sh
│   │   │   ├── eval_knwl_generation.sh
│   │   │   ├── eval_resp_generation.sh
│   │   │   ├── prep_resp_gen.sh
│   │   │   ├── prompt_knwl_gen.sh
│   │   │   └── prompt_resp_gen.sh
│   │   └── sc21/
│   │       ├── CONFIG.sh
│   │       ├── README.md
│   │       ├── SBATCH.sh
│   │       ├── SRUN.sh
│   │       ├── run_figure_11.sh
│   │       ├── run_figure_12.sh
│   │       ├── run_figure_13.sh
│   │       ├── run_figure_14.sh
│   │       ├── run_figure_15.sh
│   │       ├── run_figure_16.sh
│   │       ├── run_figure_17.sh
│   │       ├── run_figure_18.sh
│   │       └── run_table_1.sh
│   ├── bert/
│   │   ├── README.md
│   │   └── train_bert_340m_distributed.sh
│   ├── export/
│   │   ├── README.md
│   │   └── trtllm_export/
│   │       ├── README.md
│   │       ├── distributed_export/
│   │       │   └── gpt_distributed_gpu_export.py
│   │       └── single_device_export/
│   │           └── gpt_single_device_cpu_export.py
│   ├── gpt3/
│   │   ├── README.md
│   │   ├── gpt_config.yaml
│   │   └── train_gpt3_175b_distributed.sh
│   ├── gptoss/
│   │   ├── 01_convert_from_hf.py
│   │   ├── 02_train.sh
│   │   ├── 03_convert_to_hf.py
│   │   └── README.md
│   ├── inference/
│   │   ├── README.md
│   │   ├── gpt/
│   │   │   ├── gpt_dynamic_inference.py
│   │   │   ├── gpt_dynamic_inference_12b.sh
│   │   │   ├── gpt_dynamic_inference_357m.sh
│   │   │   ├── gpt_dynamic_inference_with_coordinator.py
│   │   │   ├── gpt_static_inference.py
│   │   │   └── utils.py
│   │   ├── llama_mistral/
│   │   │   ├── huggingface_reference.py
│   │   │   ├── run_static_inference_llama4_scout.sh
│   │   │   ├── run_text_generation_llama3.1.sh
│   │   │   ├── run_text_generation_llama3.sh
│   │   │   └── run_text_generation_mistral.sh
│   │   ├── run_text_generation_server_345M.sh
│   │   ├── run_text_generation_server_345M_8_tensor_parallel.sh
│   │   └── t5/
│   │       └── simple_t5_batch_inference.py
│   ├── llama/
│   │   ├── README.md
│   │   └── train_llama3_8b_h100_fp8.sh
│   ├── mamba/
│   │   ├── .gitignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── run_text_gen_server_8b.sh
│   │   ├── run_text_gen_server_8b_gpt3.sh
│   │   └── train.sh
│   ├── mimo/
│   │   ├── __init__.py
│   │   ├── avlm_inference.py
│   │   ├── configs/
│   │   │   ├── llava_avlm.py
│   │   │   ├── llava_vlm.py
│   │   │   └── mock.py
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── avlm_sample_loader.py
│   │   │   ├── energon_avlm_task_encoder.py
│   │   │   ├── energon_vlm_task_encoder.py
│   │   │   ├── mock.py
│   │   │   ├── prepare_video_llava_data.py
│   │   │   └── utils/
│   │   │       └── calculate_audio_tokens.py
│   │   ├── model_providers/
│   │   │   ├── __init__.py
│   │   │   ├── hf_clip_encoder.py
│   │   │   ├── hf_whisper_encoder.py
│   │   │   ├── llava_avlm.py
│   │   │   ├── llava_vlm.py
│   │   │   └── mock.py
│   │   ├── scripts/
│   │   │   ├── run_avlm_train.sh
│   │   │   ├── run_mock_train.sh
│   │   │   ├── run_video_vlm_train.sh
│   │   │   └── run_vlm_train.sh
│   │   ├── train.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── data_helpers.py
│   │       ├── logging.py
│   │       └── model_helpers.py
│   ├── mixtral/
│   │   ├── README.md
│   │   └── train_mixtral_8x7b_distributed.sh
│   ├── multimodal/
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── combine_lm_vision_checkpoints.sh
│   │   ├── combine_state_dicts.py
│   │   ├── config.py
│   │   ├── convert_llava_pretrain_to_wds.py
│   │   ├── dataloader_provider.py
│   │   ├── dataset_helpers.py
│   │   ├── energon_util.py
│   │   ├── evaluation/
│   │   │   ├── evaluate_ai2d.py
│   │   │   ├── evaluate_chartqa.py
│   │   │   ├── evaluate_coco.py
│   │   │   ├── evaluate_infovqa.py
│   │   │   ├── evaluate_mathvista.py
│   │   │   ├── evaluate_mmmu.py
│   │   │   ├── evaluate_ocrbench.py
│   │   │   ├── evaluate_ocrbench_v2.py
│   │   │   ├── evaluate_rd_tablebench.py
│   │   │   ├── evaluate_realworldqa.py
│   │   │   ├── evaluate_spdocvqa.py
│   │   │   ├── evaluate_textvqa.py
│   │   │   ├── evaluate_video_motionbench.py
│   │   │   ├── evaluate_video_mvbench.py
│   │   │   ├── evaluate_video_phys_game_bench.py
│   │   │   ├── evaluate_vqav2.py
│   │   │   ├── evaluation_datasets.py
│   │   │   └── mmmu_utils.py
│   │   ├── image_processing.py
│   │   ├── layer_scaling.py
│   │   ├── layer_specs.py
│   │   ├── llama_3p1_nemotron_nano_vl_8b_v1/
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   ├── pretraining_llama_3p1_nemotron_nano_vl_8b_v1.sh
│   │   │   ├── sft_llama_3p1_nemotron_nano_vl_8b_v1.sh
│   │   │   └── text_generation.sh
│   │   ├── manual_prompts.json
│   │   ├── model.py
│   │   ├── model_converter/
│   │   │   ├── clip_converter.py
│   │   │   ├── internvit_converter.py
│   │   │   ├── radio_converter.py
│   │   │   ├── siglip_converter.py
│   │   │   └── vision_model_tester.py
│   │   ├── multimodal_args.py
│   │   ├── nvlm/
│   │   │   ├── README.md
│   │   │   ├── internvit.py
│   │   │   ├── nvlm_prompts.json
│   │   │   ├── pp_checkpoint_converter.py
│   │   │   ├── pretrain_blend.yaml
│   │   │   ├── pretrain_qwen20_72b_internvit_6b.sh
│   │   │   ├── pretrain_yi_34b_internvit_6b.sh
│   │   │   ├── run_text_generation_qwen20_72b_internvit_6b.sh
│   │   │   ├── run_text_generation_qwen25_7b_internvit_video.sh
│   │   │   ├── run_text_generation_qwen25_7b_siglip.sh
│   │   │   ├── run_text_generation_yi_34b_internvit_6b.sh
│   │   │   ├── sft_34b_internvit.sh
│   │   │   ├── sft_blend.yaml
│   │   │   ├── sft_qwen20_72b_internvit_6b.sh
│   │   │   └── sft_qwen2p5_7b_internvit_6b_video.sh
│   │   ├── pretrain_dataset.yaml
│   │   ├── pretrain_mistral_clip.sh
│   │   ├── radio/
│   │   │   └── radio_g.py
│   │   ├── run_text_generation.py
│   │   ├── sft_dataset.yaml
│   │   ├── sft_mistral_clip.sh
│   │   ├── text_generation_mistral_clip.sh
│   │   └── train.py
│   ├── post_training/
│   │   └── modelopt/
│   │       ├── .gitignore
│   │       ├── ADVANCED.md
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       ├── conf/
│   │       │   ├── Qwen/
│   │       │   │   ├── Qwen2.5-0.5B-Instruct.sh
│   │       │   │   ├── Qwen2.5-7B-Instruct.sh
│   │       │   │   ├── Qwen3-0.6B.sh
│   │       │   │   ├── Qwen3-235B-A22B.sh
│   │       │   │   ├── Qwen3-30B-A3B.sh
│   │       │   │   └── Qwen3-8B.sh
│   │       │   ├── arguments.sh
│   │       │   ├── deepseek-ai/
│   │       │   │   ├── DeepSeek-R1.sh
│   │       │   │   └── DeepSeek-V2-Lite.sh
│   │       │   ├── meta-llama/
│   │       │   │   ├── Llama-3.1-8B-Instruct.sh
│   │       │   │   ├── Llama-3.2-1B-Instruct.sh
│   │       │   │   ├── Llama-4-Maverick-17B-128E-Instruct.sh
│   │       │   │   └── Llama-4-Scout-17B-16E-Instruct.sh
│   │       │   ├── moonshotai/
│   │       │   │   ├── Kimi-K2-Instruct.sh
│   │       │   │   ├── kimi_k2_instruct.sh
│   │       │   │   └── kimi_k2_instruct_export.sh
│   │       │   ├── nvidia/
│   │       │   │   ├── NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.sh
│   │       │   │   ├── NVIDIA-Nemotron-3-Super-120B-A12B-BF16.sh
│   │       │   │   ├── NVIDIA-Nemotron-Nano-9B-v2.sh
│   │       │   │   ├── Nemotron-H-47B-Reasoning-128K.sh
│   │       │   │   ├── Nemotron-H-4B-Instruct.sh
│   │       │   │   ├── Nemotron-H-56B-Base-8K.sh
│   │       │   │   ├── Nemotron-H-8B-Base-8K.sh
│   │       │   │   └── Nemotron-Mini-4B-Instruct.sh
│   │       │   └── openai/
│   │       │       ├── gpt-oss-120b.sh
│   │       │       └── gpt-oss-20b.sh
│   │       ├── convert.sh
│   │       ├── convert_model.py
│   │       ├── distillation.md
│   │       ├── eagle3.sh
│   │       ├── export.py
│   │       ├── export.sh
│   │       ├── finetune.py
│   │       ├── finetune.sh
│   │       ├── generate.py
│   │       ├── generate.sh
│   │       ├── generation_server.sh
│   │       ├── mmlu.py
│   │       ├── mmlu.sh
│   │       ├── offline_feature_extract.py
│   │       ├── offline_feature_extract.sh
│   │       ├── prune.py
│   │       ├── prune.sh
│   │       ├── quantize.py
│   │       ├── quantize.sh
│   │       ├── requirements.txt
│   │       ├── requirements_ssm.txt
│   │       ├── slurm/
│   │       │   ├── env_setup_template.sh
│   │       │   └── sbatch.sh
│   │       ├── speculative.md
│   │       ├── train.sh
│   │       ├── validate.py
│   │       └── validate.sh
│   ├── rl/
│   │   ├── README.md
│   │   ├── benchmark_refit.py
│   │   ├── environment_configs/
│   │   │   ├── countdown.yaml
│   │   │   ├── dapo.yaml
│   │   │   ├── default.yaml
│   │   │   ├── gsm8k.yaml
│   │   │   ├── gsm8k_nanov3.yaml
│   │   │   ├── math.yaml
│   │   │   └── openmathinstructv2.yaml
│   │   ├── environments/
│   │   │   ├── __init__.py
│   │   │   ├── countdown/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── countdown.py
│   │   │   │   └── countdown_agent.py
│   │   │   └── math/
│   │   │       ├── __init__.py
│   │   │       ├── aime_agent.py
│   │   │       ├── bigmath_agent.py
│   │   │       ├── dapo_agent.py
│   │   │       ├── gsm8k_agent.py
│   │   │       ├── math_agent.py
│   │   │       └── openmath_agent.py
│   │   └── model_configs/
│   │       ├── common.sh
│   │       ├── llama3p1_8b_instruct.sh
│   │       ├── nemotron5_56b.sh
│   │       ├── nemotron5_8b.sh
│   │       ├── nemotron5p5_12b_H.sh
│   │       ├── nemotron6_3b_moe.sh
│   │       ├── qwen3_30b_a3b_moe.sh
│   │       ├── qwen3_32b.sh
│   │       ├── qwen3_4b.sh
│   │       ├── qwen3_8b.sh
│   │       ├── qwen_2p5_32b.sh
│   │       ├── qwen_2p5_3b.sh
│   │       ├── qwen_2p5_distill_7b.sh
│   │       └── qwen_2p5_math_7b.sh
│   ├── run_simple_mcore_train_loop.py
│   └── t5/
│       ├── README.md
│       └── train_t5_220m_distributed.sh
├── gpt_builders.py
├── greptile.json
├── mamba_builders.py
├── megatron/
│   ├── core/
│   │   ├── MSC_Integration.md
│   │   ├── QuickStart.md
│   │   ├── README.md
│   │   ├── README_STRAGGLER.md
│   │   ├── __init__.py
│   │   ├── _rank_utils.py
│   │   ├── activations.py
│   │   ├── config.py
│   │   ├── config_logger.py
│   │   ├── datasets/
│   │   │   ├── Makefile
│   │   │   ├── __init__.py
│   │   │   ├── bert_dataset.py
│   │   │   ├── blended_dataset.py
│   │   │   ├── blended_megatron_dataset_builder.py
│   │   │   ├── blended_megatron_dataset_config.py
│   │   │   ├── data_schedule.py
│   │   │   ├── gpt_dataset.py
│   │   │   ├── helpers.cpp
│   │   │   ├── helpers.py
│   │   │   ├── indexed_dataset.py
│   │   │   ├── masked_dataset.py
│   │   │   ├── megatron_dataset.py
│   │   │   ├── multimodal_dataset.py
│   │   │   ├── object_storage_utils.py
│   │   │   ├── readme.md
│   │   │   ├── t5_dataset.py
│   │   │   ├── utils.py
│   │   │   └── utils_s3.py
│   │   ├── dist_checkpointing/
│   │   │   ├── __init__.py
│   │   │   ├── core.py
│   │   │   ├── dict_utils.py
│   │   │   ├── exchange_utils.py
│   │   │   ├── mapping.py
│   │   │   ├── optimizer.py
│   │   │   ├── serialization.py
│   │   │   ├── state_dict_utils.py
│   │   │   ├── strategies/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── async_utils.py
│   │   │   │   ├── base.py
│   │   │   │   ├── cached_metadata_filesystem_reader.py
│   │   │   │   ├── checkpointable.py
│   │   │   │   ├── common.py
│   │   │   │   ├── filesystem_async.py
│   │   │   │   ├── fully_parallel.py
│   │   │   │   ├── state_dict_saver.py
│   │   │   │   └── torch.py
│   │   │   ├── tensor_aware_state_dict.py
│   │   │   ├── utils.py
│   │   │   └── validation.py
│   │   ├── distributed/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── data_parallel_base.py
│   │   │   ├── distributed_data_parallel.py
│   │   │   ├── distributed_data_parallel_config.py
│   │   │   ├── finalize_model_grads.py
│   │   │   ├── fsdp/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mcore_fsdp_adapter.py
│   │   │   │   └── src/
│   │   │   │       ├── README.md
│   │   │   │       ├── __init__.py
│   │   │   │       ├── megatron_fsdp/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── distributed_data_parallel_config.py
│   │   │   │       │   ├── fully_shard.py
│   │   │   │       │   ├── megatron_fsdp.py
│   │   │   │       │   ├── mixed_precision.py
│   │   │   │       │   ├── package_info.py
│   │   │   │       │   ├── param_and_grad_buffer.py
│   │   │   │       │   ├── uneven_dtensor.py
│   │   │   │       │   └── utils.py
│   │   │   │       └── pyproject.toml
│   │   │   ├── param_and_grad_buffer.py
│   │   │   ├── reduce_scatter_with_fp32_accumulation.py
│   │   │   ├── torch_fully_sharded_data_parallel.py
│   │   │   └── torch_fully_sharded_data_parallel_config.py
│   │   ├── energy_monitor.py
│   │   ├── enums.py
│   │   ├── export/
│   │   │   ├── __init__.py
│   │   │   ├── data_type.py
│   │   │   ├── export_config.py
│   │   │   ├── model_type.py
│   │   │   └── trtllm/
│   │   │       ├── __init__.py
│   │   │       ├── engine_builder/
│   │   │       │   ├── __init__.py
│   │   │       │   └── trtllm_engine_builder.py
│   │   │       ├── model_to_trllm_mapping/
│   │   │       │   ├── __init__.py
│   │   │       │   └── default_conversion_dict.py
│   │   │       ├── trt_model_config.py
│   │   │       ├── trt_model_type.py
│   │   │       ├── trtllm_helper.py
│   │   │       ├── trtllm_layers.py
│   │   │       └── trtllm_weights_converter/
│   │   │           ├── __init__.py
│   │   │           ├── distributed_trtllm_model_weights_converter.py
│   │   │           ├── single_device_trtllm_model_weights_converter.py
│   │   │           └── utils.py
│   │   ├── extensions/
│   │   │   ├── TransformerEngineMixedPrecision.md
│   │   │   ├── __init__.py
│   │   │   ├── kitchen.py
│   │   │   ├── transformer_engine.py
│   │   │   └── transformer_engine_spec_provider.py
│   │   ├── fp4_utils.py
│   │   ├── fp8_utils.py
│   │   ├── full_cuda_graph.py
│   │   ├── fusions/
│   │   │   ├── __init__.py
│   │   │   ├── fused_bias_dropout.py
│   │   │   ├── fused_bias_geglu.py
│   │   │   ├── fused_bias_gelu.py
│   │   │   ├── fused_bias_swiglu.py
│   │   │   ├── fused_cross_entropy.py
│   │   │   ├── fused_indices_converter.py
│   │   │   ├── fused_layer_norm.py
│   │   │   ├── fused_mla_yarn_rope_apply.py
│   │   │   ├── fused_pad_routing_map.py
│   │   │   ├── fused_softmax.py
│   │   │   └── fused_weighted_squared_relu.py
│   │   ├── hyper_comm_grid.py
│   │   ├── inference/
│   │   │   ├── __init__.py
│   │   │   ├── async_stream.py
│   │   │   ├── batch_dimensions_utils.py
│   │   │   ├── common_inference_params.py
│   │   │   ├── communication/
│   │   │   │   └── torch_symm_triton/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── barrier.py
│   │   │   │       ├── collectives.py
│   │   │   │       ├── fused_collectives.py
│   │   │   │       ├── multimem_asm.py
│   │   │   │       └── utils.py
│   │   │   ├── communication_utils.py
│   │   │   ├── config.py
│   │   │   ├── contexts/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention_context/
│   │   │   │   │   ├── mamba_metadata.py
│   │   │   │   │   ├── metadata_base.py
│   │   │   │   │   ├── mha_metadata.py
│   │   │   │   │   └── triton/
│   │   │   │   │       └── tensor_ops.py
│   │   │   │   ├── base_context.py
│   │   │   │   ├── dynamic_context.py
│   │   │   │   ├── fused_kv_append_kernel.py
│   │   │   │   ├── kv_block_allocator.py
│   │   │   │   ├── mamba_slot_allocator.py
│   │   │   │   ├── routing_metadata.py
│   │   │   │   └── static_context.py
│   │   │   ├── data_parallel_inference_coordinator.py
│   │   │   ├── engines/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── abstract_engine.py
│   │   │   │   ├── async_zmq_communicator.py
│   │   │   │   ├── dynamic_engine.py
│   │   │   │   ├── mcore_engine.py
│   │   │   │   └── static_engine.py
│   │   │   ├── headers.py
│   │   │   ├── inference_client.py
│   │   │   ├── inference_request.py
│   │   │   ├── model_inference_wrappers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── abstract_model_inference_wrapper.py
│   │   │   │   ├── gpt/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── gpt_inference_wrapper.py
│   │   │   │   ├── multimodal/
│   │   │   │   │   └── vlm_inference_wrapper.py
│   │   │   │   └── t5/
│   │   │   │       ├── __init__.py
│   │   │   │       └── t5_inference_wrapper.py
│   │   │   ├── moe/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activations.py
│   │   │   │   ├── fused_moe.py
│   │   │   │   ├── pad.py
│   │   │   │   └── permute.py
│   │   │   ├── quantization/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mxfp8_quantize.py
│   │   │   │   ├── mxfp8_tensor.py
│   │   │   │   └── utils.py
│   │   │   ├── sampling_params.py
│   │   │   ├── scheduler.py
│   │   │   ├── symmetric_memory.py
│   │   │   ├── text_generation_controllers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── encoder_decoder_text_generation_controller.py
│   │   │   │   ├── text_generation_controller.py
│   │   │   │   └── vlm_text_generation_controller.py
│   │   │   ├── text_generation_server/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dynamic_text_gen_server/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── endpoints/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── chat_completions.py
│   │   │   │   │   │   ├── common.py
│   │   │   │   │   │   ├── completions.py
│   │   │   │   │   │   └── health.py
│   │   │   │   │   ├── text_generation_server.py
│   │   │   │   │   └── tokenization.py
│   │   │   │   ├── endpoints/
│   │   │   │   │   ├── common.py
│   │   │   │   │   └── completions.py
│   │   │   │   ├── run_mcore_engine.py
│   │   │   │   ├── text_generation_server.py
│   │   │   │   └── tokenization.py
│   │   │   ├── unified_memory.py
│   │   │   └── utils.py
│   │   ├── inference_params.py
│   │   ├── jit.py
│   │   ├── model_parallel_config.py
│   │   ├── models/
│   │   │   ├── T5/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── t5_model.py
│   │   │   │   └── t5_spec.py
│   │   │   ├── __init__.py
│   │   │   ├── backends.py
│   │   │   ├── bert/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bert_layer_specs.py
│   │   │   │   ├── bert_lm_head.py
│   │   │   │   ├── bert_model.py
│   │   │   │   └── pooler.py
│   │   │   ├── common/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── embeddings/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── language_model_embedding.py
│   │   │   │   │   ├── relative_pos_embedding.py
│   │   │   │   │   ├── rope_utils.py
│   │   │   │   │   ├── rotary_pos_embedding.py
│   │   │   │   │   └── yarn_rotary_pos_embedding.py
│   │   │   │   ├── language_module/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── language_module.py
│   │   │   │   ├── model_chunk_schedule_plan.py
│   │   │   │   └── vision_module/
│   │   │   │       ├── __init__.py
│   │   │   │       └── vision_module.py
│   │   │   ├── gpt/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── experimental_attention_variant_module_specs.py
│   │   │   │   ├── fine_grained_callables.py
│   │   │   │   ├── gpt_layer_specs.py
│   │   │   │   ├── gpt_model.py
│   │   │   │   ├── heterogeneous/
│   │   │   │   │   └── heterogeneous_layer_specs.py
│   │   │   │   └── moe_module_specs.py
│   │   │   ├── huggingface/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── clip_model.py
│   │   │   │   ├── module.py
│   │   │   │   └── qwen_model.py
│   │   │   ├── mamba/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mamba_layer_specs.py
│   │   │   │   └── mamba_model.py
│   │   │   ├── mimo/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── config/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── base_configs.py
│   │   │   │   ├── model/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── base.py
│   │   │   │   ├── partition/
│   │   │   │   │   └── utils.py
│   │   │   │   └── submodules/
│   │   │   │       ├── audio.py
│   │   │   │       ├── base.py
│   │   │   │       └── vision.py
│   │   │   ├── multimodal/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── context_parallel.py
│   │   │   │   ├── llava_model.py
│   │   │   │   └── llava_spec.py
│   │   │   └── vision/
│   │   │       ├── __init__.py
│   │   │       ├── clip_vit_model.py
│   │   │       ├── multimodal_projector.py
│   │   │       ├── radio.py
│   │   │       └── vit_layer_specs.py
│   │   ├── msc_utils.py
│   │   ├── nccl_allocator.py
│   │   ├── num_microbatches_calculator.py
│   │   ├── optimizer/
│   │   │   ├── __init__.py
│   │   │   ├── clip_grads.py
│   │   │   ├── cpu_offloading/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   └── hybrid_optimizer.py
│   │   │   ├── distrib_optimizer.py
│   │   │   ├── grad_scaler.py
│   │   │   ├── layer_wise_optimizer.py
│   │   │   ├── muon.py
│   │   │   ├── optimizer.py
│   │   │   ├── optimizer_config.py
│   │   │   └── qk_clip.py
│   │   ├── optimizer_param_scheduler.py
│   │   ├── package_info.py
│   │   ├── packed_seq_params.py
│   │   ├── parallel_state.py
│   │   ├── pipeline_parallel/
│   │   │   ├── __init__.py
│   │   │   ├── bridge_communicator.py
│   │   │   ├── combined_1f1b.py
│   │   │   ├── fine_grained_activation_offload.py
│   │   │   ├── hybrid_cp_schedule.py
│   │   │   ├── multimodule_communicator.py
│   │   │   ├── p2p_communication.py
│   │   │   ├── schedules.py
│   │   │   └── utils.py
│   │   ├── post_training/
│   │   │   ├── __init__.py
│   │   │   └── modelopt/
│   │   │       ├── __init__.py
│   │   │       ├── gpt/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── model_specs.py
│   │   │       │   └── state_dict_hooks.py
│   │   │       ├── layers.py
│   │   │       └── mamba/
│   │   │           ├── __init__.py
│   │   │           └── model_specs.py
│   │   ├── process_groups_config.py
│   │   ├── quantization/
│   │   │   ├── __init__.py
│   │   │   ├── quant_config.py
│   │   │   └── utils.py
│   │   ├── requirements.txt
│   │   ├── rerun_state_machine.py
│   │   ├── resharding/
│   │   │   ├── __init__.py
│   │   │   ├── copy_services/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── gloo_copy_service.py
│   │   │   │   ├── nccl_copy_service.py
│   │   │   │   └── nvshmem_copy_service.py
│   │   │   ├── execution.py
│   │   │   ├── nvshmem_copy_service/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── compat.py
│   │   │   │   ├── core/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── gpu_resource_manager.py
│   │   │   │   │   ├── kernel_launcher.py
│   │   │   │   │   └── pipeline_executor.py
│   │   │   │   ├── kernels/
│   │   │   │   │   └── chunked_kernel.cu
│   │   │   │   ├── logger.py
│   │   │   │   ├── memory/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── double_buffer_manager.py
│   │   │   │   │   └── tensor_pointer_utils.py
│   │   │   │   ├── nvshmem_types.py
│   │   │   │   ├── planning/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── communication_scheduler.py
│   │   │   │   │   ├── gpu_execution_planner.py
│   │   │   │   │   ├── task_segmenter.py
│   │   │   │   │   └── workload_packer.py
│   │   │   │   ├── service.py
│   │   │   │   └── validation.py
│   │   │   ├── planner.py
│   │   │   ├── refit.py
│   │   │   ├── transforms.py
│   │   │   └── utils.py
│   │   ├── safe_globals.py
│   │   ├── ssm/
│   │   │   ├── __init__.py
│   │   │   ├── gated_delta_net.py
│   │   │   ├── mamba_block.py
│   │   │   ├── mamba_context_parallel.py
│   │   │   ├── mamba_hybrid_layer_allocation.py
│   │   │   ├── mamba_layer.py
│   │   │   ├── mamba_mixer.py
│   │   │   ├── mlp_layer.py
│   │   │   ├── ops/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── causal_conv1d_triton.py
│   │   │   │   ├── causal_conv1d_varlen.py
│   │   │   │   ├── determinism.py
│   │   │   │   ├── mamba_ssm.py
│   │   │   │   ├── ssd_bmm.py
│   │   │   │   ├── ssd_chunk_scan.py
│   │   │   │   ├── ssd_chunk_state.py
│   │   │   │   ├── ssd_combined.py
│   │   │   │   └── ssd_state_passing.py
│   │   │   └── triton_cache_manager.py
│   │   ├── tensor_parallel/
│   │   │   ├── __init__.py
│   │   │   ├── cross_entropy.py
│   │   │   ├── data.py
│   │   │   ├── inference_layers.py
│   │   │   ├── layers.py
│   │   │   ├── mappings.py
│   │   │   ├── random.py
│   │   │   └── utils.py
│   │   ├── timers.py
│   │   ├── tokenizers/
│   │   │   ├── __init__.py
│   │   │   ├── base_tokenizer.py
│   │   │   ├── megatron_tokenizer.py
│   │   │   ├── text/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── libraries/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── abstract_tokenizer.py
│   │   │   │   │   ├── bytelevel_tokenizer.py
│   │   │   │   │   ├── chat_template.py
│   │   │   │   │   ├── huggingface_tokenizer.py
│   │   │   │   │   ├── megatron_hf_tokenizer.py
│   │   │   │   │   ├── null_tokenizer.py
│   │   │   │   │   ├── sentencepiece_tokenizer.py
│   │   │   │   │   ├── sft_tokenizer.py
│   │   │   │   │   └── tiktoken_tokenizer.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── bert_tokenizer.py
│   │   │   │   │   ├── default_tokenizer.py
│   │   │   │   │   ├── gpt_tokenizer.py
│   │   │   │   │   ├── mamba_tokenizer.py
│   │   │   │   │   └── t5_tokenizer.py
│   │   │   │   ├── parsers/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base_parser.py
│   │   │   │   │   ├── deepseek_r1_reasoning_parser.py
│   │   │   │   │   └── qwen3_coder_tool_parser.py
│   │   │   │   └── text_tokenizer.py
│   │   │   ├── utils/
│   │   │   │   └── build_tokenizer.py
│   │   │   └── vision/
│   │   │       ├── __init__.py
│   │   │       ├── libraries/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── multimodal_tokenizer.py
│   │   │       │   └── null_multimodal_tokenizer.py
│   │   │       ├── models/
│   │   │       │   ├── __init__.py
│   │   │       │   └── default_tokenizer.py
│   │   │       └── vision_tokenizer.py
│   │   ├── transformer/
│   │   │   ├── __init__.py
│   │   │   ├── attention.py
│   │   │   ├── cuda_graphs.py
│   │   │   ├── custom_layers/
│   │   │   │   ├── __init__.py
│   │   │   │   └── batch_invariant_kernels.py
│   │   │   ├── dot_product_attention.py
│   │   │   ├── enums.py
│   │   │   ├── experimental_attention_variant/
│   │   │   │   ├── absorbed_mla.py
│   │   │   │   └── dsa.py
│   │   │   ├── fsdp_dtensor_checkpoint.py
│   │   │   ├── heterogeneous/
│   │   │   │   ├── heterogeneous_config.py
│   │   │   │   └── linear_replacements.py
│   │   │   ├── identity_op.py
│   │   │   ├── mlp.py
│   │   │   ├── module.py
│   │   │   ├── moe/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── experts.py
│   │   │   │   ├── fused_a2a.py
│   │   │   │   ├── moe_layer.py
│   │   │   │   ├── moe_utils.py
│   │   │   │   ├── router.py
│   │   │   │   ├── router_replay.py
│   │   │   │   ├── shared_experts.py
│   │   │   │   ├── token_dispatcher.py
│   │   │   │   ├── token_dispatcher_inference.py
│   │   │   │   └── upcycling_utils.py
│   │   │   ├── multi_latent_attention.py
│   │   │   ├── multi_token_prediction.py
│   │   │   ├── pipeline_parallel_layer_layout.py
│   │   │   ├── spec_utils.py
│   │   │   ├── torch_layer_norm.py
│   │   │   ├── torch_norm.py
│   │   │   ├── transformer_block.py
│   │   │   ├── transformer_config.py
│   │   │   ├── transformer_layer.py
│   │   │   └── utils.py
│   │   ├── typed_torch.py
│   │   └── utils.py
│   ├── inference/
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── legacy/
│   │   ├── fp16_deprecated/
│   │   │   └── loss_scaler.py
│   │   ├── fused_kernels/
│   │   │   ├── __init__.py
│   │   │   ├── compat.h
│   │   │   ├── tests/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_fused_kernels.py
│   │   │   └── type_shim.h
│   │   └── model/
│   │       ├── __init__.py
│   │       ├── bert_model.py
│   │       ├── biencoder_model.py
│   │       ├── classification.py
│   │       ├── enums.py
│   │       ├── fused_bias_gelu.py
│   │       ├── fused_layer_norm.py
│   │       ├── fused_softmax.py
│   │       ├── gpt_model.py
│   │       ├── language_model.py
│   │       ├── module.py
│   │       ├── multiple_choice.py
│   │       ├── realm_model.py
│   │       ├── rms_norm.py
│   │       ├── t5_model.py
│   │       ├── transformer.py
│   │       ├── utils.py
│   │       └── vision/
│   │           ├── classification.py
│   │           ├── dino.py
│   │           ├── esvit_swin_backbone.py
│   │           ├── inpainting.py
│   │           ├── knn_monitor.py
│   │           ├── mit_backbone.py
│   │           ├── swin_backbone.py
│   │           ├── utils.py
│   │           └── vit_backbone.py
│   ├── post_training/
│   │   ├── __init__.py
│   │   ├── arguments.py
│   │   ├── checkpointing.py
│   │   ├── generate.py
│   │   ├── loss_func.py
│   │   ├── model_builder.py
│   │   ├── non_loss_data_func.py
│   │   └── utils.py
│   ├── rl/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── agent/
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── huggingface_dataset_agent.py
│   │   │   ├── pass_at_evaluation_agent.py
│   │   │   ├── remote_agent.py
│   │   │   ├── reward_only_agent.py
│   │   │   └── weighted_multi_task.py
│   │   ├── inference/
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── inference_interface.py
│   │   │   └── megatron.py
│   │   ├── logging.py
│   │   ├── parallel_utils.py
│   │   ├── rl_utils.py
│   │   ├── sequence_packing_utils.py
│   │   └── server/
│   │       ├── __init__.py
│   │       ├── agent/
│   │       │   ├── __init__.py
│   │       │   └── fastapi_env_server.py
│   │       ├── api.py
│   │       └── inference/
│   │           ├── __init__.py
│   │           └── inference_interface_server.py
│   └── training/
│       ├── __init__.py
│       ├── argument_utils.py
│       ├── arguments.py
│       ├── async_utils.py
│       ├── checkpointing.py
│       ├── config/
│       │   ├── __init__.py
│       │   ├── common_config.py
│       │   ├── resilience_config.py
│       │   └── training_config.py
│       ├── datasets/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── data_samplers.py
│       │   ├── fim_dataset.py
│       │   └── sft_dataset.py
│       ├── dgrad_logging.py
│       ├── dist_signal_handler.py
│       ├── ft_integration.py
│       ├── global_vars.py
│       ├── initialize.py
│       ├── inprocess_restart.py
│       ├── log_handler.py
│       ├── one_logger_utils.py
│       ├── theoretical_memory_usage.py
│       ├── training.py
│       ├── utils.py
│       ├── wandb_utils.py
│       └── yaml_arguments.py
├── model_provider.py
├── pretrain_bert.py
├── pretrain_gpt.py
├── pretrain_mamba.py
├── pretrain_t5.py
├── pretrain_vlm.py
├── pyproject.toml
├── scripts/
│   └── check_api_backwards_compatibility.py
├── setup.py
├── tasks/
│   ├── data_utils.py
│   ├── eval_utils.py
│   └── finetune_utils.py
├── tests/
│   ├── README.md
│   ├── __init__.py
│   ├── functional_tests/
│   │   ├── __init__.py
│   │   ├── python_test_utils/
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── compute_golden_statistics.py
│   │   │   ├── conftest.py
│   │   │   ├── get_test_results_from_tensorboard_logs.py
│   │   │   ├── test_grpo_training_loop.py
│   │   │   ├── test_inference_regular_pipeline.py
│   │   │   ├── test_optimizer_grads_match.py
│   │   │   ├── test_pretraining_regular_pipeline.py
│   │   │   └── test_pretraining_resume_checkpoint_pipeline.py
│   │   ├── shell_test_utils/
│   │   │   ├── _run_training.sh
│   │   │   ├── run_batch_ci_tests.sh
│   │   │   ├── run_ci_test.sh
│   │   │   └── start_interactive_job.sh
│   │   └── test_cases/
│   │       ├── bert/
│   │       │   ├── bert_mcore_tp1_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp1_pp4_vp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_frozen_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_local_spec/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_resume_torch_dist_local_spec/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp4_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── bert_release_sm/
│   │       │       ├── golden_values_dev_dgx_gb200.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── common/
│   │       │   ├── ckpt_converter/
│   │       │   │   ├── __main__.py
│   │       │   │   └── model_config.yaml
│   │       │   └── moe_perf/
│   │       │       ├── __main__.py
│   │       │       ├── baseline.json
│   │       │       └── test_cases.py
│   │       ├── gpt/
│   │       │   ├── gpt3_15b_8t_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_gb200/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_sm_gb200/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_7b_tp1_pp4_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_7b_tp4_pp1_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_disable/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_enable/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_persistent_1/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_persistent_2/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_reshard/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_resume/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_resume_check_grads/
│   │       │   │   ├── README.md
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_transient/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_mup/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_uniform_full_recompute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_rope_embeddings/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_sequence_parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_gdn/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/
│   │       │   │   └── golden_values_dev_dgxh100_dgxc.json
│   │       │   ├── gpt3_mcore_te_tp2_pp2_mla/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2_fp16/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp4/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp4_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_uninstall_te/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1_resume_torch/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_weekly_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── tp_comm_overlap_cfg.yaml
│   │       │   ├── gpt3_weekly_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/
│   │       │   │   ├── cuda_graphs.py
│   │       │   │   ├── cuda_graphs.sh
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_basic_function/
│   │       │   │   ├── env_config.yaml
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_throughput/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_throughput_github/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/
│   │       │   │   ├── README.md
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── test_prompts.jsonl
│   │       │   ├── gpt_static_inference_tp1_pp1_583m_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── gpt_static_inference_tp1_pp1_583m_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_a100.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── gpt-nemo/
│   │       │   ├── bert-nemo_340m_mr_mbs2_gbs32_mcore_te_tp2_pp2_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gemma2-nemo_2b_mr_mbs1_gbs8_mcore_te_tp4_pp1_cp1_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── llama3-nemo_8b_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp2_dgx_a100_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── llama3-nemo_8b_mr_mbs4_gbs64_mcore_te_tp1_pp1_cp2_dgx_a100_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral-nemo_8x7b_mr_mbs1_gbs8_mcore_te_tp2_pp1_ep2_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   └── t5-nemo_220m_mr_mbs4_gbs64_te_tp1_pp1_1N8G/
│   │       │       └── model_config.yaml
│   │       ├── hybrid/
│   │       │   ├── hybrid_dynamic_inference_tp1_pp1_dp8_583m/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp2_vpp2_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_static_inference_tp1_pp1_2B_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── hybrid_static_inference_tp1_pp1_2B_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── mimo/
│   │       │   ├── mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8_seq_packing/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── mimo_vlm_pretrain_convergence_tp1_pp1_cp2_dp8/
│   │       │       ├── golden_values_dev.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── mixtral/
│   │       │   ├── deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release/
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x22b_tp2pp8ep8vpp1_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x7b_alltoall_tp2pp4ep4_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── mixtral_8x7b_tp1pp4ep8vpp8_release/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       ├── golden_values_lts_dgx_a100.json
│   │       │       └── model_config.yaml
│   │       ├── moe/
│   │       │   ├── deepseek_proxy_fsdp_ep2_fsdp2/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseek_proxy_fsdp_ep2_fsdp2_1node/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq_suspend_resume/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── prompts.json
│   │       │   ├── gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── multimodal-llava/
│   │       │   ├── multimodal_llava_mcore_te_tp1_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── multimodal_llava_mcore_te_tp4_sp_cp2/
│   │       │       ├── golden_values_dev_dgx_a100.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       ├── golden_values_lts_dgx_a100.json
│   │       │       └── model_config.yaml
│   │       └── t5/
│   │           ├── t5_11b_mcore_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp1_pp1_vp1_resume_torch/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_a100_2nd.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp2_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp2_pp1_vp1_sequence_parallel/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp4_pp1_resume_torch_dist/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp1_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp1_pp1_vp1_resume_torch/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_a100_2nd.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp2_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp4_pp1_resume_torch_dist/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_release/
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_release_sm/
│   │           │   ├── golden_values_dev_dgx_gb200.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_weekly_mcore_te_tp2_pp1_vp1/
│   │           │   └── golden_values_lts_dgx_a100.json
│   │           └── t5_weekly_mcore_te_tp2_pp1_vp1_sequence_parallel/
│   │               └── golden_values_lts_dgx_a100.json
│   ├── test_utils/
│   │   ├── python_scripts/
│   │   │   ├── approve_merge_gate.py
│   │   │   ├── auto_reminder.py
│   │   │   ├── auto_reminder_github.py
│   │   │   ├── check_status_of_main.py
│   │   │   ├── dashboard.py
│   │   │   ├── download_coverage_results.py
│   │   │   ├── download_golden_values.py
│   │   │   ├── download_unit_tests_dataset.py
│   │   │   ├── generate_jet_trigger_job.py
│   │   │   ├── generate_local_jobs.py
│   │   │   ├── launch_jet_workload.py
│   │   │   ├── launch_nemo_run_workload.py
│   │   │   ├── notify.py
│   │   │   ├── recipe_parser.py
│   │   │   ├── swap_pr_labels.py
│   │   │   └── wait_for_resources.py
│   │   └── recipes/
│   │       ├── _build-mcore-dev.yaml
│   │       ├── _build-mcore-lts.yaml
│   │       ├── _build-nemo.yaml
│   │       ├── gb200/
│   │       │   ├── gpt.yaml
│   │       │   ├── moe-1node.yaml
│   │       │   ├── moe.yaml
│   │       │   └── unit-tests.yaml
│   │       └── h100/
│   │           ├── bert.yaml
│   │           ├── ckpt_converter.yaml
│   │           ├── gpt-dynamic-inference-cuda-graphs.yaml
│   │           ├── gpt-dynamic-inference-with-coordinator.yaml
│   │           ├── gpt-dynamic-inference.yaml
│   │           ├── gpt-grads.yaml
│   │           ├── gpt-grpo.yaml
│   │           ├── gpt-nemo.yaml
│   │           ├── gpt-static-inference.yaml
│   │           ├── gpt.yaml
│   │           ├── mamba-dynamic-inference.yaml
│   │           ├── mamba-static-inference.yaml
│   │           ├── mamba.yaml
│   │           ├── mimo.yaml
│   │           ├── module_performance.yaml
│   │           ├── moe-dynamic-inference-with-coordinator.yaml
│   │           ├── moe-dynamic-inference.yaml
│   │           ├── moe-grpo.yaml
│   │           ├── moe-static-inference.yaml
│   │           ├── moe.yaml
│   │           ├── multimodal-llava.yaml
│   │           ├── t5.yaml
│   │           └── unit-tests.yaml
│   └── unit_tests/
│       ├── __init__.py
│       ├── a2a_overlap/
│       │   ├── test_cuda_graphed_schedule_chunk_1f1b.py
│       │   ├── test_schedule_chunk_1f1b.py
│       │   ├── test_schedule_layer_1f1b.py
│       │   └── utils.py
│       ├── conftest.py
│       ├── data/
│       │   ├── __init__.py
│       │   ├── test_bin_reader.py
│       │   ├── test_builder.py
│       │   ├── test_fim_dataset.py
│       │   ├── test_gpt_dataset.py
│       │   ├── test_multimodal_dataset.py
│       │   ├── test_preprocess_data.py
│       │   └── test_preprocess_mmdata.py
│       ├── dist_checkpointing/
│       │   ├── __init__.py
│       │   ├── conftest.py
│       │   ├── models/
│       │   │   ├── __init__.py
│       │   │   ├── common.py
│       │   │   ├── test_bert_model.py
│       │   │   ├── test_gpt_model.py
│       │   │   ├── test_mamba.py
│       │   │   ├── test_mlp_glu.py
│       │   │   ├── test_moe_experts.py
│       │   │   └── test_t5_model.py
│       │   ├── test_async_save.py
│       │   ├── test_checkpointable.py
│       │   ├── test_fp8.py
│       │   ├── test_fully_parallel.py
│       │   ├── test_global_metadata_reuse.py
│       │   ├── test_layer_wise_optimizer.py
│       │   ├── test_local.py
│       │   ├── test_mapping.py
│       │   ├── test_msc.py
│       │   ├── test_nonpersistent.py
│       │   ├── test_optimizer.py
│       │   ├── test_pipeline_parallel_layout.py
│       │   ├── test_replication.py
│       │   ├── test_safe_globals.py
│       │   ├── test_serialization.py
│       │   ├── test_strict.py
│       │   ├── test_torch_dist.py
│       │   └── utils.py
│       ├── distributed/
│       │   ├── megatron_fsdp/
│       │   │   ├── test_mcore_fully_sharded_data_parallel.py
│       │   │   ├── test_mfsdp_fully_shard.py
│       │   │   └── utils.py
│       │   ├── test_distributed_data_parallel.py
│       │   ├── test_finalize_model_grads.py
│       │   ├── test_grad_reduce_for_replicated_embedder.py
│       │   ├── test_grad_sync_with_expert_parallel.py
│       │   ├── test_param_and_grad_buffer.py
│       │   ├── test_reduce_scatter_with_fp32_accumulation.py
│       │   └── test_torch_fully_sharded_parallel.py
│       ├── export/
│       │   └── trtllm/
│       │       ├── __init__.py
│       │       ├── test_distributed_fp8.py
│       │       ├── test_single_device_fp8.py
│       │       ├── test_trtllm_distributed_gpu_converter.py
│       │       ├── test_trtllm_helper.py
│       │       ├── test_trtllm_layers.py
│       │       └── test_trtllm_single_device_converter.py
│       ├── extension/
│       │   └── test_kitchen_sdpa.py
│       ├── find_test_cases.py
│       ├── fusions/
│       │   ├── test_bias_dropout_fusion.py
│       │   ├── test_mla_yarn_rope_apply.py
│       │   ├── test_rmsnorm_residual_fusion.py
│       │   ├── test_swiglu_fusion.py
│       │   ├── test_torch_softmax.py
│       │   └── test_weighted_squared_relu_fusion.py
│       ├── inference/
│       │   ├── __init__.py
│       │   ├── contexts/
│       │   │   ├── attention_metadata/
│       │   │   │   ├── test_mamba_metadata.py
│       │   │   │   └── test_tensor_ops.py
│       │   │   ├── test_dynamic_context.py
│       │   │   └── test_dynamic_prefix_caching.py
│       │   ├── engines/
│       │   │   ├── __init__.py
│       │   │   ├── test_dynamic_engine.py
│       │   │   ├── test_dynamic_events.py
│       │   │   ├── test_mamba_prefix_caching_e2e.py
│       │   │   └── test_static_engine.py
│       │   ├── model_inference_wrappers/
│       │   │   ├── __init__.py
│       │   │   ├── gpt/
│       │   │   │   └── test_gpt_inference_wrapper.py
│       │   │   └── t5/
│       │   │       └── test_t5_inference_wrapper.py
│       │   ├── test_batch_dimension_utils.py
│       │   ├── test_common_inference_params.py
│       │   ├── test_communication_utils.py
│       │   ├── test_data_parallel_inference_coordinator.py
│       │   ├── test_dynamic_prefix_caching_coordinator.py
│       │   ├── test_flash_decode.py
│       │   ├── test_inference_config.py
│       │   ├── test_inference_utils.py
│       │   ├── test_moe_inference.py
│       │   ├── test_moe_permute.py
│       │   ├── test_mxfp8_utils.py
│       │   ├── test_scheduler.py
│       │   ├── test_stop_words.py
│       │   ├── test_wandb_logging.py
│       │   └── text_generation_controllers/
│       │       ├── __init__.py
│       │       ├── test_encoder_decoder_text_generation_controller.py
│       │       ├── test_text_generation_controller.py
│       │       └── test_vlm_text_generation_controller.py
│       ├── models/
│       │   ├── __init__.py
│       │   ├── test_base_embedding.py
│       │   ├── test_bert_model.py
│       │   ├── test_clip_vit_model.py
│       │   ├── test_gpt_model.py
│       │   ├── test_gpt_model_batch_invariant.py
│       │   ├── test_gpt_model_quantization.py
│       │   ├── test_heterogeneous_gpt_model.py
│       │   ├── test_llava_model.py
│       │   ├── test_mamba_model.py
│       │   ├── test_mamba_moe_model.py
│       │   ├── test_mimo_audio_submodules.py
│       │   ├── test_mimo_embedding_alignment.py
│       │   ├── test_mimo_model.py
│       │   ├── test_mimo_partition.py
│       │   ├── test_mimo_submodules.py
│       │   ├── test_multimodal_projector.py
│       │   ├── test_radio_model.py
│       │   └── test_t5_model.py
│       ├── optimizer/
│       │   ├── __init__.py
│       │   └── test_optimizer_config.py
│       ├── pipeline_parallel/
│       │   ├── __init__.py
│       │   ├── test_bridge_communicator.py
│       │   ├── test_fine_grained_activation_offloading.py
│       │   ├── test_helpers.py
│       │   ├── test_multimodule_communicator.py
│       │   ├── test_multimodule_schedules.py
│       │   ├── test_pipeline_layout.py
│       │   └── test_schedules.py
│       ├── post_training/
│       │   ├── __init__.py
│       │   ├── test_modelopt_model_builder.py
│       │   └── test_modelopt_module_spec.py
│       ├── resharding/
│       │   ├── test_communication_scheduler.py
│       │   ├── test_dp_balancing.py
│       │   ├── test_model_swap.py
│       │   ├── test_mxfp8_refit.py
│       │   ├── test_task_segmenter.py
│       │   └── test_workload_packer.py
│       ├── rl/
│       │   ├── test_grouped_rollouts.py
│       │   ├── test_rl_batch_invariant.py
│       │   ├── test_rl_utils.py
│       │   └── test_sequence_packing_utils.py
│       ├── run_ci_test.sh
│       ├── ssm/
│       │   ├── ops/
│       │   │   ├── test_causal_conv1d_varlen.py
│       │   │   ├── test_ops_init.py
│       │   │   ├── test_ssd_bmm.py
│       │   │   ├── test_ssd_chunk_scan.py
│       │   │   ├── test_ssd_chunk_state.py
│       │   │   ├── test_ssd_combined.py
│       │   │   ├── test_ssd_state_passing.py
│       │   │   └── test_ssm_kernel.py
│       │   ├── test_causal_conv1d_triton.py
│       │   ├── test_gated_delta_net.py
│       │   ├── test_mamba_block.py
│       │   ├── test_mamba_context_parallel.py
│       │   ├── test_mamba_hybrid_layer_allocation.py
│       │   ├── test_mamba_layer.py
│       │   └── test_mamba_mixer.py
│       ├── tensor_parallel/
│       │   ├── __init__.py
│       │   ├── test_cross_entropy.py
│       │   ├── test_data.py
│       │   ├── test_initialization.py
│       │   ├── test_layers.py
│       │   ├── test_mappings.py
│       │   ├── test_random.py
│       │   └── test_tensor_parallel_utils.py
│       ├── test_api_backwards_compat_setup.py
│       ├── test_argument_utils.py
│       ├── test_basic.py
│       ├── test_checkpointing.py
│       ├── test_fp8_param.py
│       ├── test_fp8_utils.py
│       ├── test_hyper_comm_grid.py
│       ├── test_imports.py
│       ├── test_inference.py
│       ├── test_layer_wise_optimizer.py
│       ├── test_lion_optimizer.py
│       ├── test_local_multi_tensor_fns.py
│       ├── test_model_configs.py
│       ├── test_muon_optimizer.py
│       ├── test_nccl_allocator.py
│       ├── test_num_microbatches_calculator.py
│       ├── test_optimizer.py
│       ├── test_optimizer_cpu_offloading.py
│       ├── test_optimizer_param_scheduler.py
│       ├── test_parallel_state.py
│       ├── test_process_groups_config.py
│       ├── test_training.py
│       ├── test_typed_torch.py
│       ├── test_utilities.py
│       ├── test_utils.py
│       ├── tokenizers/
│       │   └── test_tokenizer.py
│       ├── transformer/
│       │   ├── __init__.py
│       │   ├── experimental_attention_variant/
│       │   │   ├── test_absorbed_mla.py
│       │   │   └── test_attention_variant_dsa.py
│       │   ├── moe/
│       │   │   ├── __init__.py
│       │   │   ├── conftest.py
│       │   │   ├── test_a2a_token_dispatcher.py
│       │   │   ├── test_aux_loss.py
│       │   │   ├── test_grouped_mlp.py
│       │   │   ├── test_latent_moe_layer.py
│       │   │   ├── test_moe_layer.py
│       │   │   ├── test_moe_layer_discrepancy.py
│       │   │   ├── test_multihot_indices_converter.py
│       │   │   ├── test_router_replay.py
│       │   │   ├── test_routers.py
│       │   │   ├── test_sequential_mlp.py
│       │   │   ├── test_shared_experts.py
│       │   │   ├── test_token_dispatcher.py
│       │   │   └── test_upcycling.py
│       │   ├── test_attention.py
│       │   ├── test_attention_no_rope.py
│       │   ├── test_attention_packed_seq.py
│       │   ├── test_core_attention.py
│       │   ├── test_cuda_graphs.py
│       │   ├── test_full_cuda_graph.py
│       │   ├── test_mlp.py
│       │   ├── test_module.py
│       │   ├── test_multi_latent_attention.py
│       │   ├── test_multi_token_prediction.py
│       │   ├── test_mup.py
│       │   ├── test_quantization_config.py
│       │   ├── test_relative_attention.py
│       │   ├── test_rope.py
│       │   ├── test_spec_customization.py
│       │   ├── test_submodule_callables.py
│       │   ├── test_te_layers_batch_invariant.py
│       │   ├── test_thd_correctness.py
│       │   ├── test_transformer_block.py
│       │   ├── test_transformer_block_custom_pgs.py
│       │   ├── test_transformer_layer.py
│       │   ├── test_utils.py
│       │   └── test_vision_cuda_graphs.py
│       └── utils/
│           └── test_experimental_log_once.py
├── tools/
│   ├── __init__.py
│   ├── autoformat.sh
│   ├── bert_embedding/
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── embed.py
│   │   ├── external_libs.py
│   │   └── huggingface.py
│   ├── build_sequences_per_dataset.py
│   ├── check_copyright.py
│   ├── checkpoint/
│   │   ├── checkpoint_inspector.py
│   │   ├── convert.py
│   │   ├── hybrid_conversion.py
│   │   ├── loader_base.py
│   │   ├── loader_core.py
│   │   ├── loader_legacy.py
│   │   ├── loader_llama_mistral.py
│   │   ├── loader_llava.py
│   │   ├── loader_mixtral_hf.py
│   │   ├── saver_base.py
│   │   ├── saver_core.py
│   │   ├── saver_hf_llava.py
│   │   ├── saver_legacy.py
│   │   ├── saver_llava.py
│   │   ├── schema_base.py
│   │   ├── schema_core.py
│   │   ├── schema_hf.py
│   │   └── utils.py
│   ├── copyright.sh
│   ├── linter.py
│   ├── merge_datasets.py
│   ├── preprocess_data.py
│   ├── preprocess_data_nmt.py
│   ├── preprocess_mmdata.py
│   ├── report_theoretical_memory.py
│   ├── run_dynamic_text_generation_server.py
│   ├── run_inference_performance_test.py
│   ├── run_mamba_text_generation_server.py
│   ├── run_mamba_text_generation_server_completions.py
│   ├── run_text_generation_server.py
│   ├── run_vlm_text_generation.py
│   ├── text_generation_cli.py
│   ├── trigger_internal_ci.md
│   ├── trigger_internal_ci.py
│   ├── upgrade_dependencies.sh
│   └── wait_daemon.sh
└── train_rl.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .coderabbit.yaml
================================================
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
language: "en-US"

# Only comment on Critical/Major bugs. No Minor, Trivial, or style comments.
tone_instructions: "Only comment on Critical or Major bugs. Never comment on Minor issues, style, refactoring, or suggestions. When in doubt, stay silent."

reviews:
  # Use chill profile - filters out nitpicks automatically
  profile: "chill"

  # Disable all summary features
  high_level_summary: false
  high_level_summary_in_walkthrough: false

  # Collapse the walkthrough comment and disable its optional sections
  collapse_walkthrough: true
  changed_files_summary: false
  sequence_diagrams: false

  # Disable status/effort estimates
  review_status: false
  commit_status: false
  estimate_code_review_effort: false

  # Disable auto-suggestions for labels/reviewers
  suggested_labels: false
  suggested_reviewers: false

  # Disable related issues/PRs lookup
  assess_linked_issues: false
  related_issues: false
  related_prs: false

  # Auto-review disabled - only review when explicitly requested via @coderabbitai review
  auto_review:
    enabled: false

chat:
  auto_reply: true


================================================
FILE: .flake8
================================================
[flake8]
max-line-length = 100
extend-ignore = E203,E501,F401,E402,E714
per-file-ignores = __init__.py:F401

================================================
FILE: .github/CODEOWNERS
================================================
megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt

megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal

megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets

megatron/core/tokenizers/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/tokenizers

megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp

megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp

megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing

megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer

megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets

megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism

megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech

megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference

megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training

megatron/post_training/ @NVIDIA/post-training

megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs

megatron/training/ @NVIDIA/training-adlr @NVIDIA/training-nemo
megatron/training/arguments.py

.gitlab/ @NVIDIA/ci
.github/ @NVIDIA/ci
.gitlab-ci.yml @NVIDIA/ci
docker/  @NVIDIA/ci
tests/functional_tests/python_test_utils/ @NVIDIA/ci
tests/functional_tests/shell_test_utils/ @NVIDIA/ci
tests/test_utils/recipes/ @NVIDIA/ci
tests/unit_tests/run_ci_test.sh @NVIDIA/ci

# API Backwards Compatibility Check
scripts/check_api_backwards_compatibility.py @NVIDIA/ci
scripts/README_API_COMPAT.md @NVIDIA/ci
.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci
docs/api-backwards-compatibility-check.md @NVIDIA/ci
tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci

megatron/rl/ @NVIDIA/reinforcement-learning
examples/rl/ @NVIDIA/reinforcement-learning
test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
train_rl.py @NVIDIA/reinforcement-learning


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve the repository or project
title: ""
labels: bug
assignees: ''

---

**Describe the bug**

A clear and concise description of what the bug is. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) 
to get oncall's attention to this issue.

**Steps/Code to reproduce bug**

Please list *minimal* steps or code snippet for us to be able to reproduce the bug.

A helpful guide on how to craft a minimal bug report: http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.


**Expected behavior**

A clear and concise description of what you expected to happen.


**Additional context**

Add any other context about the problem here. 


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false



================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ""
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) 
to get oncall's attention to this issue.

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.


================================================
FILE: .github/ISSUE_TEMPLATE/question.md
================================================
---
name: QUESTION
about: Ask a question about Megatron-LM that is not a bug, regression or enhancement
  request
title: "[QUESTION]"
labels: ''
assignees: ''

---

**Your question**
Ask a clear and concise question about Megatron-LM. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) 
to get oncall's attention to this issue.

================================================
FILE: .github/ISSUE_TEMPLATE/regression.md
================================================
---
name: REGRESSION
about: Report a regression in speed or accuracy due to a Megatron-LM update
title: "[REGRESSION]"
labels: ''
assignees: ''

---

**Describe the regression**
A clear and concise description of what the regression is. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) 
to get oncall's attention to this issue.

**To Reproduce**
Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention.

**Previous performance**
What speed or accuracy did you previously see?

**New performance**
What speed or accuracy do you see after the update?

**Stack trace/logs**
If applicable, add the stack trace or logs related to the regression.

**Environment (please complete the following information):**
 - Previous Megatron-LM commit ID
 - New Megatron-LM commit ID
 - Previous PyTorch version
 - New PyTorch version
 - Previous CUDA version
 - New CUDA version
 - Previous NCCL version
 - New NCCL version

**Proposed fix**
If you have a proposal for how to fix the issue, state it here or link to a PR.

**Additional context**
Add any other context about the problem here.


================================================
FILE: .github/actions/action.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "Test Template"
description: "Template for running NeMo tests in a containerized environment"

inputs:
  container-image:
    description: "Container image to use for test"
    required: true
  timeout:
    description: "Max runtime of test in minutes"
    required: false
    default: "30"
  script:
    description: "Test script to execute"
    required: true
  is-optional:
    description: "Pass this job on failure."
    required: false
    default: "false"
  is_unit_test:
    description: "Upload coverage as unit test"
    required: false
    default: "false"
  tag:
    description: Latest or legacy test suite
    required: true
  test_case:
    description: Test case to launch
    required: true
  model:
    description: Model to launch
    required: false
  PAT:
    description: "GitHub Personal Access Token"
    required: true
  is_ci_workload:
    description: "Is CI workload"
    required: true
  is_merge_group:
    description: "Is merge group"
    required: true
  platform:
    description: "Platform to run tests on (e.g. dgx_h100, dgx_gb200)"
    required: false
    default: "dgx_h100"
runs:
  using: "composite"
  steps:
    - name: Print node name
      shell: bash -x -e -u -o pipefail {0}
      run: echo "node_name=$NODE_NAME" | tee -a "$GITHUB_OUTPUT"

    - name: Checkout repository
      uses: actions/checkout@v6

    - name: Change ownership of /home/runner/
      shell: bash
      run: sudo chown -R $(whoami) /home/runner/

    - name: Setup python
      uses: actions/setup-python@v5
      with:
        python-version: 3.12

    - name: Install uuidgen
      shell: bash -x -e -u -o pipefail {0}
      run: |
        apt-get update
        apt-get install -y uuid-runtime

    - name: Create run-script (unit test)
      shell: bash -x -e -u -o pipefail {0}
      if: inputs.is_unit_test == 'true'
      run: |
        echo "::group::Create run-script"
        cmd=$(cat <<'RUN_TEST_EOF'
        #!/bin/bash

        export PYTHONPATH=$(pwd)
        export NEMORUN_HOME=$(pwd)
        export NCCL_DEBUG=INFO
        pip install --no-cache-dir "uv<0.9.29"
        uv venv .venv
        uv cache clean
        uv sync --no-cache --only-group test
        uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
          --scope unit-tests \
          --model unit-tests \
          --test-case "${{ inputs.test_case }}" \
          --environment dev \
          --platform ${{ inputs.platform }} \
          --tag ${{ inputs.tag }} \
          --container-image ${{ inputs.container-image }} \
          --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME

        RUN_TEST_EOF
        )
        echo "$cmd" | tee "job.sh"
        echo "::endgroup::"

    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main

    - name: Install GH CLI
      shell: bash -x -e -u -o pipefail {0}
      run: |
        apt-get update
        apt-get install -y gh

    - name: Has Run tests label
      shell: bash -x -e -u -o pipefail {0}
      id: has-run-tests-label
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        # PR_NUMBER is empty when the "Get PR info" step was skipped (ref is not a pull-request branch).
        PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
        # Assign the fallback when the lookup fails. `|| echo "false"` would only print
        # the word to the log and leave the variable (and the step output) empty.
        HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || HAS_RUN_TESTS_LABEL="false"
        echo "main=$HAS_RUN_TESTS_LABEL" | tee -a "$GITHUB_OUTPUT"

    - name: Has Run functional tests label
      shell: bash -x -e -u -o pipefail {0}
      id: has-run-functional-tests-label
      env:
        GH_TOKEN: ${{ github.token }}
        IS_CI_WORKLOAD: ${{ inputs.is_ci_workload }}
      run: |
        PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
        HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "$IS_CI_WORKLOAD"
        HAS_RUN_FUNCTIONAL_TESTS_LABEL=${HAS_RUN_FUNCTIONAL_TESTS_LABEL:-$IS_CI_WORKLOAD}
        echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT

    - name: Create run-script (e2e test)
      shell: bash -x -e -u -o pipefail {0}
      if: inputs.is_unit_test == 'false'
      env:
        MODEL: ${{ inputs.model }}
      run: |
        echo "::group::Create run-script"
        cmd=$(cat <<'RUN_TEST_EOF'
        #!/bin/bash
        set -euxo pipefail

        if [ "${{ inputs.is_merge_group }}" == "true" ]; then
          ARGS=(
            --scope mr-github
            --n-repeat 1
          )
        elif [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then
          ARGS=(
            --scope mr-github
            --enable-lightweight-mode
            --n-repeat 1
          )
        elif [ "${{ steps.has-run-functional-tests-label.outputs.main }}" == "true" ]; then
          ARGS=(
            --scope mr-github
            --n-repeat 5
          )
        else
          ARGS=(
            --scope mr-github-slim
            --n-repeat 5
          )
        fi

        export PYTHONPATH=$(pwd)
        export NEMORUN_HOME=$(pwd)
        pip install --no-cache-dir "uv<0.9.29"
        uv venv .venv
        uv cache clean
        uv sync --no-cache --only-group test
        uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
          ${ARGS[@]} \
          --model ${{ inputs.model }} \
          --test-case ${{ inputs.test_case }} \
          --environment dev \
          --platform ${{ inputs.platform }} \
          --container-image ${{ inputs.container-image }} \
          --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
          --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME

        RUN_TEST_EOF
        )
        echo "$cmd" | tee "job.sh"
        echo "::endgroup::"

    - name: Set timeout
      shell: bash -x -e -u -o pipefail {0}
      id: timeout_in_seconds
      run: |
        echo "::group::Set timeout"
        echo "main=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
        echo "::endgroup::"

    - name: Pull container
      shell: bash -x -e -u -o pipefail {0}
      run: |
        echo "::group::Pull container"
        docker pull ${{ inputs.container-image }}
        echo "::endgroup::"

    - name: Run main script
      shell: bash -x -e -u -o pipefail {0}
      id: run-main-script
      run: |
        echo "::group::Run main script"
        # Capture the script's status instead of letting `-e` abort, so that it can be
        # published as a step output before the step itself fails.
        EXIT_CODE=0
        /bin/bash job.sh || EXIT_CODE=$?
        echo "exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
        # Close the log group before exiting; nothing placed after `exit` ever runs.
        echo "::endgroup::"
        exit $EXIT_CODE

    - name: Check result
      id: check
      shell: bash -x -e -u -o pipefail {0}
      if: always()
      env:
        IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }}
      run: |
        echo "::group::Check result"

        # Artifact names may not contain '/' or '*', so both are replaced with '-'.
        logs_report=logs-${{ inputs.test_case }}-${{ github.run_id }}-$(uuidgen)
        echo "logs_report=$logs_report" | sed 's/\//-/g' | sed 's/\*/-/g' | tee -a "$GITHUB_OUTPUT"

        if [[ "$IS_UNIT_TEST" == "true" ]]; then
          coverage_report=coverage-${{ inputs.is_unit_test == 'true' && 'unit-test' || 'e2e' }}-${{ github.run_id }}-$(uuidgen)
        else
          coverage_report=none
        fi
        echo "coverage_report=$coverage_report" | tee -a "$GITHUB_OUTPUT"

        # This step runs with `always()`, so the main script step may have been skipped
        # and its exit_code output may be empty. An empty value must count as a failure;
        # `[[ "" -eq 0 ]]` would otherwise evaluate to true and report success.
        EXIT_CODE="${{ steps.run-main-script.outputs.exit_code }}"
        EXIT_CODE="${EXIT_CODE:-1}"
        IS_SUCCESS=$([[ "$EXIT_CODE" -eq 0 ]] && echo "true" || echo "false")

        if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is-optional }}" == "true" ]]; then
          echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
          IS_SUCCESS=true
        fi

        if [[ "$IS_SUCCESS" == "false" ]]; then
          echo Test did not finish successfully.
          echo "::endgroup::"
          exit 1
        fi

        if [[ "$coverage_report" != "none" ]]; then
          uv run coverage report -i
        fi

        # Close the log group before exiting; nothing placed after `exit` ever runs.
        echo "::endgroup::"
        # Reaching this point means the test passed or was an optional failure, so the
        # step must succeed. Exiting with the raw EXIT_CODE would fail optional tests.
        exit 0

    - name: Upload coverage
      uses: actions/upload-artifact@v4
      if: ${{ always() && steps.check.outputs.coverage_report != 'none' }}
      with:
        name: ${{ steps.check.outputs.coverage_report }}
        path: |
          coverage.xml
          .coverage
        include-hidden-files: true

    - name: Upload logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: ${{ steps.check.outputs.logs_report }}
        path: ${{ inputs.is_unit_test == 'true' && 'assets_dir/logs' || 'assets_dir' }}
        include-hidden-files: true


================================================
FILE: .github/actions/check-nvidia-sso-membership/action.yml
================================================
name: 'Check NVIDIA SSO Membership'
description: 'Check if a GitHub username exists in the NVIDIA SSO users list from github-audits'
author: 'NVIDIA'

inputs:
  username:
    description: 'GitHub username to check'
    required: true
  github_audits_repo:
    description: 'Repository containing SSO users file'
    required: false
    default: 'NVIDIA-GitHub-Management/github-audits'
  github_audits_version:
    description: 'Release version tag'
    required: false
    default: 'v0.1.0'
  sso_users_filename:
    description: 'Filename of SSO users JSON'
    required: false
    default: 'users_sso.json'
  github_token:
    description: 'GitHub token with access to github-audits repo'
    required: true

outputs:
  is_member:
    description: 'Boolean - true if user is in NVIDIA SSO list, false otherwise'
    value: ${{ steps.check-membership.outputs.is_member }}
  is_org_member:
    description: 'Boolean - true if user has NVIDIA or NVIDIA-NeMo in org_roles'
    value: ${{ steps.check-membership.outputs.is_org_member }}
  user_orgs:
    description: 'Comma-separated list of orgs user is member of'
    value: ${{ steps.check-membership.outputs.user_orgs }}
  sso_file_available:
    description: 'Boolean - true if SSO file was successfully downloaded'
    value: ${{ steps.download-sso.outputs.sso_file_available }}
  user_count:
    description: 'Number of users in the SSO file (0 if download failed)'
    value: ${{ steps.download-sso.outputs.user_count }}

runs:
  using: 'composite'
  steps:
    - name: Download NVIDIA SSO users from github-audits
      id: download-sso
      shell: bash
      env:
        GH_TOKEN: ${{ inputs.github_token }}
      run: |
        echo "Downloading ${{ inputs.sso_users_filename }} from ${{ inputs.github_audits_repo }} ${{ inputs.github_audits_version }} release..."

        # Download the release asset using gh CLI
        gh release download ${{ inputs.github_audits_version }} \
          --repo ${{ inputs.github_audits_repo }} \
          --pattern ${{ inputs.sso_users_filename }} \
          --clobber 2>&1 || {
            echo "ERROR: Failed to download ${{ inputs.sso_users_filename }} from github-audits release"
            echo "sso_file_available=false" >> $GITHUB_OUTPUT
            echo "user_count=0" >> $GITHUB_OUTPUT
            exit 0
          }

        # Verify file was downloaded and is valid JSON
        if [ ! -f ${{ inputs.sso_users_filename }} ]; then
          echo "ERROR: ${{ inputs.sso_users_filename }} file not found after download"
          echo "sso_file_available=false" >> $GITHUB_OUTPUT
          echo "user_count=0" >> $GITHUB_OUTPUT
          exit 0
        fi

        # Validate JSON structure
        if ! jq -e 'type == "object"' ${{ inputs.sso_users_filename }} > /dev/null 2>&1; then
          echo "ERROR: ${{ inputs.sso_users_filename }} is not a valid JSON object"
          echo "sso_file_available=false" >> $GITHUB_OUTPUT
          echo "user_count=0" >> $GITHUB_OUTPUT
          exit 0
        fi

        USER_COUNT=$(jq 'length' ${{ inputs.sso_users_filename }})
        echo "Successfully downloaded ${{ inputs.sso_users_filename }} with $USER_COUNT NVIDIA SSO users"
        echo "sso_file_available=true" >> $GITHUB_OUTPUT
        echo "user_count=$USER_COUNT" >> $GITHUB_OUTPUT

    - name: Check if user is in SSO list
      id: check-membership
      shell: bash
      # Inputs are handed to the script through environment variables instead of being
      # interpolated into the script text, so a crafted input value cannot inject shell code.
      env:
        USERNAME: ${{ inputs.username }}
        SSO_FILE: ${{ inputs.sso_users_filename }}
        SSO_FILE_AVAILABLE: ${{ steps.download-sso.outputs.sso_file_available }}
      run: |
        echo "Checking if $USERNAME is in NVIDIA SSO users list..."

        # Check if SSO file is available
        if [ "$SSO_FILE_AVAILABLE" != "true" ] || [ ! -f "$SSO_FILE" ]; then
          echo "ERROR: $SSO_FILE not available - cannot check membership"
          echo "is_member=false" >> "$GITHUB_OUTPUT"
          echo "is_org_member=false" >> "$GITHUB_OUTPUT"
          echo "user_orgs=" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # Check if username exists as a key in the JSON object
        if jq -e --arg user "$USERNAME" 'has($user)' "$SSO_FILE" > /dev/null 2>&1; then
          echo "$USERNAME found in NVIDIA SSO users"
          echo "is_member=true" >> "$GITHUB_OUTPUT"

          # true when any org_roles entry is "NVIDIA:Member" or "NVIDIA-NeMo:Member"
          IS_ORG_MEMBER=$(jq -r --arg user "$USERNAME" '
            .[$user].org_roles // [] |
            map(select(test("^(NVIDIA|NVIDIA-NeMo):Member$"))) |
            length > 0
          ' "$SSO_FILE")

          # Unique org names: the part of each org_roles entry before the ":"
          USER_ORGS=$(jq -r --arg user "$USERNAME" '
            .[$user].org_roles // [] |
            map(split(":")[0]) |
            unique |
            join(",")
          ' "$SSO_FILE")

          echo "is_org_member=$IS_ORG_MEMBER" >> "$GITHUB_OUTPUT"
          echo "user_orgs=$USER_ORGS" >> "$GITHUB_OUTPUT"

          if [ "$IS_ORG_MEMBER" == "true" ]; then
            echo "$USERNAME is a member of NVIDIA or NVIDIA-NeMo org"
          else
            echo "$USERNAME has @nvidia.com email but is not in NVIDIA or NVIDIA-NeMo org (orgs: $USER_ORGS)"
          fi
        else
          echo "$USERNAME NOT found in NVIDIA SSO users"
          echo "is_member=false" >> "$GITHUB_OUTPUT"
          echo "is_org_member=false" >> "$GITHUB_OUTPUT"
          echo "user_orgs=" >> "$GITHUB_OUTPUT"
        fi

branding:
  icon: 'shield'
  color: 'green'


================================================
FILE: .github/copy-pr-bot.yaml
================================================
enabled: true
auto_sync_draft: false
auto_sync_ready: true
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kajalj22", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]


================================================
FILE: .github/oncall_schedule.json
================================================
[
    {
        "user": "dimapihtar",
        "date": "2026-03-18"
    },
    {
        "user": "janEbert",
        "date": "2026-03-25"
    },
    {
        "user": "gautham-kollu",
        "date": "2026-04-01"
    },
    {
        "user": "ilml",
        "date": "2026-04-08"
    },
    {
        "user": "Phlip79",
        "date": "2026-04-15"
    },
    {
        "user": "asolergi-nv",
        "date": "2026-04-22"
    },
    {
        "user": "BoxiangW",
        "date": "2026-04-29"
    },
    {
        "user": "maanug-nv",
        "date": "2026-05-06"
    },
    {
        "user": "dimapihtar",
        "date": "2026-05-13"
    },
    {
        "user": "gautham-kollu",
        "date": "2026-05-20"
    },
    {
        "user": "ilml",
        "date": "2026-05-27"
    },
    {
        "user": "janEbert",
        "date": "2026-06-03"
    }
]


================================================
FILE: .github/pull_request_template.md
================================================
# What does this PR do ?
<!-- Add a one line overview of what this PR aims to accomplish. -->

:warning: For major changes (either in lines of code or in its impact), please make sure to first share a design doc with the team. If you're unsure what's the best way to do so, contact the @mcore-oncall.

## Contribution process

### Pre-checks

- [ ] I have added relevant unit tests
- [ ] I have added relevant functional tests
- [ ] I have added proper typing to my code [Typing guidelines](https://docs.python.org/3/library/typing.html)
- [ ] I have added relevant documentation
- [ ] I have run [autoformat.sh](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/autoformat.sh) on my PR

### Code review

Feel free to message or tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged!

All PRs start as **draft**. If you open a non-draft PR, it will be automatically converted to draft.

#### Step 1: Mark PR as "Ready for Review"

1. When your PR is ready, click **Ready for Review**.
2. An oncall reviewer is auto-assigned and expert reviewers are notified based on your changes.
   - Some PRs may jump straight to step 2. This is determined by `.github/CODEOWNERS`.

:warning: Only mark as ready once merge-conflicts are resolved and the CI is passing.
Final Review might get declined if these requirements are not fulfilled.

#### Step 2: Final Review

For PRs that change `megatron/core`, once all expert reviewers have approved, the `Final Review` label is applied **automatically** and final reviewers are assigned.

For PRs outside `megatron/core`, this step is skipped.

#### Step 3: Approved

Once all required reviewers have approved, the `Approved` label is applied **automatically**.

### Merge

Any member of [mcore-engineers](https://github.com/orgs/NVIDIA/teams/mcore-engineers) will be able to merge your PR.

<details>
<summary>For MRs into `dev` branch</summary>
The proposed review process for `dev` branch is under active discussion.

MRs are mergeable after one approval by either `eharper@nvidia.com` or `zijiey@nvidia.com`.
</details>


================================================
FILE: .github/scripts/oncall_manager.py
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import json
import requests
import argparse
from datetime import datetime, timedelta, timezone

from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

# Constants
GITHUB_API_URL = "https://api.github.com"
SCHEDULE_FILE = ".github/oncall_schedule.json"
ROTATION_TEAM_SLUG = "mcore-oncall-rotation"
ACTIVE_ONCALL_TEAM_SLUG = "mcore-oncall"
SLACK_USERGROUP_HANDLE = "mcore-oncall"
TARGET_WEEKS = 12

# Caches for email and Slack lookups
_email_cache = {}
_slack_id_cache = {}

def get_headers():
    """Build the HTTP headers used for authenticated GitHub API requests.

    The token is taken from the ``GH_TOKEN`` environment variable, falling back
    to ``GITHUB_TOKEN`` when the former is unset or empty. If neither holds a
    value, an error is printed and the process exits with status 1.

    Returns:
        dict: ``Authorization`` and ``Accept`` headers.
    """
    # `or` treats an empty string like a missing variable, so the fallback
    # applies in both cases.
    token = os.environ.get("GH_TOKEN") or os.environ.get("GITHUB_TOKEN")

    if token:
        return {
            "Authorization": f"token {token}",
            "Accept": "application/vnd.github.v3+json",
        }

    print("Error: GH_TOKEN or GITHUB_TOKEN not set")
    sys.exit(1)

def get_repo_info():
    """Returns (owner, repo) from GITHUB_REPOSITORY env var.

    The variable is expected to look like ``owner/repo``. When it is unset,
    empty, or does not contain both parts, an error is printed and the process
    exits with status 1.

    Returns:
        tuple: ``(owner, repo)`` as two strings.
    """
    repo_env = os.environ.get("GITHUB_REPOSITORY")
    if not repo_env:
        print("Error: GITHUB_REPOSITORY environment variable not set")
        sys.exit(1)

    parts = repo_env.split("/")
    # A value without a slash (or with an empty half) would otherwise surface
    # as a bare IndexError or an empty owner/repo further down the line.
    if len(parts) < 2 or not parts[0] or not parts[1]:
        print(f"Error: GITHUB_REPOSITORY must look like 'owner/repo', got '{repo_env}'")
        sys.exit(1)
    return parts[0], parts[1]

def get_team_members(org, team_slug):
    """Fetches members of the GitHub team.

    Pages through ``/orgs/{org}/teams/{team_slug}/members`` 100 entries at a
    time until an empty or short page is returned.

    Args:
        org: GitHub organization name.
        team_slug: Slug of the team within that organization.

    Returns:
        set: Logins of all team members.

    Exits the process with status 1 if any page request does not return
    HTTP 200.
    """
    url = f"{GITHUB_API_URL}/orgs/{org}/teams/{team_slug}/members"
    headers = get_headers()
    per_page = 100

    members = set()
    page = 1
    while True:
        # Without a timeout, requests waits forever on a stalled connection
        # and would hang the whole rotation job.
        resp = requests.get(
            url,
            params={"per_page": per_page, "page": page},
            headers=headers,
            timeout=30,
        )
        if resp.status_code != 200:
            print(f"Error fetching team members: {resp.status_code} {resp.text}")
            sys.exit(1)

        data = resp.json()
        if not data:
            break

        members.update(m['login'] for m in data)
        # A short page means this was the last one; skip the extra request.
        if len(data) < per_page:
            break
        page += 1

    return members

def get_user_email(username):
    """Get user's email from GitHub, prioritizing @nvidia.com emails.

    Checks in order:
    1. Public profile email
    2. Recent commits in the repository

    Every result, including the noreply fallback, is stored in
    ``_email_cache`` so each username is resolved at most once per run.

    Args:
        username: GitHub login to resolve.

    Returns:
        str: An ``@nvidia.com`` address when one is found; otherwise the public
        profile email, or failing that the first non-noreply commit email;
        otherwise ``<username>@users.noreply.github.com``.
    """
    if username in _email_cache:
        return _email_cache[username]

    headers = get_headers()
    public_email = None
    noreply_suffix = "@users.noreply.github.com"
    # Without a timeout, requests waits forever on a stalled connection. A
    # timeout raises inside the try block below and lands in the fallback path.
    request_timeout = 30

    try:
        # 1. Try to get user's public profile email first
        resp = requests.get(
            f"{GITHUB_API_URL}/users/{username}",
            headers=headers,
            timeout=request_timeout,
        )
        if resp.status_code == 200:
            user_data = resp.json()
            email = user_data.get('email')
            if email and not email.endswith(noreply_suffix):
                if email.endswith("@nvidia.com"):
                    _email_cache[username] = email
                    return email
                # Store non-nvidia email as fallback
                public_email = email

        # 2. Check recent commits in the repository for @nvidia.com email
        repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM")
        commits_url = f"{GITHUB_API_URL}/repos/{repo_env}/commits?author={username}&per_page=10"
        resp = requests.get(commits_url, headers=headers, timeout=request_timeout)

        if resp.status_code == 200:
            commits = resp.json()
            for commit in commits:
                # Get email from commit author
                commit_data = commit.get('commit', {})
                author_data = commit_data.get('author', {})
                email = author_data.get('email')

                if email and not email.endswith(noreply_suffix):
                    if email.endswith("@nvidia.com"):
                        _email_cache[username] = email
                        print(f"Found @nvidia.com email for {username} from commits: {email}")
                        return email
                    elif public_email is None:
                        # Only the first non-nvidia address is kept; the
                        # profile email, if any, already takes precedence.
                        public_email = email

        # 3. Use public email if found, otherwise fallback
        if public_email:
            _email_cache[username] = public_email
            print(f"Using public email for {username}: {public_email}")
            return public_email

        # Fallback to noreply email
        fallback = f"{username}{noreply_suffix}"
        _email_cache[username] = fallback
        print(f"Warning: No email found for {username}, using fallback: {fallback}")
        return fallback

    except Exception as e:
        # Best-effort lookup: any network/parsing problem degrades to the
        # noreply address instead of aborting the caller.
        print(f"Warning: Could not get email for {username}: {e}")
        fallback = f"{username}{noreply_suffix}"
        _email_cache[username] = fallback
        return fallback

def get_slack_client():
    """Create a Slack ``WebClient`` from the ``SLACK_TOKEN`` environment variable.

    Returns:
        A ``WebClient`` built with the token, or ``None`` when the variable is
        unset or empty.
    """
    token = os.environ.get("SLACK_TOKEN")
    return WebClient(token=token) if token else None

def get_slack_user_id(slack_client, email):
    """Resolve an email address to a Slack user ID.

    Lookups, including failed ones, are remembered in ``_slack_id_cache`` so
    the Slack API is queried at most once per address.

    Args:
        slack_client: Slack client exposing ``users_lookupByEmail``; a falsy
            value short-circuits to ``None`` without touching the cache.
        email: Address to look up.

    Returns:
        The Slack user ID, or ``None`` when there is no client or the lookup
        raised ``SlackApiError``.
    """
    if not slack_client:
        return None

    # A cached None is a valid hit (earlier failed lookup), so test for key
    # presence rather than truthiness.
    try:
        return _slack_id_cache[email]
    except KeyError:
        pass

    try:
        lookup = slack_client.users_lookupByEmail(email=email)
        slack_id = lookup["user"]["id"]
    except SlackApiError as e:
        print(f"Warning: Could not find Slack user for {email}: {e.response['error']}")
        slack_id = None

    _slack_id_cache[email] = slack_id
    return slack_id

def get_slack_usergroup_id(slack_client, handle):
    """Look up a Slack usergroup by handle.

    Args:
        slack_client: Slack client exposing ``usergroups_list``; may be falsy.
        handle: Usergroup handle to search for.

    Returns:
        tuple: ``(usergroup_id, user_ids)`` for the first usergroup whose
        handle matches. ``(None, [])`` when there is no client, the handle is
        not found, or the Slack API call raises ``SlackApiError``.
    """
    if not slack_client:
        # Must be a 2-tuple like every other path: callers unpack the result.
        return None, []

    try:
        response = slack_client.usergroups_list(include_users=True)
        for usergroup in response.get("usergroups", []):
            if usergroup.get("handle") == handle:
                return usergroup.get("id"), usergroup.get("users", [])
        print(f"Warning: Slack usergroup '{handle}' not found")
        return None, []
    except SlackApiError as e:
        print(f"Warning: Could not list Slack usergroups: {e.response['error']}")
        return None, []

def update_slack_usergroup(new_oncall_username, old_members_usernames):
    """
    Make the new oncall the sole member of the Slack usergroup.

    The update happens in two calls: the new oncall is first added alongside
    the current members, then the membership is replaced with just the new
    oncall, so the usergroup is never left empty in between.

    Every failure mode (no Slack token, unknown Slack user, unknown usergroup,
    Slack API error) is reported with a printed message and otherwise ignored.

    Args:
        new_oncall_username: GitHub login of the incoming oncall.
        old_members_usernames: Accepted but not used by this function.
    """
    client = get_slack_client()
    if not client:
        print("Slack token not configured, skipping Slack usergroup update")
        return

    # GitHub login -> email -> Slack user ID
    oncall_email = get_user_email(new_oncall_username)
    oncall_slack_id = get_slack_user_id(client, oncall_email)
    if not oncall_slack_id:
        print(f"Could not find Slack user ID for {new_oncall_username} ({oncall_email}), skipping Slack update")
        return

    group_id, existing_ids = get_slack_usergroup_id(client, SLACK_USERGROUP_HANDLE)
    if not group_id:
        print(f"Could not find Slack usergroup '{SLACK_USERGROUP_HANDLE}', skipping Slack update")
        return

    try:
        # Phase 1: widen the membership to include the new oncall (no-op if
        # they are already a member).
        already_member = oncall_slack_id in existing_ids
        if not already_member:
            widened = list(set(existing_ids + [oncall_slack_id]))
            client.usergroups_users_update(usergroup=group_id, users=widened)
            print(f"Added {new_oncall_username} to Slack usergroup '{SLACK_USERGROUP_HANDLE}'")

        # Phase 2: shrink the membership down to the new oncall only.
        client.usergroups_users_update(usergroup=group_id, users=[oncall_slack_id])
        print(f"Updated Slack usergroup '{SLACK_USERGROUP_HANDLE}' to contain only {new_oncall_username}")
    except SlackApiError as e:
        print(f"Failed to update Slack usergroup: {e.response['error']}")

def load_schedule():
    """Read the oncall schedule from SCHEDULE_FILE.

    Returns:
        A list of schedule entries. Items stored as plain strings are expanded
        to ``{"user": <item>, "date": "YYYY-MM-DD"}`` (a placeholder date);
        every other item is passed through unchanged. An empty list is
        returned when the file does not exist or does not hold valid JSON.
    """
    if not os.path.exists(SCHEDULE_FILE):
        return []
    try:
        with open(SCHEDULE_FILE, 'r') as handle:
            raw_entries = json.load(handle)
            # Bare usernames are promoted to dict entries with a placeholder date.
            return [
                {"user": entry, "date": "YYYY-MM-DD"} if isinstance(entry, str) else entry
                for entry in raw_entries
            ]
    except (json.JSONDecodeError, FileNotFoundError):
        return []

def save_schedule(schedule):
    """Overwrite SCHEDULE_FILE with ``schedule`` as 4-space-indented JSON plus a trailing newline."""
    with open(SCHEDULE_FILE, 'w') as out:
        out.write(json.dumps(schedule, indent=4))
        out.write('\n')  # keep the file newline-terminated

def update_active_oncall_team(org, new_oncall):
    """Make ``new_oncall`` the only non-service member of the active oncall team.

    Steps:
      1. Read the current members of ACTIVE_ONCALL_TEAM_SLUG.
      2. Add ``new_oncall`` if not already a member.
      3. Remove every other member except the 'svcnvidia-nemo-ci' service
         account. This step is skipped when step 2 failed, so a failed add
         can never leave the team without an oncall.
      4. Update the Slack usergroup via ``update_slack_usergroup``.

    Args:
        org: GitHub organization that owns the team.
        new_oncall: GitHub login of the incoming oncall.

    Returns:
        None. HTTP failures are reported with ``print`` and do not raise.
    """
    service_account = 'svcnvidia-nemo-ci'

    # 1. Get current members of the active team
    current_members = get_team_members(org, ACTIVE_ONCALL_TEAM_SLUG)

    # 2. Add the new oncall if not present
    new_oncall_in_team = True
    if new_oncall not in current_members:
        url = f"{GITHUB_API_URL}/orgs/{org}/teams/{ACTIVE_ONCALL_TEAM_SLUG}/memberships/{new_oncall}"
        resp = requests.put(url, headers=get_headers())
        if resp.status_code == 200:
            print(f"Added {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}")
        else:
            new_oncall_in_team = False
            print(f"Failed to add {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}")

    # 3. Remove everyone else, but only once the new oncall is known to be in
    #    the team; otherwise the previous oncall is kept as a safety net.
    old_members = [
        member for member in current_members
        if member not in (new_oncall, service_account)
    ]
    if new_oncall_in_team:
        for member in old_members:
            url = f"{GITHUB_API_URL}/orgs/{org}/teams/{ACTIVE_ONCALL_TEAM_SLUG}/memberships/{member}"
            resp = requests.delete(url, headers=get_headers())
            if resp.status_code == 204:
                print(f"Removed {member} from {ACTIVE_ONCALL_TEAM_SLUG}")
            else:
                print(f"Failed to remove {member} from {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}")
    elif old_members:
        print(
            f"Keeping existing members of {ACTIVE_ONCALL_TEAM_SLUG} "
            f"because {new_oncall} could not be added"
        )

    # 4. Update Slack usergroup (add new oncall first, then remove old members)
    update_slack_usergroup(new_oncall, old_members)

def rotate_schedule(repo_owner, dry_run=False):
    """Drop the finished shift, refill the schedule and activate the new oncall.

    The first entry is removed when its shift (start date + 7 days) has ended
    relative to today's UTC date, or when its date cannot be parsed. The
    schedule is then topped up with ``ensure_schedule_filled`` and the first
    remaining entry becomes the active oncall.

    Args:
        repo_owner: GitHub organization used for team lookups and updates.
        dry_run: When True, neither the GitHub team nor the schedule file is
            modified; the resulting schedule is printed instead.
    """
    schedule = load_schedule()
    print(f"Current schedule length: {len(schedule)}")

    # 1. Rotate: remove the head entry once its week is over.
    if not schedule:
        print("Schedule empty, nothing to rotate.")
    else:
        head = schedule[0]
        try:
            # The stored date is treated as the start of the shift.
            shift_start = datetime.strptime(head['date'], "%Y-%m-%d").date()
        except ValueError:
            # Unparseable (e.g. placeholder) date: rotate the entry out anyway.
            dropped = schedule.pop(0)
            print(f"Rotated out (invalid date): {dropped}")
        else:
            shift_end = shift_start + timedelta(days=7)
            utc_today = datetime.now(timezone.utc).date()
            # The shift counts as over on the day it ends.
            if utc_today >= shift_end:
                dropped = schedule.pop(0)
                print(f"Rotated out: {dropped} (Ended {shift_end})")
            else:
                print(f"First entry {head} has not ended yet (Ends {shift_end}). Not removing.")

    # 2. Replenish up to the target length.
    ensure_schedule_filled(schedule, repo_owner)

    # 3. Activate whoever is now first in line.
    if schedule:
        current_oncall = schedule[0]['user']
        print(f"New active oncall: {current_oncall}")
        if dry_run:
            print(f"Dry run: Would update {ACTIVE_ONCALL_TEAM_SLUG} to contain only {current_oncall}")
        else:
            update_active_oncall_team(repo_owner, current_oncall)

    # 4. Persist, or just show the result on a dry run.
    if dry_run:
        print("Dry run: Schedule not saved.")
        print(json.dumps(schedule, indent=4))
        return
    save_schedule(schedule)
    print("Schedule updated and saved.")

def get_last_wednesday():
    """Return the most recent Wednesday on or before today's UTC date."""
    wednesday = 2  # date.weekday() numbering: Monday is 0
    utc_today = datetime.now(timezone.utc).date()
    days_since_wednesday = (utc_today.weekday() - wednesday) % 7
    return utc_today - timedelta(days=days_since_wednesday)

def ensure_schedule_filled(schedule, repo_owner):
    """Append entries to ``schedule`` in place until it holds TARGET_WEEKS entries.

    Candidates are the members of ROTATION_TEAM_SLUG, minus the
    'svcnvidia-nemo-ci' service account, in alphabetical order. Each new
    entry takes the member that follows the previously scheduled user
    (wrapping around) and a date seven days after the previous entry.

    Args:
        schedule: List of ``{"user": ..., "date": "YYYY-MM-DD"}`` dicts;
            mutated in place.
        repo_owner: GitHub organization that owns the rotation team.

    Returns:
        None. If no eligible member exists, a warning is printed and the
        schedule is left untouched.
    """
    service_account = 'svcnvidia-nemo-ci'
    fetched = get_team_members(repo_owner, ROTATION_TEAM_SLUG)

    # Filter the service account BEFORE the emptiness check: a team that only
    # contains the service account has nobody to schedule.
    members = sorted(m for m in (fetched or []) if m != service_account)  # deterministic order
    if not members:
        print(f"Warning: No team members found in {ROTATION_TEAM_SLUG}.")
        return

    while len(schedule) < TARGET_WEEKS:
        if not schedule:
            # Empty schedule: start at the most recent Wednesday with the
            # alphabetically first member.
            next_date = get_last_wednesday()
            next_user = members[0]
        else:
            last_entry = schedule[-1]

            # Next shift starts a week after the last scheduled one.
            try:
                last_date = datetime.strptime(last_entry['date'], "%Y-%m-%d").date()
                next_date = last_date + timedelta(days=7)
            except (KeyError, TypeError, ValueError):
                # Missing, non-string or placeholder date: derive the date
                # from the entry's position in the schedule instead.
                next_date = get_last_wednesday() + timedelta(days=7 * len(schedule))

            # Continue the rotation after the last scheduled user; restart
            # from the first member if that user is no longer in the team.
            last_user = last_entry.get('user')
            if last_user in members:
                next_user = members[(members.index(last_user) + 1) % len(members)]
            else:
                next_user = members[0]

        new_entry = {"user": next_user, "date": next_date.strftime("%Y-%m-%d")}
        schedule.append(new_entry)
        print(f"Appended: {new_entry}")

def assign_reviewer(pr_number):
    """Request a review on a pull request from the active oncall team.

    Args:
        pr_number: Number of the pull request in the repository returned by
            ``get_repo_info``.

    Exits the process with status 1 when GitHub answers with anything other
    than 200 or 201.
    """
    owner, repo = get_repo_info()
    endpoint = f"{GITHUB_API_URL}/repos/{owner}/{repo}/pulls/{pr_number}/requested_reviewers"

    # The whole team is requested rather than an individual user.
    payload = {"team_reviewers": [ACTIVE_ONCALL_TEAM_SLUG]}
    resp = requests.post(endpoint, headers=get_headers(), json=payload)

    if resp.status_code not in (200, 201):
        print(f"Failed to request review: {resp.status_code} {resp.text}")
        sys.exit(1)
    print(f"Successfully requested review from team NVIDIA/{ACTIVE_ONCALL_TEAM_SLUG}")

def main():
    """Command-line entry point.

    Sub-commands:
        rotate [--dry-run]  rotate the schedule and update the active oncall
        fill                top up the schedule without rotating, then save it
        assign --pr N       request a review from the oncall team on PR N
    """
    parser = argparse.ArgumentParser(description="Manage Oncall Schedule")
    commands = parser.add_subparsers(dest="command", required=True)

    # rotate
    rotate_cmd = commands.add_parser("rotate", help="Rotate the schedule (remove first, append new)")
    rotate_cmd.add_argument("--dry-run", action="store_true", help="Do not save changes")

    # fill (top up without rotating - useful for init); takes no options
    commands.add_parser("fill", help="Fill the schedule to 12 weeks without rotating")

    # assign
    assign_cmd = commands.add_parser("assign", help="Assign current oncall to PR")
    assign_cmd.add_argument("--pr", type=int, required=True, help="PR number")

    args = parser.parse_args()

    # Only the owner half of the repository info is needed here.
    owner, _ = get_repo_info()

    command = args.command
    if command == "rotate":
        rotate_schedule(owner, dry_run=args.dry_run)
        return
    if command == "fill":
        schedule = load_schedule()
        ensure_schedule_filled(schedule, owner)
        save_schedule(schedule)
        print("Schedule filled and saved.")
        return
    if command == "assign":
        assign_reviewer(args.pr)

if __name__ == "__main__":
    main()



================================================
FILE: .github/scripts/readme.sh
================================================
#!/bin/bash
# Prints a static help banner describing how MBridge tests are triggered on
# a pull request. The heredoc delimiter is quoted ('EOF'), so the text below
# is emitted literally: no variable, command or backslash expansion happens.

cat << 'EOF'
╔══════════════════════════════════════════════════════════════════════╗
║                                                                      ║
║    ███╗   ███╗██████╗ ██████╗ ██╗██████╗  ██████╗ ███████╗         ║
║    ████╗ ████║██╔══██╗██╔══██╗██║██╔══██╗██╔════╝ ██╔════╝         ║
║    ██╔████╔██║██████╔╝██████╔╝██║██║  ██║██║  ███╗█████╗           ║
║    ██║╚██╔╝██║██╔══██╗██╔══██╗██║██║  ██║██║   ██║██╔══╝           ║
║    ██║ ╚═╝ ██║██████╔╝██║  ██║██║██████╔╝╚██████╔╝███████╗         ║
║    ╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝╚═╝╚═════╝  ╚═════╝ ╚══════╝         ║
║                                                                      ║
║              H O W   T O :   M B R I D G E   T E S T I N G         ║
╚══════════════════════════════════════════════════════════════════════╝

  MBridge unit tests run automatically on every PR. To also trigger
  functional tests, attach the label and re-run the workflow step.

  ┌─────────────────────────────────────────────────────────────────┐
  │  DEFAULT  │  Unit tests run on every PR (no action needed)      │
  ├─────────────────────────────────────────────────────────────────┤
  │                                                                  │
  │    Every PR  ──►  cicd-mbridge-testing  ──►  unit tests only   │
  │                                                                  │
  └─────────────────────────────────────────────────────────────────┘

  ┌─────────────────────────────────────────────────────────────────┐
  │  STEP 1  │  Attach the label to your PR (for functional tests)  │
  ├─────────────────────────────────────────────────────────────────┤
  │                                                                  │
  │    PR Labels  ──►  [ + Add label ]  ──►  "Run MBridge tests"   │
  │                                                                  │
  └─────────────────────────────────────────────────────────────────┘

  ┌─────────────────────────────────────────────────────────────────┐
  │  STEP 2  │  Re-run this workflow step                           │
  ├─────────────────────────────────────────────────────────────────┤
  │                                                                  │
  │    Actions  ──►  [ Re-run jobs ]  ──►  Re-run failed jobs      │
  │                                                                  │
  └─────────────────────────────────────────────────────────────────┘

  ┌─────────────────────────────────────────────────────────────────┐
  │  RESULT  │  Unit + functional tests run!                        │
  ├─────────────────────────────────────────────────────────────────┤
  │                                                                  │
  │         cicd-mbridge-testing  ◄── unit + functional tests      │
  │                                                                  │
  │         Tests run against MBridge using the merge commit       │
  │         SHA of your pull request.                              │
  │                                                                  │
  └─────────────────────────────────────────────────────────────────┘

                ┌────────────────────────────────────┐
                │  Label present?     NO   → unit    │
                │  Label present?     YES  → unit +  │
                │                           functional│
                └────────────────────────────────────┘

  NOTE: The label must be present BEFORE the re-run is triggered.
        The CI checks for "Run MBridge tests" at runtime.

  NOTE: All MBridge test results are optional — failures do not
        block merging your PR.
EOF


================================================
FILE: .github/scripts/sync_team_usergroups.py
================================================
# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Syncs GitHub team membership to Slack user groups.

This script reads members from GitHub teams and updates the corresponding
Slack user groups to match.
"""

import os
import sys
import argparse
import requests

from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

# Constants
GITHUB_API_URL = "https://api.github.com"

# Teams whose *children* are each synced to their own Slack usergroup
PARENT_TEAM_SLUGS = ["mcore-reviewers"]

# Teams synced directly (the team itself, not its children)
DIRECT_TEAM_SLUGS = ["mcore-engineers"]

# Caches for email and Slack lookups
_email_cache = {}
_slack_id_cache = {}
_usergroups_cache = None


def get_headers():
    """Build the authenticated header set for GitHub API requests.

    The token is taken from GH_TOKEN, falling back to GITHUB_TOKEN when
    GH_TOKEN is unset or empty. When no token is available an error is
    printed and the process exits with status 1.

    Returns:
        Dict with the ``Authorization`` and ``Accept`` headers.
    """
    token = os.environ.get("GH_TOKEN") or os.environ.get("GITHUB_TOKEN")

    if token:
        return {
            "Authorization": f"token {token}",
            "Accept": "application/vnd.github.v3+json",
        }

    print("Error: GH_TOKEN or GITHUB_TOKEN not set")
    sys.exit(1)


def get_org():
    """Return the owner part of GITHUB_REPOSITORY (the text before the first "/").

    "NVIDIA/Megatron-LM" is used when the variable is not set.
    """
    repository = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM")
    owner, _, _ = repository.partition("/")
    return owner


def github_team_to_slack_usergroup(team_slug):
    """Derive the Slack usergroup handle for a GitHub team slug.

    Transformations, applied in this order:
    1. Abbreviate long names anywhere in the slug:
       "mixture-of-experts" -> "moe", "pipeline-parallelism" -> "pp",
       "reinforcement-learning" -> "rl".
    2. Strip at most one leading prefix, the first that matches of
       "core-", "megatron-", "mcore-".
    3. Collapse every "-and-" into "-".
    4. Prepend "mcore-".

    Examples:
        "test"            -> "mcore-test"
        "core-test"       -> "mcore-test"
        "megatron-test"   -> "mcore-test"
        "test1-and-test2" -> "mcore-test1-test2"
    """
    abbreviations = (
        ("mixture-of-experts", "moe"),
        ("pipeline-parallelism", "pp"),
        ("reinforcement-learning", "rl"),
    )
    strippable_prefixes = ("core-", "megatron-", "mcore-")

    handle = team_slug

    # Abbreviate before looking at prefixes.
    for long_form, short_form in abbreviations:
        handle = handle.replace(long_form, short_form)

    # Only the first matching prefix is removed.
    for prefix in strippable_prefixes:
        if handle.startswith(prefix):
            handle = handle[len(prefix):]
            break

    handle = handle.replace("-and-", "-")

    return "mcore-" + handle


def get_child_teams(org, parent_team_slug):
    """Return the slugs of the child teams of a GitHub team.

    The parent team is fetched first and must answer 200 with a non-empty
    ``id``; the child teams are then read from the ``.../teams`` endpoint,
    100 per page.

    Args:
        org: GitHub organization name.
        parent_team_slug: Slug of the parent team.

    Returns:
        List of child team slugs. An empty list if the parent lookup fails;
        the slugs collected so far if a later page request fails.
    """
    parent_url = f"{GITHUB_API_URL}/orgs/{org}/teams/{parent_team_slug}"
    headers = get_headers()

    # Validate the parent team before listing its children.
    parent_resp = requests.get(parent_url, headers=headers)
    if parent_resp.status_code != 200:
        print(f"Error fetching parent team '{parent_team_slug}': {parent_resp.status_code} {parent_resp.text}")
        return []
    if not parent_resp.json().get("id"):
        print(f"Error: Could not get ID for team '{parent_team_slug}'")
        return []

    children_url = f"{parent_url}/teams"
    page_size = 100
    slugs = []
    page = 1

    while True:
        resp = requests.get(f"{children_url}?per_page={page_size}&page={page}", headers=headers)
        if resp.status_code != 200:
            print(f"Error fetching child teams: {resp.status_code} {resp.text}")
            return slugs

        batch = resp.json()
        if batch:
            slugs.extend(team["slug"] for team in batch)
        # An empty or short page means there is nothing further to fetch.
        if not batch or len(batch) < page_size:
            return slugs
        page += 1


def get_team_members(org, team_slug):
    """Return the logins of all members of a GitHub team.

    Members are read 100 per page until an empty or short page is seen.

    Args:
        org: GitHub organization name.
        team_slug: Slug of the team.

    Returns:
        Set of member logins. An empty set when the team is not found (404)
        or any page request fails; logins from earlier pages are discarded
        in that case.
    """
    members_url = f"{GITHUB_API_URL}/orgs/{org}/teams/{team_slug}/members"
    headers = get_headers()
    page_size = 100

    logins = set()
    page = 1
    while True:
        resp = requests.get(f"{members_url}?per_page={page_size}&page={page}", headers=headers)
        status = resp.status_code
        if status == 404:
            print(f"Warning: Team '{team_slug}' not found in org '{org}'")
            return set()
        if status != 200:
            print(f"Error fetching team members: {resp.status_code} {resp.text}")
            return set()

        batch = resp.json()
        if batch:
            logins.update(entry["login"] for entry in batch)
        # An empty or short page is the last one.
        if not batch or len(batch) < page_size:
            return logins
        page += 1


def get_user_email(username):
    """Get a user's email from GitHub, prioritizing @nvidia.com addresses.

    Lookup order:
    1. The email on the user's public GitHub profile.
    2. Author emails of the user's 10 most recent commits in the repository
       named by GITHUB_REPOSITORY (default "NVIDIA/Megatron-LM").

    An @nvidia.com address from either source wins immediately. Otherwise the
    first non-noreply address seen is used, and as a last resort
    ``<username>@users.noreply.github.com`` is returned.

    The lookup is best-effort: if a request or response raises, a warning is
    printed and whatever address was already found is still used instead of
    being discarded.

    Args:
        username: GitHub login.

    Returns:
        The chosen email address (never None). The result, including the
        noreply fallback, is stored in ``_email_cache``.
    """
    if username in _email_cache:
        return _email_cache[username]

    headers = get_headers()
    noreply_suffix = "@users.noreply.github.com"
    preferred_suffix = "@nvidia.com"
    public_email = None

    try:
        # 1. Try to get user's public profile email first
        resp = requests.get(f"{GITHUB_API_URL}/users/{username}", headers=headers)
        if resp.status_code == 200:
            email = resp.json().get('email')
            if email and not email.endswith(noreply_suffix):
                if email.endswith(preferred_suffix):
                    _email_cache[username] = email
                    return email
                # Store non-nvidia email as fallback
                public_email = email

        # 2. Check recent commits in the repository for @nvidia.com email
        repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM")
        commits_url = f"{GITHUB_API_URL}/repos/{repo_env}/commits?author={username}&per_page=10"
        resp = requests.get(commits_url, headers=headers)

        if resp.status_code == 200:
            for commit in resp.json():
                # "commit" / "author" can be present but null, so guard with
                # `or {}` rather than relying on .get() defaults.
                author_data = (commit.get('commit') or {}).get('author') or {}
                email = author_data.get('email')

                if email and not email.endswith(noreply_suffix):
                    if email.endswith(preferred_suffix):
                        _email_cache[username] = email
                        print(f"Found @nvidia.com email for {username} from commits")
                        return email
                    if public_email is None:
                        public_email = email

    except Exception as e:
        # Deliberately broad: a failed lookup must not abort the caller.
        # Fall through so an address found before the failure is still used.
        print(f"Warning: Could not get email for {username}: {e}")

    # 3. Use the best non-nvidia address if one was found
    if public_email:
        _email_cache[username] = public_email
        print(f"Using public email for {username}: {public_email}")
        return public_email

    # 4. Fallback to noreply email
    fallback = f"{username}{noreply_suffix}"
    _email_cache[username] = fallback
    print(f"Warning: No email found for {username}, using fallback: {fallback}")
    return fallback


def get_slack_client():
    """Return a Slack ``WebClient`` built from SLACK_TOKEN, or None when the variable is unset or empty."""
    token = os.environ.get("SLACK_TOKEN")
    return WebClient(token=token) if token else None


def get_slack_user_id(slack_client, email):
    """Resolve an email address to a Slack user ID.

    Results are memoized in ``_slack_id_cache``, including failed lookups
    (stored as None), so each address is queried at most once.

    Args:
        slack_client: Slack WebClient, or a falsy value when Slack is not
            configured.
        email: Address to look up.

    Returns:
        The Slack user ID, or None when there is no client or the lookup
        raises ``SlackApiError``.
    """
    if not slack_client:
        return None

    if email not in _slack_id_cache:
        try:
            lookup = slack_client.users_lookupByEmail(email=email)
            _slack_id_cache[email] = lookup["user"]["id"]
        except SlackApiError as e:
            print(f"Warning: Could not find Slack user for {email}: {e.response['error']}")
            # Remember the miss so the same address is not queried again.
            _slack_id_cache[email] = None

    return _slack_id_cache[email]


def fetch_all_usergroups(slack_client):
    """Return all Slack usergroups keyed by handle, listing them at most once.

    The result is stored in the module-level ``_usergroups_cache`` and
    reused on later calls. An empty mapping is cached when there is no
    client or the listing raises ``SlackApiError``.

    Returns:
        Dict mapping handle -> ``{"id": <usergroup id>, "users": [<user ids>]}``.
        Usergroups without a handle are left out.
    """
    global _usergroups_cache

    if _usergroups_cache is not None:
        return _usergroups_cache

    if not slack_client:
        _usergroups_cache = {}
        return _usergroups_cache

    try:
        print("Fetching Slack usergroups...")
        listing = slack_client.usergroups_list(include_users=True)
        by_handle = _usergroups_cache = {}
        for group in listing.get("usergroups", []):
            handle = group.get("handle")
            if not handle:
                continue
            by_handle[handle] = {
                "id": group.get("id"),
                "users": group.get("users", []),
            }
        print(f"Fetched {len(_usergroups_cache)} usergroups")
    except SlackApiError as e:
        print(f"Warning: Could not list Slack usergroups: {e.response['error']}")
        _usergroups_cache = {}
    return _usergroups_cache


def get_slack_usergroup_id(slack_client, handle):
    """Look up a Slack usergroup by handle.

    Returns:
        ``(usergroup_id, user_ids)`` for a known handle, ``(None, [])``
        otherwise.
    """
    known_groups = fetch_all_usergroups(slack_client)

    if handle not in known_groups:
        return None, []

    entry = known_groups[handle]
    return entry["id"], entry["users"]


def github_team_to_usergroup_name(team_slug):
    """Build the Slack usergroup display name for a GitHub team slug.

    The slug is split on hyphens, each piece is capitalized, and the pieces
    are joined with spaces behind a fixed prefix.

    Example: "test3" -> "Megatron Core Experts: Test3"
    """
    pretty = " ".join(map(str.capitalize, team_slug.split("-")))
    return "Megatron Core Experts: " + pretty


def create_slack_usergroup(slack_client, handle, team_slug):
    """Create a Slack usergroup for a GitHub team.

    On success the new (still empty) usergroup is also added to
    ``_usergroups_cache`` when that cache has already been populated.

    Args:
        slack_client: Slack WebClient instance
        handle: The usergroup handle (e.g., "mcore-test")
        team_slug: The GitHub team slug (used for name and description)

    Returns:
        The new usergroup's ID, or None when Slack returns no ID or raises
        ``SlackApiError``.
    """
    global _usergroups_cache

    display_name = github_team_to_usergroup_name(team_slug)
    description = f'Expert review group "{team_slug}"'

    try:
        print(f"Creating Slack usergroup '@{handle}' with name '{display_name}'...")
        response = slack_client.usergroups_create(
            name=display_name,
            handle=handle,
            description=description,
        )
        new_id = response.get("usergroup", {}).get("id")

        if not new_id:
            print("Error: Usergroup created but no ID returned")
            return None

        # Keep the in-memory listing in step with Slack.
        if _usergroups_cache is not None:
            _usergroups_cache[handle] = {"id": new_id, "users": []}
        print(f"Successfully created Slack usergroup '@{handle}'")
        return new_id

    except SlackApiError as e:
        print(f"Error creating Slack usergroup '@{handle}': {e.response['error']}")
        return None


def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False):
    """Make a Slack usergroup's membership match a GitHub team.

    Members whose login starts with "svc" are ignored. Each remaining login
    is resolved to an email and then to a Slack user ID; logins that cannot
    be resolved are reported and left out. A missing usergroup is created
    (or, on a dry run, only announced).

    Args:
        team_slug: GitHub team to read members from.
        usergroup_handle: Handle of the Slack usergroup to update.
        dry_run: When True, report what would change without creating or
            updating anything in Slack.

    Returns:
        True when the usergroup is in sync, was updated, or would be updated
        (dry run); False on any failure.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Syncing GitHub team '{team_slug}' -> Slack usergroup '@{usergroup_handle}'")
    print(banner)

    org = get_org()
    slack_client = get_slack_client()
    if not slack_client:
        print("Error: Slack token not configured")
        return False

    # 1. GitHub side: team members, minus service accounts.
    team_logins = get_team_members(org, team_slug)
    if not team_logins:
        print(f"No members found in GitHub team '{team_slug}'")
        return False

    human_logins = {login for login in team_logins if not login.startswith("svc")}
    print(f"GitHub team members ({len(human_logins)}): {sorted(human_logins)}")

    # 2. Map every login to a Slack user ID, remembering the ones that fail.
    slack_user_ids = []
    unresolved = []
    for login in sorted(human_logins):
        email = get_user_email(login)
        slack_id = get_slack_user_id(slack_client, email)
        if not slack_id:
            unresolved.append((login, email, "not found in Slack"))
            continue
        slack_user_ids.append(slack_id)

    if unresolved:
        print(f"\nWarning: Could not resolve {len(unresolved)} users:")
        for login, email, reason in unresolved:
            tried = f" (tried {email})" if email else ""
            print(f"  - {login}: {reason}" + tried)

    if not slack_user_ids:
        print(f"Error: No Slack users found for team '{team_slug}'")
        return False

    # 3. Slack side: current membership, creating the usergroup if needed.
    usergroup_id, current_members = get_slack_usergroup_id(slack_client, usergroup_handle)
    if not usergroup_id:
        print(f"Slack usergroup '@{usergroup_handle}' not found, creating it...")
        if dry_run:
            print(f"Dry run: Would create usergroup '@{usergroup_handle}'")
        else:
            usergroup_id = create_slack_usergroup(slack_client, usergroup_handle, team_slug)
            if not usergroup_id:
                print(f"Error: Failed to create Slack usergroup '@{usergroup_handle}'")
                return False
        # A usergroup that does not exist yet has no members.
        current_members = []

    # 4. Diff current vs. desired membership.
    current_set = set(current_members)
    desired_set = set(slack_user_ids)

    print(f"\nCurrent usergroup members: {len(current_members)}")
    print(f"New members to set: {len(slack_user_ids)}")
    print(f"  Adding: {len(desired_set - current_set)} users")
    print(f"  Removing: {len(current_set - desired_set)} users")

    if current_set == desired_set:
        print("No changes needed - usergroup is already in sync")
        return True

    if dry_run:
        print(f"\nDry run: Would update '@{usergroup_handle}' with {len(slack_user_ids)} members")
        return True

    # 5. Replace the usergroup membership with the desired list.
    try:
        slack_client.usergroups_users_update(usergroup=usergroup_id, users=slack_user_ids)
    except SlackApiError as e:
        print(f"Error updating usergroup: {e.response['error']}")
        return False
    print(f"\nSuccessfully updated '@{usergroup_handle}' with {len(slack_user_ids)} members")
    return True


def get_team_to_usergroup_mapping(parent_team_slug):
    """Map each child team of ``parent_team_slug`` to its Slack usergroup handle.

    Returns:
        Dict of child team slug -> usergroup handle; empty (after printing an
        error) when the parent has no child teams or they could not be read.
    """
    child_slugs = get_child_teams(get_org(), parent_team_slug)

    if child_slugs:
        return {slug: github_team_to_slack_usergroup(slug) for slug in child_slugs}

    print(f"Error: No child teams found under '{parent_team_slug}'")
    return {}


def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None):
    """Sync every configured GitHub team to its Slack usergroup.

    Args:
        dry_run: Passed through to ``sync_team_to_usergroup``.
        parent_teams: Team slugs whose *children* are each synced.
                      Defaults to PARENT_TEAM_SLUGS.
        direct_teams: Team slugs synced directly (not their children).
                      Defaults to DIRECT_TEAM_SLUGS.

    Returns:
        True when at least one team was found and every sync succeeded;
        False when there is nothing to sync or any team failed.
    """
    parent_teams = PARENT_TEAM_SLUGS if parent_teams is None else parent_teams
    direct_teams = DIRECT_TEAM_SLUGS if direct_teams is None else direct_teams

    # Build the team -> handle table: children of parent teams first, then
    # the directly synced teams (which win on a slug collision).
    team_to_usergroup = {}
    for parent_slug in parent_teams:
        print(f"Fetching child teams of '{parent_slug}'...")
        team_to_usergroup.update(get_team_to_usergroup_mapping(parent_slug))
    for slug in direct_teams:
        team_to_usergroup[slug] = github_team_to_slack_usergroup(slug)

    if not team_to_usergroup:
        return False

    print(f"Found {len(team_to_usergroup)} teams to sync")
    print("\nTeam to usergroup mapping:")
    for team, usergroup in sorted(team_to_usergroup.items()):
        print(f"  {team} -> @{usergroup}")

    # Sync each team and sort the outcome into one of two buckets.
    succeeded, failed = [], []
    for team_slug, usergroup_handle in team_to_usergroup.items():
        ok = sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=dry_run)
        (succeeded if ok else failed).append(team_slug)

    # Summary
    rule = '=' * 60
    print(f"\n{rule}")
    print("SYNC SUMMARY")
    print(rule)
    print(f"Successful: {len(succeeded)}")
    print(f"Failed: {len(failed)}")

    if failed:
        print(f"\nFailed teams: {', '.join(failed)}")
        return False

    return True


def main():
    """Command-line entry point.

    Parses the flags, then either prints the resolved team-to-usergroup table
    (--list) or runs the sync and exits with status 0 on success, 1 on failure.
    With --list, an empty table exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Sync GitHub team membership to Slack user groups"
    )
    cli.add_argument(
        "--dry-run",
        help="Show what would be done without making changes",
        action="store_true",
    )
    cli.add_argument(
        "--list",
        help="List all configured team-to-usergroup mappings",
        action="store_true",
    )
    cli.add_argument(
        "--parent-team",
        dest="parent_teams",
        metavar="SLUG",
        action="append",
        help=f"Sync all children of this GitHub team (can be repeated). Defaults to: {PARENT_TEAM_SLUGS}",
    )
    cli.add_argument(
        "--team",
        dest="direct_teams",
        metavar="SLUG",
        action="append",
        help=f"Sync this GitHub team directly (can be repeated). Defaults to: {DIRECT_TEAM_SLUGS}",
    )

    opts = cli.parse_args()

    # An option that was never given stays None; only then use the module defaults.
    parents = PARENT_TEAM_SLUGS if opts.parent_teams is None else opts.parent_teams
    directs = DIRECT_TEAM_SLUGS if opts.direct_teams is None else opts.direct_teams

    if opts.list:
        table = {}
        for parent in parents:
            print(f"Fetching child teams of '{parent}'...")
            child_mapping = get_team_to_usergroup_mapping(parent)
            table.update(child_mapping)
        for slug in directs:
            table[slug] = github_team_to_slack_usergroup(slug)
        if not table:
            sys.exit(1)
        print("\nTeam-to-usergroup mappings:")
        print("GitHub Team".ljust(35) + " " + "Slack Usergroup".ljust(30))
        print("-" * 65)
        for team, usergroup in sorted(table.items()):
            print(f"{team:<35} @{usergroup:<29}")
        return

    ok = sync_all_teams(dry_run=opts.dry_run, parent_teams=parents, direct_teams=directs)
    sys.exit(0 if ok else 1)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: .github/workflows/_build_test_publish_wheel.yml
================================================
on:
  workflow_call:
    inputs:
      ref:
        required: false
        description: Ref (SHA or branch) to release
        type: string
        default: ${{ github.sha }}
      dry-run:
        required: false
        description: Upload to PyPI Test instance
        type: boolean
        default: true
      no-publish:
        required: false
        description: Do not publish the wheel
        type: boolean
        default: true
    secrets:
      TWINE_PASSWORD:
        required: true

jobs:
  # Builds the wheels for each PACKAGE/PLATFORM pair inside the matching
  # manylinux image, then tests them and uploads them as artifacts.
  build-and-test-wheels:
    strategy:
      fail-fast: false
      matrix:
        include:
          - PACKAGE: megatron-core
            PLATFORM: arm64
            IMAGE: quay.io/pypa/manylinux_2_28_aarch64
          - PACKAGE: megatron-core
            PLATFORM: amd64
            IMAGE: quay.io/pypa/manylinux_2_28_x86_64
          - PACKAGE: megatron-fsdp
            IMAGE: quay.io/pypa/manylinux_2_28_x86_64
            PLATFORM: amd64
    # amd64 entries run on ubuntu-22.04, every other platform on ubuntu-22.04-arm.
    runs-on: ${{ matrix.PLATFORM == 'amd64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
    env:
      PACKAGE: ${{ matrix.PACKAGE }}
      IMAGE: ${{ matrix.IMAGE }}
      PLATFORM: ${{ matrix.PLATFORM }}
      PUBLISH_DRYRUN: ${{ inputs.dry-run }}
    steps:
      # Check out the ref requested by the caller.
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          ref: ${{ inputs.ref }}

      # Builds wheels for cp310-cp313 inside the manylinux image, repairs any
      # platform-specific wheels with auditwheel, and publishes the version read
      # from package_info.py as the `expected-release-number` step output.
      - name: Build wheel
        id: build-wheel
        run: |
          set -x

          if [ "$PACKAGE" = "megatron-core" ]; then
            ROOTDIR="megatron/core"
            BUILD_DIR="."
          elif [ "$PACKAGE" = "megatron-fsdp" ]; then
            ROOTDIR="megatron/core/distributed/fsdp/src/megatron_fsdp"
            BUILD_DIR="megatron/core/distributed/fsdp/src"
          else
            echo Unknown package: $PACKAGE
            exit 1
          fi

          if [ "$PUBLISH_DRYRUN" = "true" ]; then
            # Dry runs get a random .devNNNNNN suffix appended to PRE_RELEASE.
            # Accept both quote styles: the release workflow's version bump
            # rewrites this line with double quotes.
            PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = ['\"]\(.*\)['\"]/\1/p" $ROOTDIR/package_info.py)
            sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" $ROOTDIR/package_info.py
          fi

          pushd $BUILD_DIR
            rm LICENSE || true
            docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c '\
              for python_version in cp310 cp311 cp312 cp313; do \
                /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools<80.0.0,>=77.0.0" build; \
              done && \
              for python_version in cp310 cp311 cp312 cp313; do \
                /opt/python/${python_version}-${python_version}/bin/python -m build; \
              done \
            '

            # Only wheels that are not pure-Python (*-none-any) need auditwheel.
            PLATFORM_WHEELS=$(find dist -name "*.whl" -not -name "*-none-any.whl")
            if [ -n "$PLATFORM_WHEELS" ]; then
                echo "Found platform wheels to repair: $PLATFORM_WHEELS"
                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE auditwheel repair $PLATFORM_WHEELS
                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE rm -rf dist/*.whl
                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE cp -a wheelhouse/* dist/
            fi
          popd

          pushd $ROOTDIR
            EXPECTED_RELEASE_NUMBER=$(python -c "import package_info; print(package_info.__version__)")
          popd

          echo "expected-release-number=$EXPECTED_RELEASE_NUMBER" | tee -a "${GITHUB_OUTPUT}"

          # megatron-fsdp is built in a sub-directory; copy its wheels to the
          # top-level dist/ that the later steps read.
          if [ "$PACKAGE" = "megatron-fsdp" ]; then
            mkdir -p dist/
            cp -a megatron/core/distributed/fsdp/src/dist/* dist/
          fi

          ls -al dist/

      # Installs the freshly built wheel(s), deletes the checked-out megatron/
      # directory (presumably so the import resolves to the installed package —
      # confirm), and checks that the imported package reports the version
      # recorded by the build-wheel step.
      - name: Test wheels
        run: |
          ls -al dist/

          if [ "$PACKAGE" = "megatron-core" ]; then
            ROOTPATH="megatron.core"
            WHEEL_PREFIX="megatron_core"
          elif [ "$PACKAGE" = "megatron-fsdp" ]; then
            ROOTPATH="megatron_fsdp"
            WHEEL_PREFIX="megatron_fsdp"
          else
            echo Unknown package: $PACKAGE
            exit 1
          fi

          if [ "$PACKAGE" = "megatron-core" ]; then
            if [[ "$PLATFORM" == "arm64" ]]; then
              for file in dist/$WHEEL_PREFIX*cp310*aarch64.whl; do
                pip install --no-cache-dir "$file"
              done
            else
              for file in dist/$WHEEL_PREFIX*cp310*x86_64.whl; do
                pip install --no-cache-dir "$file"
              done
            fi
          else
            pip install --no-cache-dir dist/$WHEEL_PREFIX*.whl
          fi

          sudo rm -rf megatron/

          RELEASE_NUMBER=$(python -c "import $ROOTPATH; print($ROOTPATH.__version__)")
          test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"

      # Artifact name encodes package, platform and dry-run/release so that the
      # publish-wheels job can download exactly the matching set.
      - name: Upload wheels
        uses: actions/upload-artifact@v6
        with:
          name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
          path: dist/

  # Uploads the artifacts produced by build-and-test-wheels with twine.
  # Skipped unless the caller passes `no-publish: false`.
  publish-wheels:
    needs: [build-and-test-wheels]
    runs-on: ubuntu-latest
    if: inputs.no-publish == false
    strategy:
      fail-fast: false
      matrix:
        include:
          - PACKAGE: megatron-core
            PLATFORM: arm64
          - PACKAGE: megatron-core
            PLATFORM: amd64
          - PACKAGE: megatron-fsdp
            PLATFORM: amd64
    env:
      PACKAGE: ${{ matrix.PACKAGE }}
    steps:
      - name: Download wheels
        uses: actions/download-artifact@v7
        with:
          name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
          path: dist/
          merge-multiple: true

      - name: Publish wheels
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
          # Real PyPI only when running on main or a branch starting with "r";
          # everything else goes to TestPyPI.
          TWINE_REPOSITORY: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }}
          PLATFORM: ${{ matrix.PLATFORM }}
        run: |

          # Delete sdist for arm64 since we already upload it with amd64.
          # -f: a missing sdist must not abort the upload.
          if [ "$PLATFORM" == "arm64" ]; then
            rm -f dist/*.tar.gz
          fi

          ls -al dist/
          pip install twine
          # twine reads TWINE_USERNAME, TWINE_PASSWORD and TWINE_REPOSITORY from
          # the environment, so the token is never placed on the command line.
          twine upload \
            --verbose \
            --non-interactive \
            dist/*


================================================
FILE: .github/workflows/_release_library.yml
================================================
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: "Release"

defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

# Reusable workflow interface. Every secret is mandatory; inputs without
# `required: true` fall back to the defaults below.
on:
  workflow_call:
    inputs:
      release-ref:
        required: true
        description: Ref (SHA or branch) to release
        type: string
      dry-run:
        type: boolean
        required: true
        description: Do not publish a wheel and GitHub release.
      version-bump-branch:
        type: string
        required: true
        description: Branch to target for version bump
      create-gh-release:
        required: false
        description: Create a GitHub release
        type: boolean
        default: true
      gh-release-use-changelog-builder:
        required: false
        description: Use release-changelog-builder-action to dynamically build changelog
        type: boolean
        default: true
      gh-release-changelog-config:
        required: false
        description: Path to changelog builder configuration file
        type: string
        default: ".github/workflows/config/changelog-config.json"
      gh-release-from-tag:
        required: false
        description: Starting tag for changelog builder (leave empty for auto-detect)
        type: string
        default: ""
      # Read by the "Build Changelog" step as `inputs.gh-release-changelog-mode`;
      # it has to be declared here for callers to be able to set it.
      gh-release-changelog-mode:
        required: false
        description: Mode passed to the changelog builder (PR, COMMIT or HYBRID)
        type: string
        default: "PR"
      publish-docs:
        required: false
        description: Publish documentation to S3 after release
        type: boolean
        default: true
    secrets:
      TWINE_PASSWORD:
        required: true
      SLACK_WEBHOOK:
        required: true
      PAT:
        required: true
      AWS_ASSUME_ROLE_ARN:
        required: true
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true
      AKAMAI_HOST:
        required: true
      AKAMAI_CLIENT_TOKEN:
        required: true
      AKAMAI_CLIENT_SECRET:
        required: true
      AKAMAI_ACCESS_TOKEN:
        required: true
      S3_BUCKET_NAME:
        required: true

permissions:
  contents: write # Write access to repository contents (branches, tags, releases)
  pull-requests: write # To create PR(s)

jobs:
  # First pass: build and test the wheels for the release ref without
  # publishing anything (dry-run and no-publish are both true).
  build-test-publish-wheels-dry-run:
    uses: ./.github/workflows/_build_test_publish_wheel.yml
    with:
      dry-run: true
      ref: ${{ inputs.release-ref }}
      no-publish: true
    secrets:
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}

  # Computes the version being released, bumps package_info.py to the next
  # version on a temporary branch, and merges that into the version-bump branch.
  bump-next-version:
    runs-on: ubuntu-latest
    needs: build-test-publish-wheels-dry-run
    # Runs as long as no needed job failed and the run was not cancelled.
    if: |
      (
        success() || !failure()
      )
      && !cancelled()
    outputs:
      # Version read from package_info.py *before* the bump, i.e. the one being released.
      release-version: ${{ steps.bump-version-mcore.outputs.release-version }}
    env:
      IS_DRY_RUN: ${{ inputs.dry-run }}
    steps:
      # Full-history checkout into a directory named after the run id, using the
      # PAT so that later pushes are made with its permissions.
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          path: ${{ github.run_id }}
          token: ${{ secrets.PAT }}
          fetch-depth: 0
          fetch-tags: true
          ref: ${{ inputs.release-ref }}
      # Reads MAJOR/MINOR/PATCH/PRE_RELEASE from megatron/core/package_info.py,
      # emits the current version as `release-version`, then rewrites the file
      # with the next version: the rc/a counter is incremented for pre-releases,
      # otherwise PATCH is incremented. The next version is emitted as `version`.
      - name: Bump version MCore
        id: bump-version-mcore
        env:
          SRC_DIR: ""
          PYPROJECT_NAME: "megatron.core"
        run: |
          set +u
          cd ${{ github.run_id }}

          PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py"

          MAJOR=$(cat $PACKAGE_INFO_FILE | awk '/^MAJOR = /' | awk -F"= " '{print $2}')
          MINOR=$(cat $PACKAGE_INFO_FILE | awk '/^MINOR = /' | awk -F"= " '{print $2}')
          PATCH=$(cat $PACKAGE_INFO_FILE | awk '/^PATCH = /' | awk -F"= " '{print $2}')
          PRERELEASE=$(cat $PACKAGE_INFO_FILE | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")

          echo "release-version=$MAJOR.$MINOR.$PATCH$PRERELEASE" | tee -a "$GITHUB_OUTPUT"

          if [[ "$PRERELEASE" != "" ]]; then
            if [[ "$PRERELEASE" == *rc* ]]; then
              NEXT_PATCH=$PATCH
              NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1))
            elif [[ "$PRERELEASE" == *a* ]]; then
              NEXT_PATCH=$PATCH
              NEXT_PRERELEASE=a$((${PRERELEASE#a} + 1))
            else
              echo "Unknown pre-release: $PRERELEASE"
              exit 1
            fi
          else
            NEXT_PATCH=$((${PATCH} + 1))
            NEXT_PRERELEASE=$PRERELEASE
          fi

          sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" $PACKAGE_INFO_FILE
          sed -i "/^PRE_RELEASE/c\PRE_RELEASE = \"$NEXT_PRERELEASE\"" $PACKAGE_INFO_FILE

          echo "version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"

      # Same bump logic as the MCore step, applied to
      # megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py.
      # Only the next version is emitted (as `version`); no `release-version`.
      - name: Bump version MFSDP
        id: bump-version-mfsdp
        env:
          SRC_DIR: "megatron/core/distributed/fsdp/src/"
          PYPROJECT_NAME: "megatron_fsdp"
        run: |
          set +u

          cd ${{ github.run_id }}

          PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py"

          MAJOR=$(cat $PACKAGE_INFO_FILE | awk '/^MAJOR = /' | awk -F"= " '{print $2}')
          MINOR=$(cat $PACKAGE_INFO_FILE | awk '/^MINOR = /' | awk -F"= " '{print $2}')
          PATCH=$(cat $PACKAGE_INFO_FILE | awk '/^PATCH = /' | awk -F"= " '{print $2}')
          PRERELEASE=$(cat $PACKAGE_INFO_FILE | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")

          if [[ "$PRERELEASE" != "" ]]; then
            if [[ "$PRERELEASE" == *rc* ]]; then
              NEXT_PATCH=$PATCH
              NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1))
            elif [[ "$PRERELEASE" == *a* ]]; then
              NEXT_PATCH=$PATCH
              NEXT_PRERELEASE=a$((${PRERELEASE#a} + 1))
            else
              echo "Unknown pre-release: $PRERELEASE"
              exit 1
            fi
          else
            NEXT_PATCH=$((${PATCH} + 1))
            NEXT_PRERELEASE=$PRERELEASE
          fi

          sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" $PACKAGE_INFO_FILE
          sed -i "/^PRE_RELEASE/c\PRE_RELEASE = \"$NEXT_PRERELEASE\"" $PACKAGE_INFO_FILE

          echo "version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"

      # Commits the bumped files to a uniquely named deploy-release/<uuid>
      # branch, pushes it, exports TMP_BRANCH to later steps via GITHUB_ENV and
      # opens a PR against the version-bump branch.
      # PR_NUMBER is computed at the end but is not exported or used in this step.
      - name: Create and push deployment branch
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          cd ${{ github.run_id }}

          TMP_BRANCH="deploy-release/$(uuidgen)"
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git checkout -b "$TMP_BRANCH"
          git add -A .
          git commit -m "beep boop 🤖: Bumping versions" || echo "No changes to commit"
          git push -u origin "$TMP_BRANCH"
          echo "TMP_BRANCH=$TMP_BRANCH" | tee -a $GITHUB_ENV

          # Create PR to collect app based status checks that run on PRs only
          # (like DCO check)
          PR_URL=$(gh pr create \
            --base ${{ inputs.version-bump-branch }} \
            --head $TMP_BRANCH \
            --title "beep boop 🤖: Bumping versions" \
            --body "This is an automated PR to bump versions.")

          # Extract PR number from URL
          PR_NUMBER=$(echo $PR_URL | grep -o '[0-9]*$')

      # Polls the head commit of TMP_BRANCH up to 30 times, 10 seconds apart,
      # until every commit status is `success` and every check run is
      # `completed`; fails the step if that never happens.
      # NOTE(review): check runs are only tested for status === 'completed';
      # their `conclusion` is not inspected, so a completed-but-failed check run
      # counts as passed — confirm this is intended.
      - name: Wait for status checks on tmp branch
        uses: actions/github-script@v8
        id: wait-status
        with:
          github-token: ${{ secrets.PAT }}
          script: |
            const branch = process.env.TMP_BRANCH;
            const owner = context.repo.owner;
            const repo = context.repo.repo;

            // Get latest commit SHA of branch
            const { data: refData } = await github.rest.git.getRef({
              owner,
              repo,
              ref: `heads/${branch}`,  // note: no 'refs/' prefix here
            });

            const sha = refData.object.sha;

            console.log(`Polling status for commit SHA: ${sha}`);

            let checksPassed = false;
            let maxAttempts = 30;
            let attempt = 0;
            const delay = ms => new Promise(res => setTimeout(res, ms));

            while (!checksPassed && attempt < maxAttempts) {
              attempt++;

              // Use commit SHA instead of branch ref
              const { data: status } = await github.rest.repos.getCombinedStatusForRef({
                owner,
                repo,
                ref: sha,
              });

              const { data: checks } = await github.rest.checks.listForRef({
                owner,
                repo,
                ref: sha,
              });

              const allStatuses = status.statuses;
              const allChecks = checks.check_runs;

              if (allStatuses.length === 0 && allChecks.length === 0) {
                console.log(`Attempt ${attempt}: No checks or statuses yet. Waiting...`);
                await delay(10000);
                continue;
              }

              const statusesOk = allStatuses.every(s => s.state === 'success');
              const checksOk = allChecks.every(c => c.status === 'completed');

              if (statusesOk && checksOk) {
                console.log('✅ All checks passed.');
                checksPassed = true;
                break
              }

              console.log(`Attempt ${attempt}: Checks not complete yet. Waiting...`);
              await delay(10000);
            }

            if (!checksPassed) {
              core.setFailed('❌ Status checks did not pass in time');
            }

      - name: Merge into ${{ inputs.version-bump-branch }}
        run: |
          cd ${{ github.run_id }}

          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          CMD=$(echo -E 'git push origin ${{ inputs.version-bump-branch }}')

          if [[ "$IS_DRY_RUN" == "true" ]]; then
            echo "dry-run enabled, would have run: $CMD"
          else
            # Here we account for potential race conditions from multiple concurrent releases.
            # Those can be legitimate (operating on different packages within the monorepo,
            # for example), but the pushes would still be rejected purely because git
            # refuses non-fast-forward updates to the branch. In that case we retry
            # after re-fetching and re-merging.
            git fetch origin ${{ inputs.version-bump-branch }}
            git checkout ${{ inputs.version-bump-branch }}
            git merge ${{ env.TMP_BRANCH }}

            for attempt in {1..3}; do
              if eval "$CMD"; then
                echo "Git push succeeded on attempt $attempt"
                break
              else
                echo "Git push failed on attempt $attempt"
                if [[ $attempt -lt 3 ]]; then
                  sleep $((RANDOM % 3 + 1))
                  # We refetch, reset and re-merge. Note resetting because the local
                  # branch is "contaminated" with previous merge attempt.
                  git fetch origin ${{ inputs.version-bump-branch }}
                  git reset --hard origin/${{ inputs.version-bump-branch }}
                  git merge ${{ env.TMP_BRANCH }}
                else
                  echo "Git push failed after 3 attempts"
                  exit 1
                fi
              fi
            done
          fi

      # Runs even when earlier steps failed so the temporary branch is removed
      # from the remote.
      - name: Delete ${{ env.TMP_BRANCH }} branch
        if: always()
        run: |
          cd ${{ github.run_id }}
          git push -d origin ${{ env.TMP_BRANCH }}

  # Second pass, after the version bump: build, test and publish the wheels for
  # the release ref (dry-run and no-publish are both false).
  build-test-publish-wheels:
    needs: [bump-next-version]
    uses: ./.github/workflows/_build_test_publish_wheel.yml
    with:
      dry-run: false
      ref: ${{ inputs.release-ref }}
      no-publish: false
    secrets:
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}

  # Creates the GitHub release (tag "core_v<version>") for the version reported
  # by bump-next-version.
  create-gh-release:
    needs: [build-test-publish-wheels, bump-next-version]
    runs-on: ubuntu-latest
    # Runs when nothing failed, the run was not cancelled, and the caller asked
    # for a GitHub release.
    if: |
      (
        success() || !failure()
      )
      && inputs.create-gh-release == true
      && !cancelled()
    outputs:
      is-release-candidate: ${{ steps.version-number.outputs.is-release-candidate }}
    env:
      REPOSITORY: ${{ github.repository }}
      PROJECT_NAME: Megatron Core
      VERSION: ${{ needs.bump-next-version.outputs.release-version }}
      TAG_PREFIX: core_
    steps:
      # Check out the release ref into a directory named after the run id.
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          path: ${{ github.run_id }}
          ref: ${{ inputs.release-ref }}
          token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}

      # Emits `from-tag`: the caller-supplied tag if given, otherwise the result
      # of `git describe --tags --abbrev=0`, or an empty string when that finds
      # no tag.
      - name: Determine fromTag for changelog
        id: determine-from-tag
        if: inputs.gh-release-use-changelog-builder == true
        run: |
          cd ${{ github.run_id }}

          # If gh-release-from-tag is provided, use it
          if [[ -n "${{ inputs.gh-release-from-tag }}" ]]; then
            FROM_TAG="${{ inputs.gh-release-from-tag }}"
            echo "Using provided fromTag: $FROM_TAG"
          else
            # Get the most recent tag
            FROM_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "")
            if [[ -z "$FROM_TAG" ]]; then
              echo "No previous tags found, leaving fromTag empty"
            else
              echo "Auto-detected most recent tag: $FROM_TAG"
            fi
          fi

          echo "from-tag=$FROM_TAG" >> $GITHUB_OUTPUT

      # Generates the changelog between `from-tag` and the release ref using the
      # configuration file from the checkout. failOnError is "false", so a
      # builder error does not fail the job; the "Create release" step has its
      # own fallback when no changelog text was produced.
      - name: Build Changelog
        id: build-changelog
        if: inputs.gh-release-use-changelog-builder == true
        uses: mikepenz/release-changelog-builder-action@v6.1.0
        env:
          GITHUB_TOKEN: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
        with:
          configuration: ${{ github.run_id }}/${{ inputs.gh-release-changelog-config }}
          owner: ${{ github.repository_owner }}
          repo: ${{ github.event.repository.name }}
          ignorePreReleases: "false"
          failOnError: "false"
          fromTag: ${{ steps.determine-from-tag.outputs.from-tag }}
          toTag: ${{ inputs.release-ref }}
          mode: ${{ inputs.gh-release-changelog-mode }}

      # Builds the release payload with jq and POSTs it to the GitHub releases
      # API; in dry-run mode the curl command is only echoed, not executed.
      # Release notes come from the changelog builder when it produced text,
      # otherwise from a one-line placeholder (release candidates) or from the
      # matching "## NVIDIA Megatron Core <version>" section of CHANGELOG.md.
      # Also emits `is-release-candidate`.
      - name: Create release
        id: version-number
        env:
          SHA: ${{ inputs.release-ref }}
          GH_TOKEN: ${{ secrets.PAT }}
          IS_DRY_RUN: ${{ inputs.dry-run }}
          BUILT_CHANGELOG: ${{ steps.build-changelog.outputs.changelog }}
        run: |
          cd ${{ github.run_id }}

          IS_RELEASE_CANDIDATE=$([[ "$VERSION" == *rc* ]] && echo "true" || echo "false")
          IS_ALPHA=$([[ "$VERSION" == *a* ]] && echo "true" || echo "false")
          IS_PRERELEASE=$([[ "$IS_RELEASE_CANDIDATE" == "true" || "$IS_ALPHA" == "true" ]] && echo "true" || echo "false")
          NAME="NVIDIA $PROJECT_NAME ${VERSION}"

          # Use built changelog if available, otherwise fall back to CHANGELOG.md
          if [[ -n "$BUILT_CHANGELOG" ]]; then
            CHANGELOG="$BUILT_CHANGELOG"
          elif [[ "$IS_RELEASE_CANDIDATE" == "true" ]]; then
            DATE=$(date +"%Y-%m-%d")
            CHANGELOG="Prerelease: $NAME ($DATE)"
          else
            CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
            CHANGELOG=$(echo "$CHANGELOG" | sed '/./,$!d' | sed ':a;N;$!ba;s/\n$//')
          fi

          echo "is-release-candidate=$IS_RELEASE_CANDIDATE" | tee -a "$GITHUB_OUTPUT"

          PAYLOAD=$(jq -nc \
                      --arg TAG_NAME "${TAG_PREFIX}v${VERSION}" \
                      --arg CI_COMMIT_BRANCH "$SHA" \
                      --arg NAME "$NAME" \
                      --arg BODY "$CHANGELOG" \
                      --argjson PRERELEASE "$IS_PRERELEASE" \
                      '{
                        "tag_name": $TAG_NAME,
                        "target_commitish": $CI_COMMIT_BRANCH,
                        "name": $NAME,
                        "body": $BODY,
                        "draft": false,
                        "prerelease": $PRERELEASE,
                        "generate_release_notes": false
                      }'
                  )
          echo -E "$PAYLOAD" > payload.txt

          CMD=$(echo -E 'curl -L \
            -X POST \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer '"$GH_TOKEN"'" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            https://api.github.com/repos/'"$REPOSITORY"'/releases \
            -d @payload.txt
          ')

          if [[ "$IS_DRY_RUN" == "true" ]]; then
            echo -E "$CMD"
          else
            eval "$CMD"
          fi

  # Calls the release-docs workflow for the release ref, publishing the docs as
  # "latest" under the released version number. Runs only when the caller
  # enabled publish-docs and nothing failed or was cancelled.
  publish-docs:
    needs: [bump-next-version, create-gh-release]
    uses: ./.github/workflows/release-docs.yml
    if: |
      (
        success() || !failure()
      )
      && inputs.publish-docs == true
      && !cancelled()
    with:
      dry-run: ${{ inputs.dry-run }}
      publish-as-latest: true
      docs-version-override: ${{ needs.bump-next-version.outputs.release-version }}
      build-docs-ref: ${{ inputs.release-ref }}
    secrets: inherit

  # Posts a Slack announcement with links to the GitHub release and the PyPI
  # project page once the wheels are published and the release exists.
  notify:
    # bump-next-version is listed so that its `release-version` output is
    # readable here; build-test-publish-wheels already depends on it, so job
    # ordering is unchanged.
    needs: [bump-next-version, build-test-publish-wheels, create-gh-release]
    runs-on: ubuntu-latest
    env:
      # The reusable wheel workflow declares no outputs, so the version is taken
      # from bump-next-version. The tag uses the same "core_v<version>" form
      # that create-gh-release builds from TAG_PREFIX.
      GH_URL: https://github.com/${{ github.repository }}/releases/tag/core_v${{ needs.bump-next-version.outputs.release-version }}
      PYPI_URL: https://${{ inputs.dry-run == true && 'test.' || '' }}pypi.org/project/megatron-core/${{ needs.bump-next-version.outputs.release-version }}/
      PROJECT_NAME: Megatron Core
      VERSION: ${{ needs.bump-next-version.outputs.release-version }}
    steps:
      # The Slack action lives in a separate templates repository, pinned to a tag.
      - name: Checkout
        uses: actions/checkout@v6
        with:
          repository: NVIDIA-NeMo/FW-CI-templates
          ref: v0.17.0
          path: send-slack-alert

      - name: Send Slack alert
        uses: ./send-slack-alert/.github/actions/send-slack-alert
        env:
          MESSAGE: |
            ${{ inputs.dry-run == true && 'This is a dry-run, nothing actually happened: ' || '' }}We have released `${{ env.VERSION }}` of `NVIDIA ${{ env.PROJECT_NAME }}` 🚀✨🎉

            • <${{ env.GH_URL }}|GitHub release>
            • <${{ env.PYPI_URL }}|PyPi release>

        with:
          message: ${{ env.MESSAGE }}
          webhook: ${{ secrets.SLACK_WEBHOOK }}


================================================
FILE: .github/workflows/_update_dependencies.yml
================================================
# Reusable workflow: regenerates uv.lock for `target-branch` inside the CI
# container and opens a pull request with the result.
name: ~Update dependencies template
on:
  workflow_call:
    inputs:
      target-branch:
        required: true
        type: string
        description: "The target branch to bump"
    secrets:
      PAT:
        required: true
      SSH_KEY:
        required: true
      SSH_PWD:
        required: true

jobs:
  # Computes today's date (YYYY-MM-DD) and derives the bump branch name
  # bump-ci-container-<date>-<target-branch> shared by the later jobs.
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      bump-branch: bump-ci-container-${{ steps.ref.outputs.date }}-${{ inputs.target-branch }}
      date: ${{ steps.ref.outputs.date }}
    steps:
      - name: Get date
        id: ref
        run: echo "date=$(date +%F)" | tee -a "$GITHUB_OUTPUT"

  # Builds the CI container from the target branch, makes sure the bump branch
  # exists on the remote, and regenerates uv.lock inside that container.
  update-lockfile:
    runs-on: linux-amd64-cpu16
    needs: [pre-flight]
    env:
      SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
      TARGET_BRANCH: ${{ inputs.target-branch }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v6
        with:
          ref: ${{ env.TARGET_BRANCH }}

      # Creates an empty assets/ directory (presumably expected by the Docker
      # build — confirm against docker/Dockerfile.ci.dev).
      - name: Mock test data
        run: mkdir -p assets/

      # The base image name is read from docker/.ngc_version.dev.
      - name: Fetch NGC Version
        id: ngc-version
        run: |
          NGC_VERSION=$(cat docker/.ngc_version.dev)
          echo "NGC_VERSION=${NGC_VERSION}" | tee -a "$GITHUB_OUTPUT"

      # Builds the `main` stage of the CI Dockerfile and tags it megatron-core.
      - name: Build container
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="${{ steps.ngc-version.outputs.NGC_VERSION }}" --target=main -t megatron-core .

      # `git ls-remote --exit-code` fails when the branch is absent on origin;
      # only then is the bump branch created from the target branch and pushed.
      - name: Create bump branch if not exists
        run: |
          if ! git ls-remote --exit-code origin $SOURCE_BRANCH; then
            git checkout -b $SOURCE_BRANCH $TARGET_BRANCH
            git push origin $SOURCE_BRANCH
          fi

      # Second checkout: switch the workspace to the bump branch.
      - name: Checkout repo
        uses: actions/checkout@v6
        with:
          ref: ${{ env.SOURCE_BRANCH }}

      # Runs `uv lock --upgrade` inside the megatron-core image built earlier,
      # with the workspace mounted so the updated uv.lock lands in the checkout.
      - name: Upgrade lock file
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          # `-e GH_TOKEN` without a value forwards the variable from this step's
          # environment, so the secret is never interpolated into the script text.
          docker run \
          --rm \
          -v "$(pwd)":/workspace \
          -w /workspace \
          -e GH_TOKEN \
          megatron-core \
          bash -c 'uv lock --upgrade'

      # Hands the regenerated uv.lock to the create-pr job as an artifact.
      - name: Upload lock file
        uses: actions/upload-artifact@v6
        with:
          name: lock-file-${{ env.SOURCE_BRANCH }}
          path: uv.lock

  # Takes the uv.lock artifact produced by update-lockfile and opens the bump
  # pull request against the target branch.
  create-pr:
    needs: [update-lockfile, pre-flight]
    runs-on: ubuntu-latest
    env:
      SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
      TARGET_BRANCH: ${{ inputs.target-branch }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          token: ${{ secrets.PAT }}
          ref: ${{ env.TARGET_BRANCH }}

      # If the bump branch exists on origin, rebase the checked-out target
      # branch onto it.
      # NOTE(review): `-S` asks git to GPG-sign the rebased commits, but no
      # signing-key setup is visible in the steps before this one — confirm.
      - name: Rebase against ${{ env.SOURCE_BRANCH }}
        run: |
          if git ls-remote --exit-code origin ${{ env.SOURCE_BRANCH }}; then
            git fetch origin ${{ env.SOURCE_BRANCH }}
            git rebase -S origin/${{ env.SOURCE_BRANCH }}
          fi

      # No `path` is given, so the artifact is extracted into the workspace.
      - name: Download lock file
        uses: actions/download-artifact@v7
        with:
          name: lock-file-${{ env.SOURCE_BRANCH }}

      # Commits the workspace changes to the bump branch and opens (or updates)
      # a PR against the target branch. The same title string is reused as the
      # commit message; the commit is signed off and authored as github-actions[bot].
      - name: Create Bump PR
        uses: peter-evans/create-pull-request@v8
        id: create-pull-request
        env:
          title: "chore(beep boop 🤖): Bump `uv.lock` (${{ inputs.target-branch}}) (${{ needs.pre-flight.outputs.date }})"
        with:
          branch: ${{ env.SOURCE_BRANCH }}
          base: ${{ env.TARGET_BRANCH }}
          title: ${{ env.title }}
          token: ${{ secrets.PAT }}
          body: |
            🚀 PR to bump `uv.lock` in `${{ inputs.target-branch }}`.  

            📝 Please remember the following to-do's before merge: 
            - [ ] Verify the presubmit CI  

            🙏 Please merge this PR only if the CI workflow completed successfully.
          commit-message: ${{ env.title }}
          signoff: true
          committer: "github-actions[bot] <github-actions[bot]@users.noreply.github.com>"

      # Comments "/ok to test <head sha>" on the PR created by the previous step;
      # does nothing when that step reported no PR number.
      - name: Post /ok to test comment
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
          if [ -z "$PR_NUMBER" ]; then
            echo "No PR was created, skipping comment"
            exit 0
          fi
          SHA="${{ steps.create-pull-request.outputs.pull-request-head-sha }}"
          gh pr comment "$PR_NUMBER" --body "/ok to test $SHA"

      # Blocks until every required status check of the bump PR has finished.
      #
      # Flow of the script below:
      #   1. Exit successfully when the "Create Bump PR" step produced no PR number.
      #   2. Read the required status checks of TARGET_BRANCH from branch protection
      #      (`.checks[].context` first, `.contexts[]` as fallback); exit successfully
      #      when none are returned.
      #   3. Poll `gh pr checks` every 30 seconds and look up each required check by name:
      #        - success / pass / skip / neutral       -> check is done
      #        - pending / queued / in progress / waiting / requested / not reported
      #                                                -> keep waiting, mark INITIALIZED
      #        - anything else                         -> recorded as a failure only once
      #                                                   INITIALIZED is true; before that
      #                                                   it only forces another poll
      #   4. When a poll finds no unfinished check: exit 1 if failures were recorded,
      #      otherwise finish successfully.
      #
      # NOTE(review): the polling loop has no attempt limit or timeout of its own.
      - name: Wait for CI checks
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
          if [ -z "$PR_NUMBER" ]; then
            echo "No PR was created, skipping wait"
            exit 0
          fi

          # Fetch required status checks from branch protection rules
          REQUIRED_CHECKS=$(gh api \
            "repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \
            --jq '.checks[].context' 2>/dev/null \
            || gh api \
            "repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \
            --jq '.contexts[]' 2>/dev/null \
            || true)

          if [ -z "$REQUIRED_CHECKS" ]; then
            echo "No branch protection rules found for ${{ env.TARGET_BRANCH }}, skipping wait"
            exit 0
          fi

          echo "Required checks from branch protection:"
          echo "$REQUIRED_CHECKS"

          echo "Waiting for required checks to complete on PR #$PR_NUMBER..."
          i=0
          INITIALIZED=false
          while true; do
            i=$((i + 1))
            CHECKS_JSON=$(gh pr checks "$PR_NUMBER" --json name,state 2>/dev/null || echo "[]")
            ALL_DONE=true
            FAILED_CHECKS=""
            while IFS= read -r check; do
              CHECK_STATE=$(echo "$CHECKS_JSON" | jq -r --arg name "$check" '.[] | select(.name == $name) | .state // ""' | tr '[:upper:]' '[:lower:]')
              case "$CHECK_STATE" in
                *success*|*pass*|*skip*|*neutral*) ;;
                *pending*|*queued*|*progress*|*waiting*|*request*|"")
                  ALL_DONE=false
                  INITIALIZED=true
                  break
                  ;;
                *)
                  if [ "$INITIALIZED" = "true" ]; then
                    FAILED_CHECKS="${FAILED_CHECKS}  - ${check} (${CHECK_STATE})"$'\n'
                  else
                    ALL_DONE=false
                  fi
                  ;;
              esac
            done <<< "$REQUIRED_CHECKS"
            if [ "$ALL_DONE" = "true" ]; then
              if [ -n "$FAILED_CHECKS" ]; then
                echo "Required check(s) did not pass:"
                echo "$FAILED_CHECKS"
                exit 1
              fi
              echo "All required checks passed!"
              break
            fi
            echo "Checks not yet complete (attempt $i), retrying in 30s..."
            sleep 30
          done

      # Squash-merges SOURCE_BRANCH into TARGET_BRANCH with plain git, pushes the
      # result and deletes the source branch. Does nothing when no PR was created.
      - name: Merge PR
        env:
          title: "chore(beep boop 🤖): Bump `uv.lock` (${{ env.TARGET_BRANCH}}) (${{ needs.pre-flight.outputs.date }})"
        run: |
          PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
          if [ -z "$PR_NUMBER" ]; then
            echo "No PR was created, skipping merge"
            exit 0
          fi
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git fetch origin ${{ env.SOURCE_BRANCH }}
          git fetch origin ${{ env.TARGET_BRANCH }}
          git checkout ${{ env.TARGET_BRANCH }}
          git merge --squash origin/${{ env.SOURCE_BRANCH }}
          # Read the title from the environment instead of inlining the expression:
          # the title contains backticks, and inside a double-quoted bash string
          # `uv.lock` would be executed as a command substitution, dropping it from
          # the commit message.
          git commit -m "$title"
          # Replay the squash commit on top of anything pushed to the target meanwhile.
          git pull --rebase origin ${{ env.TARGET_BRANCH }}
          git push origin ${{ env.TARGET_BRANCH }}
          git push origin --delete ${{ env.SOURCE_BRANCH }}


================================================
FILE: .github/workflows/auto-assign-milestone.yml
================================================
# Assigns an open milestone to a pull request that does not have one yet.
name: Auto-assign Milestone to PR

# Triggered by pushes to `pull-request/<number>` branches.
on:
  push:
    branches:
      - "pull-request/[0-9]+"

permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  assign-milestone:
    runs-on: ubuntu-latest
    if: github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        uses: nv-gha-runners/get-pr-info@main

      # Sets `has_milestone` to 'true' or 'false' for the PR resolved above.
      - name: Check if PR has milestone
        id: check_milestone
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          # Values reach the script through the environment, never as script text.
          PR_NUMBER: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
        run: |
          MILESTONE=$(gh pr view "$PR_NUMBER" \
            --repo "$GITHUB_REPOSITORY" \
            --json milestone \
            --jq '.milestone.title')

          if [ "$MILESTONE" = "null" ] || [ -z "$MILESTONE" ]; then
            echo "has_milestone=false" >> "$GITHUB_OUTPUT"
          else
            echo "has_milestone=true" >> "$GITHUB_OUTPUT"
            echo "PR already has milestone: $MILESTONE"
          fi

      # Sets `milestone_found`, and on success `milestone_number` / `milestone_title`.
      - name: Get most recent open milestone
        if: steps.check_milestone.outputs.has_milestone == 'false'
        id: get_milestone
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          # Open milestones ordered by due date, latest due date first. The list is
          # fetched once so that number and title always describe the same milestone.
          MILESTONES_JSON=$(gh api \
            "repos/${GITHUB_REPOSITORY}/milestones?state=open&sort=due_on&direction=desc")

          MILESTONE_NUMBER=$(jq -r '.[0].number // empty' <<< "$MILESTONES_JSON")
          MILESTONE_TITLE=$(jq -r '.[0].title // empty' <<< "$MILESTONES_JSON")

          if [ -z "$MILESTONE_NUMBER" ] || [ "$MILESTONE_NUMBER" = "null" ]; then
            echo "No open milestones found"
            echo "milestone_found=false" >> "$GITHUB_OUTPUT"
          else
            echo "milestone_found=true" >> "$GITHUB_OUTPUT"
            echo "milestone_number=$MILESTONE_NUMBER" >> "$GITHUB_OUTPUT"
            echo "milestone_title=$MILESTONE_TITLE" >> "$GITHUB_OUTPUT"
            echo "Found milestone: $MILESTONE_TITLE (number: $MILESTONE_NUMBER)"
          fi

      - name: Assign milestone to PR
        if: steps.check_milestone.outputs.has_milestone == 'false' && steps.get_milestone.outputs.milestone_found == 'true'
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          PR_NUMBER: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          # A title containing quotes, `$` or backticks would otherwise be
          # interpreted by the shell if it were inlined into the script.
          MILESTONE_TITLE: ${{ steps.get_milestone.outputs.milestone_title }}
        run: |
          gh pr edit "$PR_NUMBER" \
            --repo "$GITHUB_REPOSITORY" \
            --milestone "$MILESTONE_TITLE"

          echo "✅ Assigned milestone '$MILESTONE_TITLE' to PR #$PR_NUMBER"


================================================
FILE: .github/workflows/auto-reminder-bot.yml
================================================
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

# Runs tests/test_utils/python_scripts/auto_reminder_github.py once a day.
name: Auto Reminder Bot

on:
  workflow_dispatch:
  schedule:
    # Every day at 12:00 UTC.
    - cron: "0 12 * * *"

jobs:
  run-script:
    name: Run Auto Reminder Bot
    runs-on: ubuntu-latest
    if: github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Check out repository code
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          pip install --no-cache-dir PyGithub slack-sdk

      - name: Run Auto Reminder Bot
        # Secrets are passed through `env:` instead of being expanded into the
        # script text, where an unquoted value would be subject to word splitting
        # and shell metacharacter interpretation.
        env:
          SLACK_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_REVIEW_REMINDER_CHANNEL_WEBHOOK }}
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          python tests/test_utils/python_scripts/auto_reminder_github.py


================================================
FILE: .github/workflows/auto-swap-labels.yml
================================================
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

# Runs tests/test_utils/python_scripts/swap_pr_labels.py for a pull request,
# either directly on PR events or after a successful "Review Trigger" run.
name: Auto Swap Labels
on:
  pull_request_target:
    types: [ready_for_review, synchronize]
    branches:
      - main
  workflow_run:
    workflows: ["Review Trigger"]
    types: [completed]

permissions:
  pull-requests: write
  contents: read
  actions: read

jobs:
  check-approval:
    runs-on: ubuntu-latest
    # Non-draft PRs targeting main, or a successfully completed triggering workflow run.
    if: >-
      github.repository == 'NVIDIA/Megatron-LM' && (
        (github.event_name == 'pull_request_target' &&
         github.event.pull_request.base.ref == 'main' &&
         !github.event.pull_request.draft) ||
        (github.event_name == 'workflow_run' &&
         github.event.workflow_run.conclusion == 'success')
      )

    steps:
      # For workflow_run events the PR number is read from the `pr-number`
      # artifact of the triggering run. A missing artifact is tolerated.
      - name: Get PR number from workflow_run
        id: get-pr
        if: github.event_name == 'workflow_run'
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          name: pr-number
          path: pr-number
          github-token: ${{ github.token }}
          run-id: ${{ github.event.workflow_run.id }}

      # Sets the `number` output. Later steps are skipped while it is empty.
      - name: Set PR number
        id: pr
        env:
          EVENT_NAME: ${{ github.event_name }}
          ARTIFACT_OUTCOME: ${{ steps.get-pr.outcome }}
          EVENT_PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          if [ "$EVENT_NAME" = "workflow_run" ]; then
            if [ "$ARTIFACT_OUTCOME" != "success" ]; then
              echo "No approval artifact found — review was not an approval. Skipping."
              exit 0
            fi
            NUMBER="$(tr -d '[:space:]' < pr-number/number)"
          else
            NUMBER="$EVENT_PR_NUMBER"
          fi

          # The artifact content comes from another workflow run, so accept
          # nothing but digits before it is handed to steps that hold the PAT.
          case "$NUMBER" in
            ''|*[!0-9]*)
              echo "Refusing to use non-numeric PR number: '$NUMBER'"
              exit 1
              ;;
          esac

          echo "number=$NUMBER" >> "$GITHUB_OUTPUT"

      - name: Check out repository code
        if: steps.pr.outputs.number
        uses: actions/checkout@v4

      - name: Set up Python
        if: steps.pr.outputs.number
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"

      - name: Install dependencies
        if: steps.pr.outputs.number
        run: |
          pip install --no-cache-dir PyGithub slack-sdk

      - name: Run Auto Swap Labels
        if: steps.pr.outputs.number
        # Token and PR number are passed through the environment rather than
        # being expanded into the script text.
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          PR_NUMBER: ${{ steps.pr.outputs.number }}
        run: |
          python tests/test_utils/python_scripts/swap_pr_labels.py


================================================
FILE: .github/workflows/auto-update-copy-pr-bot.yml
================================================
# Rebuilds `trustees_override` in .github/copy-pr-bot.yaml from the members of
# the mcore-engineers and mcore-reviewers teams and pushes the result to main.
name: Auto Update Copy PR Bot

on:
  schedule:
    # Every day at 00:00 UTC.
    - cron: "0 0 * * *"
  workflow_dispatch:

jobs:
  auto-update-copy-pr-bot:
    if: github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          ref: main
          token: ${{ secrets.PAT }}

      - name: Fetch list of members in mcore-reviewers team
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        shell: bash -euxo pipefail {0}
        run: |
          #!/bin/bash

          # Appends the logins of team $2 in org $1 to file $3, recurses into every
          # child team, then prints everything collected in the file so far.
          get_members() {
              local org=$1
              local team=$2
              local seen_file=$3

              gh api "/orgs/$org/teams/$team/members" --paginate --jq '.[].login' >> "$seen_file"

              gh api "/orgs/$org/teams/$team/teams" --paginate --jq '.[].slug' | while read -r child; do
                  get_members "$org" "$child" "$seen_file"
              done

              cat "$seen_file"
          }

          # Collect both teams into final.txt, one login per line.
          printf '\n' > final.txt
          for team in mcore-engineers mcore-reviewers; do
              tmp=$(mktemp)
              get_members "NVIDIA" "$team" "$tmp" | sort -u >> final.txt && rm "$tmp"
          done

          # Log the de-duplicated list, then export it as compact JSON for yq.
          jq -sR 'split("\n") | map(select(. != "")) | flatten | unique' final.txt

          export TRUSTEES=$(jq -csR 'split("\n") | map(select(. != "")) | flatten | unique' final.txt)
          yq '.trustees_override = env(TRUSTEES)' .github/copy-pr-bot.yaml | yq -o yaml > .github/copy-pr-bot.yaml.new

          mv .github/copy-pr-bot.yaml.new .github/copy-pr-bot.yaml

      - name: Commit changes
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git
          git config --global user.name "GitHub Actions"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add .github/copy-pr-bot.yaml

          # Commit and push only when the staged file actually differs.
          if ! git diff --cached --exit-code --quiet; then
            git commit -m "Update copy-pr-bot.yaml [skip ci]"
            git push -u origin main
          else
            echo "No changes to commit. Exiting gracefully."
          fi


================================================
FILE: .github/workflows/build-docs.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Builds the documentation through shared FW-CI-templates workflows and
# reports one aggregated result in `build-docs-summary`.
name: Build docs

on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true

jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  build-docs:
    needs: [pre-flight]
    if: needs.pre-flight.outputs.is_deployment_workflow != 'true'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0

  # Runs even when the jobs above failed or were skipped (but not when the run
  # was cancelled) and fails if any completed job did not conclude "success".
  build-docs-summary:
    needs: [pre-flight, build-docs]
    if: |
      (
        needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          # This job has no checkout, so `gh` cannot derive the repository from a
          # git remote; GH_REPO names it explicitly.
          GH_REPO: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
        run: |
          # Number of completed jobs in this run that did not conclude "success".
          # The fallback sits inside the substitution so that a failing `gh` call
          # really yields "0" rather than an empty value plus a stray "0" in the log.
          FAILED_JOBS=$(gh run view "$RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length' || echo 0)

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All previous jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed job(s)"
              # Show which jobs failed
              gh run view "$RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
              exit 1
          fi


================================================
FILE: .github/workflows/build-test-publish-wheel.yml
================================================
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Builds and tests the wheels through the local reusable workflow (publishing
# disabled via `no-publish: true`) and reports one aggregated result.
name: Build, test, and publish a PyPi wheel (to testpypi).

on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]

defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

permissions:
  id-token: write
  contents: read

jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
    if: github.repository == 'NVIDIA/Megatron-LM'

  build-test-publish-wheels:
    needs: [pre-flight]
    uses: ./.github/workflows/_build_test_publish_wheel.yml
    with:
      no-publish: true
    secrets:
      # Production token on main and on branches starting with "r", test token otherwise.
      TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}

  # Runs even when the jobs above failed or were skipped (but not when the run
  # was cancelled) and fails if any completed job whose name matches
  # "build-and-test-wheels" did not conclude "success".
  build-test-publish-wheel-summary:
    needs: [pre-flight, build-test-publish-wheels]
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && github.repository == 'NVIDIA/Megatron-LM'
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          # Own variable name: default GITHUB_* variables are reserved by the runner.
          RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: false
        run: |
          # The fallback sits inside the substitution so that a failing `gh` call
          # really yields "0" rather than an empty value plus a stray "0" in the log.
          FAILED_JOBS=$(gh run view "$RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels")))] | length' || echo 0)

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All build-and-test-wheels jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed build-and-test-wheels job(s)"
              # Show which jobs failed
              gh run view "$RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels"))) | .name'
              exit 1
          fi


================================================
FILE: .github/workflows/cherry-pick-release-commit.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Delegates to the shared `_cherry_pick.yml` reusable workflow on every push to main.
# NOTE(review): the workflow name says "to main ... from release", while the trigger
# is a push to main and the input is called `target-branches-pattern` — confirm the
# intended direction and adjust the name if needed.
name: Create PR to main with cherry-pick from release

on:
  push:
    branches:
      - main

jobs:
  cherry-pick:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.65.9
    if: github.repository == 'NVIDIA/Megatron-LM'
    with:
      # NOTE(review): `(*dev_)?` opens a group with a `*` quantifier, which is
      # invalid or treated as a literal `*` in common regex dialects; `(dev_)?`
      # may have been intended. How the reusable workflow evaluates this pattern
      # is not visible here — confirm.
      target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+'
    secrets:
      PAT: ${{ secrets.PAT }}
      # NOTE(review): receives the raw Slack team group id secret — confirm this is
      # the format the reusable workflow expects for SLACK_WEBHOOK_ADMIN.
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}


================================================
FILE: .github/workflows/cicd-approve-test-queue.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Queue manager: periodically approves waiting "CICD Megatron-LM" workflow runs
# while keeping the number of queued + running runs below a concurrency limit.
name: Approve Test Queue

on:
  schedule:
    - cron: "*/5 * * * *" # Runs every 5 minutes
  workflow_dispatch: # Allows manual triggering

jobs:
  approve-queue:
    runs-on: ubuntu-latest
    environment: main
    if: github.repository == 'NVIDIA/Megatron-LM'
    strategy:
      # One job per (PR base branch, contributor type) queue. The approval script
      # treats "others" as every base branch that is neither main nor dev.
      matrix:
        branch: [main, dev, others]
        contributor_type: [internal, external]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      # `requests` is the only third-party package imported by the approval script.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      # Downloads users_sso.json, which the approval script reads to classify PR
      # authors as internal or external. When the download fails an empty JSON
      # object is written instead, so no author has any recorded org role.
      - name: Download SSO users list
        run: |
          gh release download v0.1.0 \
            --repo NVIDIA-GitHub-Management/github-audits \
            --pattern users_sso.json \
            --output users_sso.json || echo '{}' > users_sso.json
        env:
          GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}

      # Approves the oldest waiting "CICD Megatron-LM" runs of this matrix cell
      # until queued + running + newly approved runs reach the concurrency limit.
      - name: Approve waiting deployments
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
          MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
          MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 1 }}
          CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
          TARGET_BRANCH: ${{ matrix.branch }}
          SSO_USERS_FILE: users_sso.json
          PYTHONUNBUFFERED: 1
        shell: python
        run: |
          import functools
          import json
          import os
          import re
          import sys

          import requests

          WORKFLOW_NAME = "CICD Megatron-LM"
          PR_BRANCH_PATTERN = re.compile(r"pull-request/(\d+)")
          REQUEST_TIMEOUT = 30  # seconds; a hung connection must not stall the queue
          PAGE_SIZE = 100       # largest page size the REST API accepts (default is 30)

          # GitHub API configuration
          GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
          REPO = os.environ["GITHUB_REPOSITORY"]
          CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
          TARGET_BRANCH = os.environ["TARGET_BRANCH"]
          if CONTRIBUTOR_TYPE == "external":
              # Global limit across all branches — no division needed since we count globally.
              MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"])
          else:
              # NOTE(review): with MAX_CONCURRENCY=1 (the default above) this is 0,
              # so internal runs are never approved — confirm that is intended.
              MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2
          API_BASE = f"https://api.github.com/repos/{REPO}"

          # Load SSO users for internal/external classification
          with open(os.environ["SSO_USERS_FILE"]) as f:
              sso_users = json.load(f)

          # Headers for GitHub API
          headers = {
              "Authorization": f"token {GITHUB_TOKEN}",
              "Accept": "application/vnd.github.v3+json",
              "X-GitHub-Api-Version": "2022-11-28",
          }

          def make_request(endpoint, method="GET", data=None):
              """Call the GitHub API; return the decoded JSON body, or None on any request error."""
              url = f"{API_BASE}/{endpoint}"
              try:
                  if method == "GET":
                      response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                  else:
                      response = requests.post(url, headers=headers, json=data, timeout=REQUEST_TIMEOUT)
                  response.raise_for_status()
                  return response.json()
              except requests.exceptions.RequestException as e:
                  print(f"Error making request to {endpoint}: {str(e)}")
                  # Connection errors and timeouts carry no response object.
                  error_response = getattr(e, "response", None)
                  if error_response is not None:
                      print(f"Response: {error_response.text}")
                  return None

          def list_workflow_runs(status):
              """Return the workflow runs in the given status; abort the job if the listing fails."""
              payload = make_request(f"actions/runs?status={status}&per_page={PAGE_SIZE}")
              if payload is None:
                  print(f"Could not list workflow runs with status={status}")
                  sys.exit(1)
              return payload.get("workflow_runs", [])

          @functools.lru_cache(maxsize=None)
          def fetch_pr(pr_number):
              """Fetch a PR once per job; the same run is inspected by several filters."""
              return make_request(f"pulls/{pr_number}")

          def is_internal_contributor(pr_info):
              """Return True if the PR author has the Member role in the NVIDIA or NVIDIA-NeMo org."""
              login = (pr_info.get("user") or {}).get("login", "")
              org_roles = sso_users.get(login, {}).get("org_roles", [])
              return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)

          def get_pr_base_branch(workflow_run):
              """
              Resolve the PR behind a workflow run from a head branch like 'pull-request/1913'.
              Returns (base_branch, pr_info), or (None, None) if the run is not a PR run
              or the PR could not be fetched.
              """
              head_branch = workflow_run.get("head_branch") or ""  # the field may be null
              match = PR_BRANCH_PATTERN.match(head_branch)
              if not match:
                  return None, None  # Not a PR branch pattern

              pr_number = int(match.group(1))
              pr_info = fetch_pr(pr_number)
              if not pr_info:
                  print(f"Failed to fetch PR #{pr_number}")
                  return None, None

              base_branch = (pr_info.get("base") or {}).get("ref")
              return base_branch, pr_info

          def matches_contributor(workflow_run, contributor_type):
              """Return True if the workflow run matches the contributor type (ignores branch)."""
              _, pr_info = get_pr_base_branch(workflow_run)
              if pr_info is None:
                  return False
              internal = is_internal_contributor(pr_info)
              return (contributor_type == "internal") == internal

          def matches_queue(workflow_run, target_branch, contributor_type):
              """
              Return True if the workflow run belongs to this queue cell:
              matching target branch AND matching contributor type (internal/external).
              """
              base_branch, pr_info = get_pr_base_branch(workflow_run)
              if base_branch is None:
                  return False

              branch_match = (
                  (base_branch == target_branch) or
                  (base_branch != "main" and base_branch != "dev" and target_branch == "others")
              )
              if not branch_match:
                  return False

              internal = is_internal_contributor(pr_info)
              contributor_match = (contributor_type == "internal") == internal
              if contributor_match:
                  pr_number = pr_info.get("number")
                  print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})")
              return contributor_match

          def counts_toward_limit(run):
              """
              External contributors share one global limit across ALL branches;
              internal contributors are limited per target branch.
              """
              if run["name"] != WORKFLOW_NAME:
                  return False
              if CONTRIBUTOR_TYPE == "external":
                  return matches_contributor(run, CONTRIBUTOR_TYPE)
              return matches_queue(run, TARGET_BRANCH, CONTRIBUTOR_TYPE)

          # Get current running and queued workflows
          print("Fetching workflow runs...")
          queued_workflow_runs = [run for run in list_workflow_runs("queued") if counts_toward_limit(run)]
          in_progress_workflow_runs = [run for run in list_workflow_runs("in_progress") if counts_toward_limit(run)]

          # Count running and queued workflows
          queued_workflows = len(queued_workflow_runs)
          in_progress_workflows = len(in_progress_workflow_runs)

          total_workflows = queued_workflows + in_progress_workflows
          print(f"Current queued workflows (PRs targeting {TARGET_BRANCH}, {CONTRIBUTOR_TYPE}): {queued_workflows}")
          print(f"Current running workflows (PRs targeting {TARGET_BRANCH}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}")
          print(f"Total workflows: {total_workflows}")
          print(f"Max concurrency: {MAX_CONCURRENCY}")

          if total_workflows >= MAX_CONCURRENCY:
              print("Maximum concurrency reached, no new approvals will be made")
              sys.exit(0)

          # Get waiting CI workflows for test environment
          print("Fetching deployments...")
          pending_workflows = list_workflow_runs("waiting")
          print("Pending workflows:", len(pending_workflows))
          # Approvals are always scoped to this cell's branch and contributor type.
          pending_workflows = [run for run in pending_workflows
                              if run["name"] == WORKFLOW_NAME and matches_queue(run, TARGET_BRANCH, CONTRIBUTOR_TYPE)]

          # Sort deployments by creation date (oldest first)
          print("Sorting workflows...")
          pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])

          # Process each deployment
          print(f"Processing {len(pending_workflows)} pending workflows...")
          for workflow in pending_workflows:
              if total_workflows >= MAX_CONCURRENCY:
                  print("Maximum concurrency reached, stopping approvals")
                  break

              workflow_id = workflow["id"]
              workflow_name = workflow["display_title"]
              print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")

              deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
              deployments = make_request(deployment_url)
              if deployments is None:
                  print(f"Failed to fetch pending deployments for run {workflow_id}")
                  sys.exit(1)
              if not deployments:
                  # Nothing is awaiting approval on this run (for example it was
                  # approved or cancelled since it was listed).
                  print(f"Run {workflow_id} has no pending deployments, skipping")
                  continue
              environment_id = deployments[0]["environment"]["id"]

              # Approve the deployment
              status_data = {
                  "environment_ids": [environment_id],
                  "state": "approved",
                  "comment": "Automatically approved by queue manager"
              }
              result = make_request(deployment_url, method="POST", data=status_data)

              if result:
                  total_workflows += 1
              else:
                  # Pending-deployment entries have no top-level id, so report the
                  # run and environment instead.
                  print(f"Failed to approve deployment for run {workflow_id} (environment {environment_id})")
                  sys.exit(1)
  # Posts a Slack message linking to this run when the approve-queue job failed.
  notify:
    if: failure()
    runs-on: ubuntu-latest
    needs: [approve-queue]
    steps:
      - name: Notify
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
          # Despite its name this is not a webhook: it is the Slack subteam
          # mention that is appended to the message as "cc".
          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_TEAM_GROUP_ID }}>
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          curl -X POST \
            -H 'Content-type: application/json' \
            --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
            $SLACK_WEBHOOK


================================================
FILE: .github/workflows/cicd-main.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: the "Approve Test Queue" workflow selects runs by this exact workflow name.
name: CICD Megatron-LM
on:
  schedule:
    # Every day at 00:00 UTC.
    - cron: 0 0 * * *
  push:
    branches:
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]
  workflow_dispatch:

# One active run per workflow and ref: a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.event.merge_group.head_ref || github.ref }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

# Workflow-level variables holding the container registry locations.
env:
  container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com
  container-registry-gb200: us-east4-docker.pkg.dev/nv-projdgxchipp-20260113193621/megatron-lm

jobs:
  # Decides whether the run is treated as a "maintainer" run and, from that,
  # which self-hosted runners the downstream jobs should use.
  is-not-external-contributor:
    runs-on: ubuntu-latest
    if: github.repository == 'NVIDIA/Megatron-LM'
    outputs:
      # NOTE(review): github.event.pull_request is only populated for
      # pull_request-family events, none of which appear in this workflow's
      # `on:` triggers — confirm this output is still meaningful.
      is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }}
      # 'true' / 'false', written by the "Set maintainer status" step.
      is_maintainer: ${{ steps.check-membership.outputs.is_maintainer }}
      # Maintainers get the regular x8 runner, everyone else the ephemeral one.
      selected_runner: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-aws-gpu-x8' || 'nvidia-ci-aws-gpu-x8-ephemeral' }}
      # Non-maintainers fall back to a GitHub-hosted runner for the GB200 lane.
      selected_runner_gb200: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-gcp-gpu-x4' || 'ubuntu-latest' }}
    permissions:
      issues: write
      pull-requests: write
    env:
      # Token used for checkout and for the collaborator/org membership API calls.
      GITHUB_TOKEN: ${{ secrets.PAT }}
      REPO: ${{ github.repository }}
      # When "true", authors who are not SSO members must be repo/org members
      # or the job fails.
      DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          token: ${{ env.GITHUB_TOKEN }}

      # Only runs for pushes to pull-request/<N> branches; on other events the
      # pr-info output is empty and later steps fall back to '{}'.
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      # Local composite action; its `is_member` output is consumed below.
      - name: Check NVIDIA SSO membership
        id: check-sso
        uses: ./.github/actions/check-nvidia-sso-membership
        with:
          username: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
          github_token: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
          sso_users_filename: ${{ vars.SSO_USERS_FILENAME }}

      # Writes `is_maintainer` to the step output:
      #   * scheduled, main-branch and merge-group runs are always 'true';
      #   * otherwise the SSO check result is used;
      #   * if DISABLE_EXTERNAL_CONTRIBUTOR is "true" and the author is not an
      #     SSO member, three GitHub API endpoints (repo collaborators,
      #     NVIDIA-NeMo members, NVIDIA members) are probed and the step exits 1
      #     unless at least one of them answers HTTP 204.
      - name: Set maintainer status
        id: check-membership
        env:
          IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }}
          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
          SCHEDULED_JOB: ${{ github.event_name == 'schedule' }}
        run: |
          # Skip SSO check for scheduled jobs, main branch, or merge groups
          if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
            echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
            exit 0
          fi

          # Use SSO membership check result
          IS_MEMBER="${{ steps.check-sso.outputs.is_member }}"

          # If external contributor is disabled, check if user is a repo collaborator or an org collaborator to NVIDIA or NVIDIA-NeMo
          if [ "${{ env.DISABLE_EXTERNAL_CONTRIBUTOR }}" == "true" ] && [ "${{ steps.check-sso.outputs.is_member }}" != "true" ]; then
            PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}

            echo "Checking if $PR_AUTHOR is a repo collaborator..."
            API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR"
            REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
              -H "Accept: application/vnd.github+json" \
              -H "Authorization: Bearer $GITHUB_TOKEN" \
              -H "X-GitHub-Api-Version: 2022-11-28" \
              $API_URL)

            echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..."
            API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR"
            ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
              -H "Accept: application/vnd.github+json" \
              -H "Authorization: Bearer $GITHUB_TOKEN" \
              -H "X-GitHub-Api-Version: 2022-11-28" \
              $API_URL)

            echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..."
            API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR"
            ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
              -H "Accept: application/vnd.github+json" \
              -H "Authorization: Bearer $GITHUB_TOKEN" \
              -H "X-GitHub-Api-Version: 2022-11-28" \
              $API_URL)

            if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then
              IS_MEMBER="true"
            else
              exit 1
            fi
          fi

          # Use SSO membership check result
          if [ "$IS_MEMBER" == "true" ]; then
            echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
          else
            echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT
          fi

  # Calls the shared pre-flight reusable workflow; later jobs read its outputs
  # (is_ci_workload, is_deployment_workflow, is_merge_group, docs_only,
  # force_run_all) to decide whether to run.
  pre-flight:
    if: github.repository == 'NVIDIA/Megatron-LM'
    needs:
      - is-not-external-contributor
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  # Runs the repository auto-formatter in check-only mode against the PR base.
  linting:
    runs-on: ubuntu-latest
    needs: [pre-flight]
    # Never runs for deployment workflows. Otherwise runs for CI workloads,
    # and for non-CI workloads unless the change is docs-only.
    if: |
      (
        needs.pre-flight.outputs.is_deployment_workflow == 'false'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
      ) || (
        needs.pre-flight.outputs.is_deployment_workflow == 'false'
          && needs.pre-flight.outputs.is_ci_workload == 'false'
          && needs.pre-flight.outputs.docs_only == 'false'
      )
    steps:
      # Full history (fetch-depth: 0); presumably needed so the formatter can
      # diff against BASE_REF — confirm in tools/autoformat.sh.
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Install uv
        uses: astral-sh/setup-uv@v1
        with:
          version: 0.7.2

      # Installs only the "linting" dependency group from the lock file.
      - name: Install linting tools
        run: |
          uv sync --locked --only-group linting

      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      # Gated on the same condition as "Get PR info", so pr-info is always
      # populated when BASE_REF is read from it.
      - name: Run linting
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        run: |
          export PATH=".venv/bin:$PATH"
          export GITLAB_ENDPOINT=github.com
          export CI_PROJECT_NAMESPACE=NVIDIA
          export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}"
          export CHECK_ONLY=true
          export SKIP_DOCS=false
          bash tools/autoformat.sh

  # Gate job bound to the "test" environment. It is skipped for CI workloads,
  # deployment workflows, merge groups and docs-only changes.
  cicd-wait-in-queue:
    needs:
      - pre-flight
      - linting
    runs-on: ubuntu-latest
    environment: test
    if: |
      needs.pre-flight.outputs.is_ci_workload != 'true'
      && needs.pre-flight.outputs.is_deployment_workflow != 'true'
      && needs.pre-flight.outputs.is_merge_group != 'true'
      && needs.pre-flight.outputs.docs_only != 'true'
    steps:
      - name: Running CI tests
        run: |
          echo "Running CI tests"
          echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}"

  # Chooses which Megatron-Bridge test suite the downstream trigger should run.
  cicd-parse-downstream-testing:
    runs-on: ubuntu-latest
    needs:
      - pre-flight
      - cicd-wait-in-queue
    # Runs when the needed jobs were not cancelled and either they succeeded or
    # pre-flight marks the run as CI workload / force-run-all / merge group.
    if: |
      needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    outputs:
      # "all" when the PR carries the "Run MBridge tests" label, else "unit-only".
      mbridge-test-suite: ${{ steps.select-mbridge-test-suite.outputs.main }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      # NOTE(review): when "Get PR info" is skipped, PR_NUMBER expands to an
      # empty string — confirm the resulting (possibly empty) output is what
      # the downstream workflow expects.
      - name: Select MBridge test suite
        id: select-mbridge-test-suite
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          TEST_SUITE=$(gh pr view $PR_NUMBER --json labels | jq -r 'if [.labels[].name] | any(. == "Run MBridge tests") then "all" else "unit-only" end')
          echo "main=$TEST_SUITE" | tee -a $GITHUB_OUTPUT

      - name: How-To
        run: bash .github/scripts/readme.sh

  # Triggers the Megatron-Bridge CI against this commit: creates a temporary
  # branch in NVIDIA-NeMo/Megatron-Bridge, dispatches its cicd-main.yml on that
  # branch, then deletes the branch again.
  cicd-mbridge-testing:
    runs-on: ubuntu-latest
    needs:
      - pre-flight
      - cicd-wait-in-queue
      - cicd-parse-downstream-testing
    if: |
      needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-parse-downstream-testing.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    steps:
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      # Checks out Megatron-Bridge main into ./megatron-bridge using the PAT,
      # which the later `git push` calls rely on.
      - name: Checkout MBridge and create testing branch
        uses: actions/checkout@v6
        with:
          ref: main
          repository: NVIDIA-NeMo/Megatron-Bridge
          path: megatron-bridge
          token: ${{ secrets.PAT }}

      # Branch name is mcore-testing-<PR number>, or mcore-testing-<run id>
      # when there is no PR. The same expression is repeated in the trigger and
      # delete steps below and must stay in sync.
      - name: Create testing branch
        env:
          MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
        run: |
          cd megatron-bridge
          git fetch origin main
          git checkout -b ${{ env.MBRIDGE_BRANCH_NAME }} origin/main
          git push origin ${{ env.MBRIDGE_BRANCH_NAME }} --force

      # Resolves the commit to test: the PR merge commit for PR branches, the
      # merge-group head for merge queues, otherwise GITHUB_SHA.
      - name: Get merge commit sha
        shell: bash -x -e -u -o pipefail {0}
        id: sha
        env:
          IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
        run: |
          if [[ "$IS_PR" == "true" ]]; then
            SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }}
          elif [[ "$IS_MERGE_GROUP" == "true" ]]; then
            SHA=${{ github.event.merge_group.head_sha }}
          else
            SHA=${GITHUB_SHA}
          fi
          echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"

      # Dispatches the downstream workflow on the temporary branch.
      # propagate_failure presumably fails this job when the downstream run
      # fails — confirm against the action's documentation.
      - name: Trigger MBridge tests
        uses: convictional/trigger-workflow-and-wait@v1.6.5
        env:
          MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
        with:
          owner: NVIDIA-NeMo
          repo: Megatron-Bridge
          workflow_file_name: cicd-main.yml
          github_token: ${{ secrets.PAT }}
          ref: ${{ env.MBRIDGE_BRANCH_NAME }}
          wait_interval: 60
          propagate_failure: true
          client_payload: |
            {
              "mcore_ref": "${{ steps.sha.outputs.main }}",
              "test_suite": "${{ needs.cicd-parse-downstream-testing.outputs.mbridge-test-suite }}",
              "triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }

      # Cleanup runs even when earlier steps failed.
      - name: Delete testing branch
        if: always()
        env:
          MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
        run: |
          cd megatron-bridge
          git push origin --delete ${{ env.MBRIDGE_BRANCH_NAME }}

  # Emits the container-build matrix as JSON: always an AWS entry, plus a GCP
  # entry when the run is a maintainer run.
  cicd-compute-build-matrix:
    needs:
      - is-not-external-contributor
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.compute.outputs.matrix }}
    steps:
      - id: compute
        name: Compute build matrix
        env:
          REGISTRY_AWS: ${{ env.container-registry }}
          REGISTRY_GCP: ${{ env.container-registry-gb200 }}
          SELECTED_RUNNER: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
          SELECTED_RUNNER_GB200: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }}
          IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }}
        run: |
          # Start with the AWS entry, which is always built.
          aws_entry=$(jq -nc --arg runner "$SELECTED_RUNNER" --arg registry "$REGISTRY_AWS" \
            '{cloud: "aws", registry: $registry, runner: $runner}')
          matrix=$(jq -nc --argjson aws "$aws_entry" '{include: [$aws]}')

          # Maintainer runs additionally build for GCP.
          if [ "$IS_MAINTAINER" == "true" ]; then
            gcp_entry=$(jq -nc --arg runner "$SELECTED_RUNNER_GB200" --arg registry "$REGISTRY_GCP" \
              '{cloud: "gcp", registry: $registry, runner: $runner}')
            matrix=$(jq -c --argjson gcp "$gcp_entry" '.include += [$gcp]' <<< "$matrix")
          fi

          echo "matrix=$matrix" | tee -a "$GITHUB_OUTPUT"

  # Builds and pushes the CI container image, once per entry of the matrix
  # produced by cicd-compute-build-matrix (each entry supplies `runner` and
  # `registry`).
  cicd-container-build:
    needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue, cicd-compute-build-matrix]
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.cicd-compute-build-matrix.outputs.matrix) }}
    runs-on: ${{ matrix.runner }}
    if: |
      needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-compute-build-matrix.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
      )
      && !cancelled()
    steps:
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      # Resolves the commit to build: the PR merge commit for PR branches, the
      # merge-group head for merge queues, otherwise GITHUB_SHA.
      - name: Get merge commit sha
        shell: bash -x -e -u -o pipefail {0}
        id: sha
        env:
          IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
        run: |
          if [[ "$IS_PR" == "true" ]]; then
            SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }}
          elif [[ "$IS_MERGE_GROUP" == "true" ]]; then
            SHA=${{ github.event.merge_group.head_sha }}
          else
            SHA=${GITHUB_SHA}
          fi
          echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"

      - name: Checkout
        uses: actions/checkout@v6
        with:
          ref: ${{ steps.sha.outputs.main }}

      - name: Setup python
        uses: actions/setup-python@v6
        with:
          python-version: 3.12

      # Installed once here; the label lookup and the GraphQL query below both
      # use `gh`.
      - name: Install GH CLI
        shell: bash -x -e -u -o pipefail {0}
        run: |
          apt-get update
          apt-get install -y gh

      # Outputs main=true when the PR carries the "container::lts" label,
      # otherwise main=false.
      - name: Has lts label
        id: has-lts-label
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          # Fall back to "false" when the lookup fails. Assign the fallback
          # instead of echoing it, so it actually reaches the step output.
          HAS_LTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "container::lts")') || HAS_LTS_LABEL="false"
          # Without pipefail a failed `gh` leaves the value empty; normalise it.
          HAS_LTS_LABEL=${HAS_LTS_LABEL:-false}
          echo "main=$HAS_LTS_LABEL" | tee -a $GITHUB_OUTPUT

      # Downloads the unit-test dataset into ./assets before the build.
      - name: Download test data
        shell: bash
        run: |
          echo "::group::Download test data"
          pip install --no-cache-dir click requests
          python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
          echo "::endgroup::"

      # Turns the 100 most recently updated merged PRs into registry
      # build-cache references, used as extra cache-from sources below.
      - name: Get last merged PR
        id: cache_from
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          LAST_PRS=$(gh api graphql -f query='
            query {
              repository(owner: "NVIDIA", name: "Megatron-LM") {
                pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
                  nodes {
                    number
                  }
                }
              }
            }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
              echo "type=registry,ref=${{ matrix.registry }}/megatron-lm:$number-buildcache,mode=max"
            done)

          echo "LAST_PRS<<EOF" | tee -a $GITHUB_OUTPUT
          echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT
          echo "EOF" | tee -a $GITHUB_OUTPUT

      # Picks the base image version file (lts vs dev) based on the PR label.
      - name: Parse baseimage
        shell: bash
        id: base-image
        env:
          HAS_LTS_LABEL: ${{ steps.has-lts-label.outputs.main }}
        run: |
          if [ "$HAS_LTS_LABEL" == "true" ]; then
            NGC_VERSION=$(cat docker/.ngc_version.lts)
            echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
            echo "image_type=lts" | tee -a $GITHUB_OUTPUT
          else
            NGC_VERSION=$(cat docker/.ngc_version.dev)
            echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
            echo "image_type=dev" | tee -a $GITHUB_OUTPUT
          fi

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # The image is tagged with both the PR number (0 when there is no PR) and
      # github.sha; the test jobs pull it by the github.sha tag.
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          file: ./docker/Dockerfile.ci.dev
          push: true
          context: .
          target: main
          build-args: |
            FROM_IMAGE_NAME=${{ steps.base-image.outputs.version }}
            IMAGE_TYPE=${{ steps.base-image.outputs.image_type }}
          cache-from: |
            type=registry,ref=${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
            type=registry,ref=${{ matrix.registry }}/megatron-lm:main-buildcache,mode=max
            ${{ steps.cache_from.outputs.LAST_PRS }}
          cache-to: |
            type=registry,ref=${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
          no-cache: false
          tags: |
            ${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
            ${{ matrix.registry }}/megatron-lm:${{ github.sha }}
          secrets: |
            GH_TOKEN=${{ secrets.PAT }}

  # Converts the H100 unit-test recipe into the JSON matrix consumed by
  # cicd-unit-tests-latest.
  cicd-parse-unit-tests:
    runs-on: ubuntu-latest
    outputs:
      # Compact JSON array of {"bucket": <test_case>} objects.
      unit-tests: ${{ steps.parse-unit-tests.outputs.unit-tests }}
    needs:
      - pre-flight
      - cicd-wait-in-queue
      - cicd-container-build
    if: |
      needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-container-build.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      # NOTE(review): the objects built here only carry a `bucket` key, so
      # sort_by(.model, .test_case) has no keys to order by — confirm whether
      # sorting by .bucket was intended.
      - name: Parse unit tests
        id: parse-unit-tests
        run: |
          cat tests/test_utils/recipes/h100/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json
          echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT

  # Fans out over the buckets emitted by cicd-parse-unit-tests and runs each
  # one through the local composite action against the freshly built image.
  cicd-unit-tests-latest:
    name: "${{ matrix.bucket }} - latest"
    needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue, cicd-container-build, cicd-parse-unit-tests]
    if: |
      !cancelled()
      && needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-container-build.result != 'cancelled'
      && needs.cicd-parse-unit-tests.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
    timeout-minutes: 60
    strategy:
      matrix:
        include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }}
      # Let the remaining buckets finish even when one of them fails.
      fail-fast: false
    env:
      PIP_ROOT_USER_ACTION: ignore
      PIP_NO_PYTHON_VERSION_WARNING: 1
      PIP_DISABLE_PIP_VERSION_CHECK: 1
    steps:
      - uses: actions/checkout@v6
        name: Checkout
      - uses: ./.github/actions
        name: main
        with:
          is_unit_test: "true"
          test_case: ${{ matrix.bucket }}
          tag: latest
          # Per-bucket timeout from the matrix, defaulting to 30.
          timeout: ${{ matrix.timeout || 30 }}
          container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
          is_ci_workload: ${{ needs.pre-flight.outputs.is_ci_workload }}
          PAT: ${{ secrets.PAT }}

  # Generates the H100 integration-test matrix. The scope depends on PR labels:
  #   "Run tests"            -> mr-github in lightweight mode
  #   "Run functional tests" -> mr-github (also used for merge groups)
  #   neither                -> mr-github-slim
  cicd-parse-integration-tests-h100:
    runs-on: ubuntu-latest
    needs:
      - pre-flight
      - cicd-wait-in-queue
      - cicd-container-build
      - cicd-unit-tests-latest
    if: |
      needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-container-build.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    outputs:
      # Compact JSON array of {"model": ..., "test_case": ...} objects.
      integration-tests-h100: ${{ steps.main.outputs.integration-tests-h100 }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      - name: Has Run tests label
        id: has-run-tests-label
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          # Fall back to "false" when the lookup fails. Assign the fallback
          # instead of echoing it, so it actually reaches the step output.
          HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || HAS_RUN_TESTS_LABEL="false"
          # Without pipefail a failed `gh` leaves the value empty; normalise it.
          HAS_RUN_TESTS_LABEL=${HAS_RUN_TESTS_LABEL:-false}
          echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT

      # When the label lookup yields nothing, the value defaults to
      # pre-flight's is_ci_workload output.
      - name: Has Run functional tests label
        id: has-run-functional-tests-label
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          IS_CI_WORKLOAD: ${{ needs.pre-flight.outputs.is_ci_workload }}
        run: |
          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")')
          HAS_RUN_FUNCTIONAL_TESTS_LABEL=${HAS_RUN_FUNCTIONAL_TESTS_LABEL:-$IS_CI_WORKLOAD}
          echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT

      - name: Parse functional tests
        id: main
        env:
          HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.main }}
          # Merge-group runs are treated as if "Run functional tests" were set.
          HAS_RUN_FUNCTIONAL_TESTS_LABEL: ${{ steps.has-run-functional-tests-label.outputs.main == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
        run: |
          export PYTHONPATH=$(pwd)

          if [ "$HAS_RUN_TESTS_LABEL" == "true" ]; then
            ARGS=(
              --scope mr-github
              --enable-lightweight-mode
            )
          elif [ "$HAS_RUN_FUNCTIONAL_TESTS_LABEL" == "true" ]; then
            ARGS=(
              --scope mr-github
            )
          else
            ARGS=(
              --scope mr-github-slim
            )
          fi

          # Quote the array expansion so each element stays a single argument.
          python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
            --n-repeat 5 \
            --time-limit 2700 \
            --test-cases all \
            --container-image mcore_ci_dev \
            --container-tag latest \
            --dependent-job functional:configure \
            --record-checkpoints false \
            --slurm-account gh \
            --no-enable-warmup \
            --environment dev \
            --platform dgx_h100 \
            --cluster ghci \
            "${ARGS[@]}" \
            --output-path integration-tests-h100.yaml

          # Drop the pipeline-level keys and reduce each generated job to
          # {model, test_case} for the test matrix.
          cat integration-tests-h100.yaml | \
            yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c  > integration-tests-h100.json

          echo "integration-tests-h100=$(cat integration-tests-h100.json)" | tee -a "$GITHUB_OUTPUT"

  # Runs every {model, test_case} pair from the H100 matrix through the local
  # composite action against the freshly built image.
  cicd-integration-tests-latest-h100:
    name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
    needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue, cicd-parse-integration-tests-h100, cicd-unit-tests-latest]
    if: |
      !cancelled()
      && needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-parse-integration-tests-h100.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
    timeout-minutes: 60
    strategy:
      matrix:
        include: ${{ fromJson(needs.cicd-parse-integration-tests-h100.outputs.integration-tests-h100) }}
      # Let the remaining test cases finish even when one of them fails.
      fail-fast: false
    env:
      PIP_ROOT_USER_ACTION: ignore
      PIP_NO_PYTHON_VERSION_WARNING: 1
      PIP_DISABLE_PIP_VERSION_CHECK: 1
    steps:
      - uses: actions/checkout@v6
        name: Checkout
      - uses: ./.github/actions
        name: main
        with:
          is_unit_test: "false"
          model: ${{ matrix.model }}
          test_case: ${{ matrix.test_case }}
          tag: latest
          # Per-case timeout from the matrix, defaulting to 30.
          timeout: ${{ matrix.timeout || 30 }}
          container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
          is_ci_workload: ${{ needs.pre-flight.outputs.is_ci_workload }}
          is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}
          PAT: ${{ secrets.PAT }}

  # Generates the GB200 integration-test matrix (maintainer runs only). The
  # scope depends on PR labels:
  #   "Run tests"            -> mr-github in lightweight mode
  #   "Run functional tests" -> mr-github (also used for merge groups)
  #   neither                -> mr-github-slim
  cicd-parse-integration-tests-gb200:
    runs-on: ubuntu-latest
    needs:
      - is-not-external-contributor
      - pre-flight
      - cicd-wait-in-queue
      - cicd-container-build
      - cicd-unit-tests-latest
    if: |
      needs.is-not-external-contributor.outputs.is_maintainer == 'true'
      && needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-container-build.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    outputs:
      # Compact JSON array of {"model": ..., "test_case": ...} objects.
      integration-tests-gb200: ${{ steps.main.outputs.integration-tests-gb200 }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      - name: Has Run tests label
        id: has-run-tests-label
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          # Fall back to "false" when the lookup fails. Assign the fallback
          # instead of echoing it, so it actually reaches the step output.
          HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || HAS_RUN_TESTS_LABEL="false"
          # Without pipefail a failed `gh` leaves the value empty; normalise it.
          HAS_RUN_TESTS_LABEL=${HAS_RUN_TESTS_LABEL:-false}
          echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT

      # When the label lookup yields nothing, the value defaults to
      # pre-flight's is_ci_workload output.
      - name: Has Run functional tests label
        id: has-run-functional-tests-label
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          IS_CI_WORKLOAD: ${{ needs.pre-flight.outputs.is_ci_workload }}
        run: |
          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")')
          HAS_RUN_FUNCTIONAL_TESTS_LABEL=${HAS_RUN_FUNCTIONAL_TESTS_LABEL:-$IS_CI_WORKLOAD}
          echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT

      - name: Parse functional tests
        id: main
        env:
          HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.main }}
          # Merge-group runs are treated as if "Run functional tests" were set.
          HAS_RUN_FUNCTIONAL_TESTS_LABEL: ${{ steps.has-run-functional-tests-label.outputs.main == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
        run: |
          export PYTHONPATH=$(pwd)

          if [ "$HAS_RUN_TESTS_LABEL" == "true" ]; then
            ARGS=(
              --scope mr-github
              --enable-lightweight-mode
            )
          elif [ "$HAS_RUN_FUNCTIONAL_TESTS_LABEL" == "true" ]; then
            ARGS=(
              --scope mr-github
            )
          else
            ARGS=(
              --scope mr-github-slim
            )
          fi

          # Quote the array expansion so each element stays a single argument.
          python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
            --n-repeat 5 \
            --time-limit 2700 \
            --test-cases all \
            --container-image mcore_ci_dev \
            --container-tag latest \
            --dependent-job functional:configure \
            --record-checkpoints false \
            --slurm-account gh \
            --no-enable-warmup \
            --environment dev \
            --platform dgx_gb200 \
            --cluster dgxgb200_oci-hsg \
            "${ARGS[@]}" \
            --output-path integration-tests-gb200.yaml

          # Drop the pipeline-level keys and reduce each generated job to
          # {model, test_case} for the test matrix.
          cat integration-tests-gb200.yaml | \
            yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c  > integration-tests-gb200.json

          echo "integration-tests-gb200=$(cat integration-tests-gb200.json)" | tee -a "$GITHUB_OUTPUT"

  # Runs every {model, test_case} pair from the GB200 matrix through the local
  # composite action. Maintainer runs only; uses the GB200 runner and registry.
  cicd-integration-tests-latest-gb200:
    name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
    needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue, cicd-parse-integration-tests-gb200, cicd-unit-tests-latest]
    if: |
      !cancelled()
      && needs.is-not-external-contributor.outputs.is_maintainer == 'true'
      && needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-parse-integration-tests-gb200.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }}
    timeout-minutes: 60
    strategy:
      matrix:
        include: ${{ fromJson(needs.cicd-parse-integration-tests-gb200.outputs.integration-tests-gb200) }}
      # Let the remaining test cases finish even when one of them fails.
      fail-fast: false
    env:
      PIP_ROOT_USER_ACTION: ignore
      PIP_NO_PYTHON_VERSION_WARNING: 1
      PIP_DISABLE_PIP_VERSION_CHECK: 1
    steps:
      - uses: actions/checkout@v6
        name: Checkout
      - uses: ./.github/actions
        name: main
        with:
          platform: dgx_gb200
          is_unit_test: "false"
          model: ${{ matrix.model }}
          test_case: ${{ matrix.test_case }}
          tag: latest
          # Per-case timeout from the matrix, defaulting to 30.
          timeout: ${{ matrix.timeout || 30 }}
          container-image: ${{ env.container-registry-gb200 }}/megatron-lm:${{ github.sha }}
          is_ci_workload: ${{ needs.pre-flight.outputs.is_ci_workload }}
          is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}
          PAT: ${{ secrets.PAT }}

  # Final gate job: aggregates the results of the unit-test and integration-test
  # jobs listed under `needs` and fails if any required one did not succeed.
  Nemo_CICD_Test:
    needs:
      - pre-flight
      - is-not-external-contributor
      - cicd-unit-tests-latest
      - cicd-integration-tests-latest-h100
      - cicd-integration-tests-latest-gb200
    # NOTE(review): because the parenthesised group ends in `|| always()`, it is
    # always true; the effective condition is `!cancelled()` plus the repository
    # check. The pre-flight terms appear to be kept for readability only.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    # NOTE(review): the visible steps only check out the repo and read the run's
    # job list via `gh run view`; confirm whether write-all is really required.
    permissions: write-all
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      # Decision logic, in order:
      #   1. docs-only / deployment runs exit 0 immediately.
      #   2. unit tests and H100 integration tests must report "success".
      #   3. GB200 integration tests may be "skipped" unless IS_MAINTAINER is "true".
      #   4. any completed job of this run that concluded "failure" or "cancelled"
      #      fails the gate, except jobs named merge-queue-notification and
      #      cicd-mbridge-testing. If the `gh run view` query itself fails, the
      #      count falls back to 0.
      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }}
          IS_DEPLOYMENT: ${{ needs.pre-flight.outputs.is_deployment_workflow }}
          IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }}
          UNIT_RESULT: ${{ needs.cicd-unit-tests-latest.result }}
          H100_RESULT: ${{ needs.cicd-integration-tests-latest-h100.result }}
          GB200_RESULT: ${{ needs.cicd-integration-tests-latest-gb200.result }}
        run: |
          # Docs-only and deployment workflows intentionally skip all tests
          if [ "$DOCS_ONLY" == "true" ] || [ "$IS_DEPLOYMENT" == "true" ]; then
            echo "✅ Docs-only or deployment workflow — test checks skipped"
            exit 0
          fi

          FAILED=false

          # Unit tests must always succeed (never skipped or cancelled)
          if [ "$UNIT_RESULT" != "success" ]; then
            echo "❌ cicd-unit-tests-latest: $UNIT_RESULT"
            FAILED=true
          fi

          # H100 integration tests must always succeed
          if [ "$H100_RESULT" != "success" ]; then
            echo "❌ cicd-integration-tests-latest-h100: $H100_RESULT"
            FAILED=true
          fi

          # GB200 integration tests may be skipped only for non-maintainer PRs
          # (no GB200 runners available); maintainer runs must always succeed
          if [ "$GB200_RESULT" == "skipped" ] && [ "$IS_MAINTAINER" == "true" ]; then
            echo "❌ cicd-integration-tests-latest-gb200: skipped unexpectedly for a maintainer run"
            FAILED=true
          elif [ "$GB200_RESULT" != "success" ] && [ "$GB200_RESULT" != "skipped" ]; then
            echo "❌ cicd-integration-tests-latest-gb200: $GB200_RESULT"
            FAILED=true
          fi

          # Broad scan: catch any individual job failures or cancellations
          # (e.g. a single matrix instance cancelled mid-run)
          BAD_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '
            [.jobs[] | select(
              .status == "completed"
              and (.conclusion == "failure" or .conclusion == "cancelled")
              and .name != "merge-queue-notification"
              and .name != "cicd-mbridge-testing"
            )] | length
          ') || BAD_JOBS=0

          if [ "${BAD_JOBS:-0}" -gt 0 ]; then
            echo "❌ Found ${BAD_JOBS} failed or cancelled job(s):"
            gh run view $GITHUB_RUN_ID --json jobs --jq '
              .jobs[] | select(
                .status == "completed"
                and (.conclusion == "failure" or .conclusion == "cancelled")
                and .name != "merge-queue-notification"
                and .name != "cicd-mbridge-testing"
              ) | .name + " → " + .conclusion
            '
            FAILED=true
          fi

          if [ "$FAILED" != "true" ]; then
            echo "✅ All previous jobs completed successfully"
          else
            exit 1
          fi

  # Publishes a synthetic successful `codecov/patch` commit status for runs that
  # are docs-only, deployment workflows, or merge-group validations, provided the
  # run is not a CI workload.
  Coverage_Fake:
    runs-on: ubuntu-latest
    needs: [Nemo_CICD_Test, pre-flight]
    # `github.event` is the whole webhook payload (an object) and can never equal
    # a string; the event type lives in `github.event_name`.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || github.event_name == 'merge_group'
      )
      && needs.pre-flight.outputs.is_ci_workload == 'false'
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Generate fake coverage report
        uses: actions/github-script@v8
        with:
          github-token: ${{ secrets.PAT }}
          script: |
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
              repo: context.repo.repo,
              sha: context.sha,
              state: 'success',
              description: 'No code changes - coverage check skipped',
              context: 'codecov/patch'
            });

  # Combines the per-job coverage artifacts of this run, uploads the result to
  # Codecov and stores the aggregated `.coverage` file as an artifact.
  Coverage:
    runs-on: ubuntu-latest
    # `pre-flight` must be listed here: the `needs` context only exposes jobs
    # that are direct dependencies, so without it
    # `needs.pre-flight.outputs.is_ci_workload` is always empty.
    needs: [Nemo_CICD_Test, pre-flight]
    if: |
      (
        (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
        || success()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    strategy:
      matrix:
        flag: [unit-test]
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Download coverage reports of current branch
        uses: actions/download-artifact@v7
        with:
          pattern: coverage-${{ matrix.flag }}-*

      # The name tests are grouped so that `-type f` applies to both patterns.
      - name: List coverage files
        run: |
          find . -type f \( -name "*.xml" -o -name "*.lcov" \)

      - name: Get total coverage of current branch
        shell: bash -x -e -u -o pipefail {0}
        if: always()
        run: |
          pip install coverage

          ls -al .
          ls -al coverage-*/
          coverage combine --keep $(ls coverage-*/.coverage)
          coverage report -i
          rm -rf coverage-*
          ls -al

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          verbose: true
          flags: ${{ matrix.flag }}

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
        with:
          name: coverage-${{ matrix.flag }}-aggregated
          path: |
            .coverage
          include-hidden-files: true

  # On merge-group events, comments on the originating PR with a link to this
  # workflow run.
  merge-queue-notification:
    runs-on: ubuntu-latest
    if: github.event_name == 'merge_group'
    permissions:
      pull-requests: write
    steps:
      - name: Extract PR number from merge group
        id: get-pr-number
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the ref can never be parsed as shell syntax.
          HEAD_REF: ${{ github.event.merge_group.head_ref }}
        run: |
          # Extract PR number from merge group head_ref (format: refs/heads/gh-readonly-queue/main/pr-<number>-<sha>)
          PR_NUMBER=$(echo "$HEAD_REF" | sed -n 's/.*\/pr-\([0-9]*\)-.*/\1/p')
          echo "pr_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"

      - name: Comment on PR with action run URL
        uses: actions/github-script@v8
        env:
          PR_NUMBER: ${{ steps.get-pr-number.outputs.pr_number }}
        with:
          github-token: ${{ secrets.PAT }}
          script: |
            // Read the PR number from the environment so an empty or malformed
            // value yields a clear failure instead of a JavaScript syntax error.
            const prNumber = Number(process.env.PR_NUMBER);
            if (!Number.isInteger(prNumber) || prNumber <= 0) {
              core.setFailed(`Could not determine PR number from merge group head_ref (got "${process.env.PR_NUMBER}")`);
              return;
            }
            const runUrl = `https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}`;

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              body: `🔄 Merge queue validation started!\n\nYou can track the progress here: ${runUrl}`
            });

  # Runs `taint-node.sh` on the selected runner after the test and coverage jobs
  # have finished, but only when that runner label contains "ephemeral" and the
  # run is not a deployment workflow.
  cleanup-taint-node:
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
    needs:
      # `pre-flight` is required so that its outputs are visible in `if` below.
      - pre-flight
      - is-not-external-contributor
      - cicd-container-build
      - cicd-unit-tests-latest
      - cicd-integration-tests-latest-h100
      - cicd-integration-tests-latest-gb200
      - Coverage
      - Coverage_Fake
    # `!x == 'true'` parses as `(!x) == 'true'`, which compares a boolean with a
    # string and is always false; `!=` expresses the intended check.
    if: |
      always()
      && !cancelled()
      && contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
      && needs.pre-flight.outputs.is_deployment_workflow != 'true'
    steps:
      - name: Taint node for cleanup
        shell: bash
        run: taint-node.sh


================================================
FILE: .github/workflows/claude-complexity-label.yml
================================================
name: Claude Complexity Label

on:
  pull_request_target:
    types: [ready_for_review]

jobs:
  # Asks Claude to classify the PR diff and apply exactly one of the labels
  # "docs-only", "complexity: low", "complexity: medium" or "complexity: high".
  label-complexity:
    name: Label PR Complexity
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
      issues: write
      id-token: write
    # Job-level environment; the prompt below refers to $REPO and $PR_NUMBER in
    # the `gh` commands it tells the model to run.
    env:
      GH_TOKEN: ${{ secrets.PAT }}
      REPO: ${{ github.repository }}
      PR_NUMBER: ${{ github.event.pull_request.number }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      # The prompt is runtime text sent to the model. `claude_args` restricts
      # the tools to `gh pr diff`, `gh pr edit` and `gh pr view`.
      - name: Run Claude Complexity Analysis
        uses: anthropics/claude-code-action@v1
        with:
          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
          github_token: ${{ secrets.PAT }}
          prompt: |
            REPO: ${{ env.REPO }}
            PR NUMBER: ${{ env.PR_NUMBER }}

            You are a PR complexity analyzer. Your job is to analyze the diff of this PR and apply exactly one complexity label.

            STEPS:
            1. Get the PR diff by running: gh pr diff $PR_NUMBER --repo $REPO
            2. Analyze every changed line (added or removed) in the diff and classify each as one of:
               - "docs-only": changes to docstrings, comments (lines starting with # or //), documentation files (.md, .rst, .txt), or similar non-functional text
               - "test": changes in test files (files with "test" in the name/path, or inside a tests/ directory)
               - "real code": all other changes (functional source code)
            3. Compute "real code line changes" using this formula:
               real_code_line_changes = (number of real code lines changed) + (number of test lines changed / 10)
               Count both added and removed lines. Do not count unchanged context lines. Do not count comments or docstrings.
            4. Remove any previously applied complexity or docs-only labels:
               gh pr edit $PR_NUMBER --repo $REPO --remove-label "complexity: low,complexity: medium,complexity: high,docs-only"
            5. Apply exactly ONE label using the gh CLI:
               - If there are ZERO real code lines and ZERO test lines (only docs-only changes), apply label "docs-only":
                 gh pr edit $PR_NUMBER --repo $REPO --add-label "docs-only"
               - If real_code_line_changes < 100, apply label "complexity: low":
                 gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: low"
               - If real_code_line_changes >= 100 and < 500, apply label "complexity: medium":
                 gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: medium"
               - If real_code_line_changes >= 500, apply label "complexity: high":
                 gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: high"

            Do NOT post any comments on the PR. Only apply the label.
          claude_args: |
            --allowedTools "Bash(gh pr diff:*),Bash(gh pr edit:*),Bash(gh pr view:*)"


================================================
FILE: .github/workflows/claude_review.yml
================================================
name: Claude Code Review

on:
  issue_comment:
    types: [created]

jobs:
  # Runs a Claude code review when a comment containing "/claude review" is
  # posted on a pull request.
  review-on-comment:
    name: Claude Review (comment trigger)
    # `github.event.issue.pull_request` is only set when the commented issue is
    # a pull request, so comments on plain issues do not trigger the job.
    if: |
      github.event_name == 'issue_comment' &&
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '/claude review')
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
      issues: write
      id-token: write
    env:
      GH_TOKEN: ${{ github.token }}
      REPO: ${{ github.repository }}
      PR_NUMBER: ${{ github.event.issue.number }}
    steps:
      # Resolve the PR's head commit with `gh pr view`; its SHA is passed to the
      # checkout step as `ref`.
      - name: Get PR head commit
        id: get-pr-head-commit
        run: |
          echo "sha=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid -q .headRefOid)" | tee -a $GITHUB_OUTPUT

      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          fetch-depth: 1
          ref: ${{ steps.get-pr-head-commit.outputs.sha }}

      # The prompt is runtime text sent to the model. `claude_args` restricts
      # the tools to inline comments and a few `gh pr` subcommands, and pins
      # the model.
      - name: Run Claude Code Review
        uses: anthropics/claude-code-action@v1
        with:
          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
          trigger_phrase: "/claude review"
          show_full_output: true
          claude_args: |
            --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr review:*)"
            --model "claude-opus-4-6"
          prompt: |
            REPO: ${{ env.REPO }}
            PR NUMBER: ${{ env.PR_NUMBER }}

            You are doing a light code review. Keep it concise and actionable.

            Focus ONLY on:
            - Critical bugs or logic errors
            - Typos in code, comments, or strings
            - Missing or insufficient test coverage for changed code
            - Outdated or inaccurate documentation affected by the changes

            Do NOT comment on:
            - Style preferences or formatting
            - Minor naming suggestions
            - Architectural opinions or refactoring ideas
            - Performance unless there is a clear, measurable issue

            Provide feedback using inline comments for specific code suggestions.
            Use top-level comments for general observations.

            It's perfectly acceptable to not have anything to comment on.
            If you do not have anything to comment on, post "LGTM".


================================================
FILE: .github/workflows/close-inactive-issue-pr.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Scheduled workflow that calls the shared FW-CI-templates reusable workflow
# `_close_inactive_issue_pr.yml`.
name: Stale-Close-Inactive-Issues-PRs

on:
  schedule:
    # Daily at 01:30 (GitHub evaluates cron schedules in UTC).
    - cron: '30 1 * * *'

jobs:
  close-issues:
    # Guard: only run in the upstream repository.
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_close_inactive_issue_pr.yml@v0.44.0
    if: github.repository == 'NVIDIA/Megatron-LM'


================================================
FILE: .github/workflows/community-bot.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Forwards issue and issue-comment activity to the shared FW-CI-templates
# reusable workflow `_community_bot.yml`.
name: Community Bot

on:
  issue_comment:
    types:
      - created
      - edited
      - deleted
  issues:
    types:
      - opened
      - edited
      - reopened
      - closed
      - deleted

jobs:
  community-bot:
    # Guard: only run in the upstream repository.
    if: github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.65.10
    secrets:
      GH_TOKEN: ${{ secrets.PAT }}
    with:
      community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }}


================================================
FILE: .github/workflows/config/changelog-config.json
================================================
{
    "categories": [],
    "ignore_labels": [
      "ignore"
    ],
    "sort": "ASC",
    "template": "\n${{CHANGELOG}}\n\n<details><summary>Changelog Details</summary>\n\n${{UNCATEGORIZED}}\n</details>\n",
    "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
    "commit_template": "- ${{TITLE}} by @${{AUTHOR}}",
    "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
    "duplicate_filter": {
      "pattern": ".+",
      "on_property": "title",
      "method": "match"
    },
    "transformers": [],
    "max_tags_to_fetch": 100,
    "max_pull_requests": 1250,
    "max_back_track_time_days": 365,
    "exclude_merge_branches": [],
    "tag_resolver": {
      "method": "semver"
    }
}


================================================
FILE: .github/workflows/copyright-check.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Copyright check

on:
  push:
    branches:
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]

jobs:
  # Calls the shared pre-flight reusable workflow; the other jobs in this file
  # read its outputs. Only runs in the upstream repository.
  pre-flight:
    if: ${{ github.repository == 'NVIDIA/Megatron-LM' }}
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  # Runs the shared copyright-header check, except for docs-only changes,
  # merge-group runs and deployment workflows.
  copyright-check:
    needs:
      - pre-flight
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.66.7
    if: |
      needs.pre-flight.outputs.docs_only != 'true'
      && needs.pre-flight.outputs.is_merge_group != 'true'
      && needs.pre-flight.outputs.is_deployment_workflow != 'true'
      && github.repository == 'NVIDIA/Megatron-LM'

  # Summary gate for this workflow: passes when every completed job of the run
  # succeeded, or when skipping is explicitly allowed (docs-only, deployment,
  # merge-group or CI-workload runs).
  copyright-check-summary:
    needs: [pre-flight, copyright-check]
    # The group ends in `|| always()`, so the effective condition is
    # `!cancelled()` plus the repository check.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
        run: |
          # Count completed jobs whose conclusion is anything but "success".
          # If the query itself fails, fall back to 0 by assigning the variable
          # (a bare `echo 0` would only print to the log).
          FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All previous jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed job(s)"
              # Show which jobs failed
              gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
              exit 1
          fi


================================================
FILE: .github/workflows/dependabot.yml
================================================
name: Dependabot
on:
  schedule:
    - cron: "0 8 * * 1"
  workflow_dispatch: # Allow manual triggering

permissions:
  id-token: write
  contents: write

jobs:
  # Finds the highest-versioned `core_r<major>.<minor>.<patch>` branch on the
  # remote and exposes it as the `mcore` job output.
  get-release-branch-names:
    if: github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    outputs:
      mcore: ${{ steps.get-branch.outputs.mcore_release_branch }}
    steps:
      - id: get-branch
        name: Get release branch names
        env:
          PAT: ${{ secrets.PAT }}
        run: |
          # List remote heads, keep only the version-like branch names, sort
          # them by version and take the last (highest) one.
          latest_branch=$(
            git ls-remote --heads https://token:${PAT}@github.com/NVIDIA/Megatron-LM.git 'refs/heads/core_r*' \
              | grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' \
              | sort -V \
              | tail -n1
          )
          echo "mcore_release_branch=$latest_branch" | tee -a $GITHUB_OUTPUT

  # Calls the local `_update_dependencies.yml` reusable workflow once per target
  # branch: the latest release branch found by the previous job, and `main`.
  bump-tags:
    if: github.repository == 'NVIDIA/Megatron-LM'
    needs:
      - get-release-branch-names
    uses: ./.github/workflows/_update_dependencies.yml
    strategy:
      # One branch failing must not cancel the update of the other.
      fail-fast: false
      matrix:
        include:
          - target-branch: ${{ needs.get-release-branch-names.outputs.mcore }}
          - target-branch: main
    with:
      target-branch: ${{ matrix.target-branch }}
    secrets:
      PAT: ${{ secrets.PAT }}
      SSH_KEY: ${{ secrets.SSH_KEY }}
      SSH_PWD: ${{ secrets.SSH_PWD }}

  # Posts a Slack message with a link to this run when the dependency update
  # failed.
  notify:
    needs:
      - bump-tags
    runs-on: ubuntu-latest
    if: failure() && github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Notify
        env:
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_TEAM_GROUP_ID }}>
        run: |
          # Build the JSON payload first, then send it to the webhook.
          PAYLOAD="{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Dependabot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}"
          curl -X POST \
            -H 'Content-type: application/json' \
            --data "$PAYLOAD" \
            $SLACK_WEBHOOK


================================================
FILE: .github/workflows/force-draft-pr.yml
================================================
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

name: Force Draft PR

on:
  pull_request_target:
    types: [opened]
    branches:
      - main

permissions:
  pull-requests: write

jobs:
  # Converts a newly opened, non-draft PR back to draft and posts a comment
  # explaining the draft-first policy.
  force-draft:
    runs-on: ubuntu-latest
    # Skip PRs that are already drafts, and never run outside the upstream repo.
    if: ${{ !github.event.pull_request.draft && github.repository == 'NVIDIA/Megatron-LM' }}
    steps:
      # Note: this step authenticates with the PAT secret, while the comment
      # step below uses the default workflow token.
      - name: Convert PR to draft
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          gh pr ready --undo ${{ github.event.pull_request.number }} --repo ${{ github.repository }}

      # The comment body is a multi-line shell string; its text and indentation
      # are posted as written.
      - name: Add comment explaining draft policy
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --body \
            "This PR has been automatically converted to **draft** because all PRs must start as drafts.

          When you are ready for review, click **Ready for Review** to begin the review process. This will:
          1. Add the oncall reviewer (optional reviewer)
          2. Add required review teams based on your changes

          See the [contribution guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/developer/submit.md) for more details."


================================================
FILE: .github/workflows/install-test.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This workflow verifies that the basic install works across all supported platforms.
# For basic install, all imports need to either be successful or appropriately guarded.

name: Installation Test

on:
  push:
    branches:
      - dev
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]

jobs:
  # Calls the shared pre-flight reusable workflow; the other jobs in this file
  # read its outputs. Only runs in the upstream repository.
  pre-flight:
    if: ${{ github.repository == 'NVIDIA/Megatron-LM' }}
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  # Installs megatron-core with pip inside the NGC PyTorch container and checks
  # that `megatron.core` can be imported.
  pip-test-pytorch:
    needs: [pre-flight]
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: linux-amd64-cpu16
    name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
    container:
      image: nvcr.io/nvidia/pytorch:25.05-py3
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      # Values appended to $GITHUB_ENV only become visible in later steps, so
      # the CUDA bin directory is written literally instead of via $CUDA_HOME
      # (which this step has only just queued for export).
      - name: Set PATH
        run: |
          echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
          echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV"
          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV"
          echo "PATH=$HOME/.local/bin:$PATH:/usr/local/cuda/bin" | tee -a "$GITHUB_ENV"
          echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"

      - name: Install megatron-core
        shell: bash -x -e -u -o pipefail {0}
        run: bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }}

      - name: Checkout check-imports
        uses: actions/checkout@v6
        with:
          repository: NVIDIA-NeMo/FW-CI-templates
          ref: v0.63.2
          path: FW-CI-templates

      - name: Check imports for megatron-core
        uses: ./FW-CI-templates/.github/actions/check-imports
        with:
          package-name: megatron.core
          python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python

  # Installs the project with uv inside the NGC PyTorch container. The import
  # check is currently disabled (see the commented-out steps at the end).
  uv-test-pytorch:
    needs: [pre-flight]
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: linux-amd64-cpu16
    name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
    container:
      image: nvcr.io/nvidia/pytorch:25.05-py3
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      # Values appended to $GITHUB_ENV only become visible in later steps, so
      # the CUDA bin directory is written literally instead of via $CUDA_HOME
      # (which this step has only just queued for export).
      - name: Set PATH
        run: |
          echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "VIRTUAL_ENV=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
          echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV"
          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV"
          echo "PATH=$HOME/.local/bin:$PATH:/usr/local/cuda/bin" | tee -a "$GITHUB_ENV"
          echo "CUDACXX=/usr/local/cuda/bin/nvcc" | tee -a "$GITHUB_ENV"
          echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"

      - name: Install project
        shell: bash
        run: bash docker/common/install.sh --environment dev --base-image pytorch --use-uv

      # NGC PyTorch 25.05 has a version of triton that is broken on CPU only machines.
      # - name: Checkout check-imports
      #   uses: actions/checkout@v6
      #   with:
      #     repository: NVIDIA-NeMo/FW-CI-templates
      #     ref: v0.63.2
      #     path: FW-CI-templates

      # - name: Check imports for megatron-core
      #   uses: ./FW-CI-templates/.github/actions/check-imports
      #   with:
      #     package-name: megatron.core
      #     python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python

  # Summary gate for this workflow: passes when every completed job of the run
  # succeeded, or when skipping is explicitly allowed (docs-only, deployment or
  # merge-group runs).
  install-test-summary:
    needs: [pre-flight, pip-test-pytorch, uv-test-pytorch]
    runs-on: ubuntu-latest
    name: Install test summary
    # The group ends in `|| always()`, so the effective condition is
    # `!cancelled()` plus the repository check.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
        run: |
          # Count completed jobs whose conclusion is anything but "success",
          # using the RUN_ID declared in this step's env. If the query itself
          # fails, fall back to 0 by assigning the variable (a bare `echo 0`
          # would only print to the log).
          FAILED_JOBS=$(gh run view "$RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All previous jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed job(s)"
              # Show which jobs failed
              gh run view "$RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
              exit 1
          fi


================================================
FILE: .github/workflows/multi-approval-bot.yml
================================================
name: "Codeowners Approval Workflow"

on:
  push:
    branches:
      - "pull-request/[0-9]+"
  merge_group:
    types: [checks_requested]

jobs:
  # Calls the shared pre-flight reusable workflow; the other jobs in this file
  # read its outputs. Only runs in the upstream repository.
  pre-flight:
    if: ${{ github.repository == 'NVIDIA/Megatron-LM' }}
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  # Runs the codeowner multi-approval action against the PR behind this push,
  # except for docs-only changes, merge-group runs and deployment workflows.
  codeowners-approval:
    runs-on: ubuntu-latest
    needs:
      - pre-flight
    if: |
      needs.pre-flight.outputs.docs_only != 'true'
      && needs.pre-flight.outputs.is_merge_group != 'true'
      && needs.pre-flight.outputs.is_deployment_workflow != 'true'
    steps:
      # PR metadata is only looked up for `pull-request/<n>` branches.
      - id: get-pr-info
        name: Get PR info
        uses: nv-gha-runners/get-pr-info@main
        if: startsWith(github.ref, 'refs/heads/pull-request/')

      - name: Checkout action
        uses: actions/checkout@v6
        with:
          path: codeowner-multi-approval-action
          ref: v0.1
          repository: noamelf/codeowner-multi-approval-action

      # Falls back to an empty JSON object when no PR info was produced.
      - name: Check Codeowners Approval
        uses: ./codeowner-multi-approval-action
        with:
          github-token: ${{ secrets.PAT }}
          repo-name: ${{ github.repository }}
          pr-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}

  # Summary gate for this workflow: passes when every completed job of the run
  # succeeded, or when skipping is explicitly allowed (docs-only, deployment,
  # merge-group or CI-workload runs).
  multi-approval-bot-summary:
    needs: [pre-flight, codeowners-approval]
    # The group ends in `|| always()`, so the effective condition is
    # `!cancelled()` plus the repository check.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && github.repository == 'NVIDIA/Megatron-LM'
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
        run: |
          # Count completed jobs whose conclusion is anything but "success".
          # If the query itself fails, fall back to 0 by assigning the variable
          # (a bare `echo 0` would only print to the log).
          FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All previous jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed job(s)"
              # Show which jobs failed
              gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
              exit 1
          fi


================================================
FILE: .github/workflows/oncall-assign.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workflow: runs the on-call manager script's `assign` command for a pull
# request when it is marked ready for review against `main`.
name: Oncall Assign

on:
  # Only the `ready_for_review` activity on PRs whose base branch is `main`
  # triggers this workflow.
  pull_request_target:
    types: [ready_for_review]
    branches:
      - main

# Workflow token scopes: write access to pull requests, read-only contents.
permissions:
  pull-requests: write
  contents: read

jobs:
  assign-reviewer:
    runs-on: ubuntu-latest
    # Do nothing for PRs that the event payload still flags as draft.
    if: ${{ !github.event.pull_request.draft }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.10'

      # Unpinned install; presumably these are the packages imported by
      # oncall_manager.py — TODO confirm against the script.
      - name: Install dependencies
        run: pip install requests slack-sdk

      # Calls the script's `assign` subcommand with the number of the PR that
      # triggered this run.
      - name: Assign Reviewer
        env:
          # The script receives the PAT secret (not the default workflow token) as GH_TOKEN.
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          python .github/scripts/oncall_manager.py assign --pr ${{ github.event.pull_request.number }}


================================================
FILE: .github/workflows/oncall-rotation.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workflow: runs the on-call manager script's `rotate` command on a weekly
# schedule (or on manual dispatch) and pushes the updated
# `.github/oncall_schedule.json` to `main`.
name: Oncall Rotation

on:
  schedule:
    # Runs at 09:00 UTC every Wednesday
    - cron: "0 9 * * 3"
  workflow_dispatch:

# The job commits and pushes a file, hence write access to repository contents.
permissions:
  contents: write

jobs:
  rotate-schedule:
    runs-on: ubuntu-latest
    steps:
      # Checkout is done with the PAT secret so the later `git push` uses that credential.
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          token: ${{ secrets.PAT }}

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"

      - name: Rotate Schedule
        env:
          # Token to read org team members. Needs read:org scope.
          GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }}
          # Slack token for updating the Slack usergroup
          SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }}
        run: |
          pip install --no-cache-dir "uv<0.9.29"
          uv venv .venv
          uv cache clean
          uv sync --no-cache
          uv run --with slack-sdk python .github/scripts/oncall_manager.py rotate

      - name: Commit and Push changes
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add .github/oncall_schedule.json
          # Detect the "nothing staged" case explicitly instead of swallowing every
          # `git commit` failure, and do not pull/push when there is nothing new.
          if git diff --cached --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          git commit -m "chore: rotate oncall schedule"
          # Rebase onto the latest remote state before pushing to main.
          git pull --rebase
          git push origin HEAD:main


================================================
FILE: .github/workflows/release-docs.yml
================================================
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Workflow: builds the documentation through a shared reusable workflow and
# then publishes it. It can be started manually (workflow_dispatch) or invoked
# from another workflow (workflow_call).
name: Release docs
on:
  # Manual trigger. Declares the same inputs as `workflow_call` below, except
  # `build-docs-ref`, which only exists for workflow_call callers.
  workflow_dispatch:
    inputs:
      dry-run:
        description: Whether to run the workflow in dry-run mode
        required: true
        type: boolean
        default: true
      publish-as-latest:
        description: Publish as Latest stable version.
        required: false
        type: boolean
        default: true
      docs-version-override:
        description: Docs version if commit is not tagged
        required: false
        type: string
        default: ""
      update-version-picker:
        description: Update version picker.
        required: false
        type: boolean
        default: true
      notify-emails:
        description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
        required: false
        type: string
  # Reusable-workflow entry point for other workflows.
  workflow_call:
    inputs:
      dry-run:
        description: Whether to run the workflow in dry-run mode
        required: true
        type: boolean
        default: true
      publish-as-latest:
        description: Publish as Latest stable version.
        required: false
        type: boolean
        default: true
      docs-version-override:
        description: Docs version if commit is not tagged
        required: false
        type: string
        default: ""
      update-version-picker:
        description: Update version picker.
        required: false
        type: boolean
        default: true
      notify-emails:
        description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
        required: false
        type: string
      # Git ref the docs are built from; defaults to the commit of the run.
      build-docs-ref:
        description: Reference to build the docs from
        required: false
        type: string
        default: ${{ github.sha }}

jobs:
  # Delegates the docs build to the shared `_build_docs.yml` workflow, pinned at v0.67.0.
  # NOTE(review): `build-docs-ref` is not declared for workflow_dispatch, so on a
  # manual run `inputs.build-docs-ref` presumably evaluates to empty and the
  # called workflow picks its own default ref — confirm.
  # NOTE(review): the publish-docs job checks out the same templates repository
  # at v0.74.0 — confirm the differing pins are intentional.
  build-docs:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.67.0
    with:
      ref: ${{ inputs.build-docs-ref }}

  publish-docs:
    runs-on: ubuntu-latest
    needs: [build-docs]
    steps:
      - uses: actions/checkout@v6
        with:
          repository: NVIDIA-NeMo/FW-CI-templates
          ref: v0.74.0

Download .txt

gitextract_32wjwf3g/

├── .coderabbit.yaml
├── .flake8
├── .github/
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   ├── feature_request.md
│   │   ├── question.md
│   │   └── regression.md
│   ├── actions/
│   │   ├── action.yml
│   │   └── check-nvidia-sso-membership/
│   │       └── action.yml
│   ├── copy-pr-bot.yaml
│   ├── oncall_schedule.json
│   ├── pull_request_template.md
│   ├── scripts/
│   │   ├── oncall_manager.py
│   │   ├── readme.sh
│   │   └── sync_team_usergroups.py
│   └── workflows/
│       ├── _build_test_publish_wheel.yml
│       ├── _release_library.yml
│       ├── _update_dependencies.yml
│       ├── auto-assign-milestone.yml
│       ├── auto-reminder-bot.yml
│       ├── auto-swap-labels.yml
│       ├── auto-update-copy-pr-bot.yml
│       ├── build-docs.yml
│       ├── build-test-publish-wheel.yml
│       ├── cherry-pick-release-commit.yml
│       ├── cicd-approve-test-queue.yml
│       ├── cicd-main.yml
│       ├── claude-complexity-label.yml
│       ├── claude_review.yml
│       ├── close-inactive-issue-pr.yml
│       ├── community-bot.yml
│       ├── config/
│       │   └── changelog-config.json
│       ├── copyright-check.yml
│       ├── dependabot.yml
│       ├── force-draft-pr.yml
│       ├── install-test.yml
│       ├── multi-approval-bot.yml
│       ├── oncall-assign.yml
│       ├── oncall-rotation.yml
│       ├── release-docs.yml
│       ├── release-freeze.yml
│       ├── release-nightly-docs.yml
│       ├── release.yaml
│       ├── review-trigger.yml
│       ├── sync-team-usergroups.yml
│       └── trigger-mbridge-tests.yml
├── .gitignore
├── .gitlab/
│   ├── labeler-config.yml
│   ├── scripts/
│   │   ├── build.sh
│   │   ├── check_imports.py
│   │   └── fetch-legacy-suite.sh
│   └── stages/
│       ├── 00.pre.yml
│       ├── 01.build.yml
│       ├── 02.test.yml
│       ├── 03.integration-tests.yml
│       ├── 04.functional-tests.yml
│       └── 05.publish.yml
├── .gitlab-ci.yml
├── .pre-commit-config.yaml
├── .pylintrc
├── .python-version
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── codecov.yml
├── docker/
│   ├── .ngc_version.dev
│   ├── .ngc_version.lts
│   ├── Dockerfile.ci.dev
│   ├── Dockerfile.ci.nemo
│   ├── Dockerfile.linting
│   ├── common/
│   │   ├── install.sh
│   │   └── install_source_wheels.sh
│   └── patches/
│       └── deepep.patch
├── docs/
│   ├── add_copyright_header.py
│   ├── advanced/
│   │   └── index.md
│   ├── api-backwards-compatibility-check.md
│   ├── api-guide/
│   │   ├── core/
│   │   │   ├── datasets.md
│   │   │   ├── dist_checkpointing.md
│   │   │   ├── dist_checkpointing.strategies.md
│   │   │   ├── distributed.md
│   │   │   ├── fusions.md
│   │   │   ├── index.md
│   │   │   ├── pipeline_parallel.md
│   │   │   ├── tensor_parallel.md
│   │   │   └── transformer.md
│   │   ├── index.md
│   │   ├── internal/
│   │   │   ├── index.md
│   │   │   ├── num_microbatches_calculator.md
│   │   │   └── optimizer_param_scheduler.md
│   │   ├── models/
│   │   │   ├── index.md
│   │   │   ├── models.bert.md
│   │   │   ├── models.gpt.md
│   │   │   ├── models.md
│   │   │   └── models.t5.md
│   │   └── router_replay.md
│   ├── autodoc2_docstrings_parser.py
│   ├── broken_links_false_positives.json
│   ├── conf.py
│   ├── developer/
│   │   ├── contribute.md
│   │   ├── generate_docs.md
│   │   ├── oncall.md
│   │   └── submit.md
│   ├── discussions/
│   │   ├── README.md
│   │   └── megatron-fsdp-user-guide/
│   │       ├── example-scripts/
│   │       │   ├── sbatch_checkpoint_convert.sh
│   │       │   └── sbatch_mfsdp_deepseek_v3.sh
│   │       └── megatron-fsdp-user-guide.md
│   ├── documentation.md
│   ├── get-started/
│   │   ├── install.md
│   │   ├── overview.md
│   │   ├── quickstart.md
│   │   └── releasenotes.md
│   ├── index.md
│   ├── llama_mistral.md
│   ├── models/
│   │   ├── index.md
│   │   ├── llms.md
│   │   └── multimodal.md
│   ├── project.json
│   ├── user-guide/
│   │   ├── data-preparation.md
│   │   ├── features/
│   │   │   ├── context_parallel.md
│   │   │   ├── custom_fsdp.md
│   │   │   ├── dist_optimizer.md
│   │   │   ├── fine_grained_activation_offloading.md
│   │   │   ├── index.md
│   │   │   ├── megatron_energon.md
│   │   │   ├── megatron_rl.md
│   │   │   ├── moe.md
│   │   │   ├── multi_latent_attention.md
│   │   │   ├── multi_token_prediction.md
│   │   │   ├── optimizer_cpu_offload.md
│   │   │   ├── pipeline_parallel_layout.md
│   │   │   └── tokenizers.md
│   │   ├── index.md
│   │   ├── msc_integration.md
│   │   ├── parallelism-guide.md
│   │   └── training-examples.md
│   └── versions1.json
├── examples/
│   ├── __init__.py
│   ├── academic_paper_scripts/
│   │   ├── detxoify_lm/
│   │   │   ├── README.md
│   │   │   ├── annotations/
│   │   │   │   ├── filter-selfgeneration.py
│   │   │   │   ├── perspective_api_annotate.py
│   │   │   │   └── preprocess.sh
│   │   │   ├── finetune_gpt.py
│   │   │   ├── finetune_gpt_distributed-1.3b.sh
│   │   │   ├── generate-1.3b.sh
│   │   │   ├── generate_samples_gpt.py
│   │   │   ├── perspective_api.py
│   │   │   └── self_generation/
│   │   │       └── selfgenerate-1.3b-unconditional.sh
│   │   ├── msdp/
│   │   │   ├── README.md
│   │   │   ├── data_processing.sh
│   │   │   ├── eval_knwl_generation.sh
│   │   │   ├── eval_resp_generation.sh
│   │   │   ├── prep_resp_gen.sh
│   │   │   ├── prompt_knwl_gen.sh
│   │   │   └── prompt_resp_gen.sh
│   │   └── sc21/
│   │       ├── CONFIG.sh
│   │       ├── README.md
│   │       ├── SBATCH.sh
│   │       ├── SRUN.sh
│   │       ├── run_figure_11.sh
│   │       ├── run_figure_12.sh
│   │       ├── run_figure_13.sh
│   │       ├── run_figure_14.sh
│   │       ├── run_figure_15.sh
│   │       ├── run_figure_16.sh
│   │       ├── run_figure_17.sh
│   │       ├── run_figure_18.sh
│   │       └── run_table_1.sh
│   ├── bert/
│   │   ├── README.md
│   │   └── train_bert_340m_distributed.sh
│   ├── export/
│   │   ├── README.md
│   │   └── trtllm_export/
│   │       ├── README.md
│   │       ├── distributed_export/
│   │       │   └── gpt_distributed_gpu_export.py
│   │       └── single_device_export/
│   │           └── gpt_single_device_cpu_export.py
│   ├── gpt3/
│   │   ├── README.md
│   │   ├── gpt_config.yaml
│   │   └── train_gpt3_175b_distributed.sh
│   ├── gptoss/
│   │   ├── 01_convert_from_hf.py
│   │   ├── 02_train.sh
│   │   ├── 03_convert_to_hf.py
│   │   └── README.md
│   ├── inference/
│   │   ├── README.md
│   │   ├── gpt/
│   │   │   ├── gpt_dynamic_inference.py
│   │   │   ├── gpt_dynamic_inference_12b.sh
│   │   │   ├── gpt_dynamic_inference_357m.sh
│   │   │   ├── gpt_dynamic_inference_with_coordinator.py
│   │   │   ├── gpt_static_inference.py
│   │   │   └── utils.py
│   │   ├── llama_mistral/
│   │   │   ├── huggingface_reference.py
│   │   │   ├── run_static_inference_llama4_scout.sh
│   │   │   ├── run_text_generation_llama3.1.sh
│   │   │   ├── run_text_generation_llama3.sh
│   │   │   └── run_text_generation_mistral.sh
│   │   ├── run_text_generation_server_345M.sh
│   │   ├── run_text_generation_server_345M_8_tensor_parallel.sh
│   │   └── t5/
│   │       └── simple_t5_batch_inference.py
│   ├── llama/
│   │   ├── README.md
│   │   └── train_llama3_8b_h100_fp8.sh
│   ├── mamba/
│   │   ├── .gitignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── run_text_gen_server_8b.sh
│   │   ├── run_text_gen_server_8b_gpt3.sh
│   │   └── train.sh
│   ├── mimo/
│   │   ├── __init__.py
│   │   ├── avlm_inference.py
│   │   ├── configs/
│   │   │   ├── llava_avlm.py
│   │   │   ├── llava_vlm.py
│   │   │   └── mock.py
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── avlm_sample_loader.py
│   │   │   ├── energon_avlm_task_encoder.py
│   │   │   ├── energon_vlm_task_encoder.py
│   │   │   ├── mock.py
│   │   │   ├── prepare_video_llava_data.py
│   │   │   └── utils/
│   │   │       └── calculate_audio_tokens.py
│   │   ├── model_providers/
│   │   │   ├── __init__.py
│   │   │   ├── hf_clip_encoder.py
│   │   │   ├── hf_whisper_encoder.py
│   │   │   ├── llava_avlm.py
│   │   │   ├── llava_vlm.py
│   │   │   └── mock.py
│   │   ├── scripts/
│   │   │   ├── run_avlm_train.sh
│   │   │   ├── run_mock_train.sh
│   │   │   ├── run_video_vlm_train.sh
│   │   │   └── run_vlm_train.sh
│   │   ├── train.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── data_helpers.py
│   │       ├── logging.py
│   │       └── model_helpers.py
│   ├── mixtral/
│   │   ├── README.md
│   │   └── train_mixtral_8x7b_distributed.sh
│   ├── multimodal/
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── combine_lm_vision_checkpoints.sh
│   │   ├── combine_state_dicts.py
│   │   ├── config.py
│   │   ├── convert_llava_pretrain_to_wds.py
│   │   ├── dataloader_provider.py
│   │   ├── dataset_helpers.py
│   │   ├── energon_util.py
│   │   ├── evaluation/
│   │   │   ├── evaluate_ai2d.py
│   │   │   ├── evaluate_chartqa.py
│   │   │   ├── evaluate_coco.py
│   │   │   ├── evaluate_infovqa.py
│   │   │   ├── evaluate_mathvista.py
│   │   │   ├── evaluate_mmmu.py
│   │   │   ├── evaluate_ocrbench.py
│   │   │   ├── evaluate_ocrbench_v2.py
│   │   │   ├── evaluate_rd_tablebench.py
│   │   │   ├── evaluate_realworldqa.py
│   │   │   ├── evaluate_spdocvqa.py
│   │   │   ├── evaluate_textvqa.py
│   │   │   ├── evaluate_video_motionbench.py
│   │   │   ├── evaluate_video_mvbench.py
│   │   │   ├── evaluate_video_phys_game_bench.py
│   │   │   ├── evaluate_vqav2.py
│   │   │   ├── evaluation_datasets.py
│   │   │   └── mmmu_utils.py
│   │   ├── image_processing.py
│   │   ├── layer_scaling.py
│   │   ├── layer_specs.py
│   │   ├── llama_3p1_nemotron_nano_vl_8b_v1/
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   ├── pretraining_llama_3p1_nemotron_nano_vl_8b_v1.sh
│   │   │   ├── sft_llama_3p1_nemotron_nano_vl_8b_v1.sh
│   │   │   └── text_generation.sh
│   │   ├── manual_prompts.json
│   │   ├── model.py
│   │   ├── model_converter/
│   │   │   ├── clip_converter.py
│   │   │   ├── internvit_converter.py
│   │   │   ├── radio_converter.py
│   │   │   ├── siglip_converter.py
│   │   │   └── vision_model_tester.py
│   │   ├── multimodal_args.py
│   │   ├── nvlm/
│   │   │   ├── README.md
│   │   │   ├── internvit.py
│   │   │   ├── nvlm_prompts.json
│   │   │   ├── pp_checkpoint_converter.py
│   │   │   ├── pretrain_blend.yaml
│   │   │   ├── pretrain_qwen20_72b_internvit_6b.sh
│   │   │   ├── pretrain_yi_34b_internvit_6b.sh
│   │   │   ├── run_text_generation_qwen20_72b_internvit_6b.sh
│   │   │   ├── run_text_generation_qwen25_7b_internvit_video.sh
│   │   │   ├── run_text_generation_qwen25_7b_siglip.sh
│   │   │   ├── run_text_generation_yi_34b_internvit_6b.sh
│   │   │   ├── sft_34b_internvit.sh
│   │   │   ├── sft_blend.yaml
│   │   │   ├── sft_qwen20_72b_internvit_6b.sh
│   │   │   └── sft_qwen2p5_7b_internvit_6b_video.sh
│   │   ├── pretrain_dataset.yaml
│   │   ├── pretrain_mistral_clip.sh
│   │   ├── radio/
│   │   │   └── radio_g.py
│   │   ├── run_text_generation.py
│   │   ├── sft_dataset.yaml
│   │   ├── sft_mistral_clip.sh
│   │   ├── text_generation_mistral_clip.sh
│   │   └── train.py
│   ├── post_training/
│   │   └── modelopt/
│   │       ├── .gitignore
│   │       ├── ADVANCED.md
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       ├── conf/
│   │       │   ├── Qwen/
│   │       │   │   ├── Qwen2.5-0.5B-Instruct.sh
│   │       │   │   ├── Qwen2.5-7B-Instruct.sh
│   │       │   │   ├── Qwen3-0.6B.sh
│   │       │   │   ├── Qwen3-235B-A22B.sh
│   │       │   │   ├── Qwen3-30B-A3B.sh
│   │       │   │   └── Qwen3-8B.sh
│   │       │   ├── arguments.sh
│   │       │   ├── deepseek-ai/
│   │       │   │   ├── DeepSeek-R1.sh
│   │       │   │   └── DeepSeek-V2-Lite.sh
│   │       │   ├── meta-llama/
│   │       │   │   ├── Llama-3.1-8B-Instruct.sh
│   │       │   │   ├── Llama-3.2-1B-Instruct.sh
│   │       │   │   ├── Llama-4-Maverick-17B-128E-Instruct.sh
│   │       │   │   └── Llama-4-Scout-17B-16E-Instruct.sh
│   │       │   ├── moonshotai/
│   │       │   │   ├── Kimi-K2-Instruct.sh
│   │       │   │   ├── kimi_k2_instruct.sh
│   │       │   │   └── kimi_k2_instruct_export.sh
│   │       │   ├── nvidia/
│   │       │   │   ├── NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.sh
│   │       │   │   ├── NVIDIA-Nemotron-3-Super-120B-A12B-BF16.sh
│   │       │   │   ├── NVIDIA-Nemotron-Nano-9B-v2.sh
│   │       │   │   ├── Nemotron-H-47B-Reasoning-128K.sh
│   │       │   │   ├── Nemotron-H-4B-Instruct.sh
│   │       │   │   ├── Nemotron-H-56B-Base-8K.sh
│   │       │   │   ├── Nemotron-H-8B-Base-8K.sh
│   │       │   │   └── Nemotron-Mini-4B-Instruct.sh
│   │       │   └── openai/
│   │       │       ├── gpt-oss-120b.sh
│   │       │       └── gpt-oss-20b.sh
│   │       ├── convert.sh
│   │       ├── convert_model.py
│   │       ├── distillation.md
│   │       ├── eagle3.sh
│   │       ├── export.py
│   │       ├── export.sh
│   │       ├── finetune.py
│   │       ├── finetune.sh
│   │       ├── generate.py
│   │       ├── generate.sh
│   │       ├── generation_server.sh
│   │       ├── mmlu.py
│   │       ├── mmlu.sh
│   │       ├── offline_feature_extract.py
│   │       ├── offline_feature_extract.sh
│   │       ├── prune.py
│   │       ├── prune.sh
│   │       ├── quantize.py
│   │       ├── quantize.sh
│   │       ├── requirements.txt
│   │       ├── requirements_ssm.txt
│   │       ├── slurm/
│   │       │   ├── env_setup_template.sh
│   │       │   └── sbatch.sh
│   │       ├── speculative.md
│   │       ├── train.sh
│   │       ├── validate.py
│   │       └── validate.sh
│   ├── rl/
│   │   ├── README.md
│   │   ├── benchmark_refit.py
│   │   ├── environment_configs/
│   │   │   ├── countdown.yaml
│   │   │   ├── dapo.yaml
│   │   │   ├── default.yaml
│   │   │   ├── gsm8k.yaml
│   │   │   ├── gsm8k_nanov3.yaml
│   │   │   ├── math.yaml
│   │   │   └── openmathinstructv2.yaml
│   │   ├── environments/
│   │   │   ├── __init__.py
│   │   │   ├── countdown/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── countdown.py
│   │   │   │   └── countdown_agent.py
│   │   │   └── math/
│   │   │       ├── __init__.py
│   │   │       ├── aime_agent.py
│   │   │       ├── bigmath_agent.py
│   │   │       ├── dapo_agent.py
│   │   │       ├── gsm8k_agent.py
│   │   │       ├── math_agent.py
│   │   │       └── openmath_agent.py
│   │   └── model_configs/
│   │       ├── common.sh
│   │       ├── llama3p1_8b_instruct.sh
│   │       ├── nemotron5_56b.sh
│   │       ├── nemotron5_8b.sh
│   │       ├── nemotron5p5_12b_H.sh
│   │       ├── nemotron6_3b_moe.sh
│   │       ├── qwen3_30b_a3b_moe.sh
│   │       ├── qwen3_32b.sh
│   │       ├── qwen3_4b.sh
│   │       ├── qwen3_8b.sh
│   │       ├── qwen_2p5_32b.sh
│   │       ├── qwen_2p5_3b.sh
│   │       ├── qwen_2p5_distill_7b.sh
│   │       └── qwen_2p5_math_7b.sh
│   ├── run_simple_mcore_train_loop.py
│   └── t5/
│       ├── README.md
│       └── train_t5_220m_distributed.sh
├── gpt_builders.py
├── greptile.json
├── mamba_builders.py
├── megatron/
│   ├── core/
│   │   ├── MSC_Integration.md
│   │   ├── QuickStart.md
│   │   ├── README.md
│   │   ├── README_STRAGGLER.md
│   │   ├── __init__.py
│   │   ├── _rank_utils.py
│   │   ├── activations.py
│   │   ├── config.py
│   │   ├── config_logger.py
│   │   ├── datasets/
│   │   │   ├── Makefile
│   │   │   ├── __init__.py
│   │   │   ├── bert_dataset.py
│   │   │   ├── blended_dataset.py
│   │   │   ├── blended_megatron_dataset_builder.py
│   │   │   ├── blended_megatron_dataset_config.py
│   │   │   ├── data_schedule.py
│   │   │   ├── gpt_dataset.py
│   │   │   ├── helpers.cpp
│   │   │   ├── helpers.py
│   │   │   ├── indexed_dataset.py
│   │   │   ├── masked_dataset.py
│   │   │   ├── megatron_dataset.py
│   │   │   ├── multimodal_dataset.py
│   │   │   ├── object_storage_utils.py
│   │   │   ├── readme.md
│   │   │   ├── t5_dataset.py
│   │   │   ├── utils.py
│   │   │   └── utils_s3.py
│   │   ├── dist_checkpointing/
│   │   │   ├── __init__.py
│   │   │   ├── core.py
│   │   │   ├── dict_utils.py
│   │   │   ├── exchange_utils.py
│   │   │   ├── mapping.py
│   │   │   ├── optimizer.py
│   │   │   ├── serialization.py
│   │   │   ├── state_dict_utils.py
│   │   │   ├── strategies/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── async_utils.py
│   │   │   │   ├── base.py
│   │   │   │   ├── cached_metadata_filesystem_reader.py
│   │   │   │   ├── checkpointable.py
│   │   │   │   ├── common.py
│   │   │   │   ├── filesystem_async.py
│   │   │   │   ├── fully_parallel.py
│   │   │   │   ├── state_dict_saver.py
│   │   │   │   └── torch.py
│   │   │   ├── tensor_aware_state_dict.py
│   │   │   ├── utils.py
│   │   │   └── validation.py
│   │   ├── distributed/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── data_parallel_base.py
│   │   │   ├── distributed_data_parallel.py
│   │   │   ├── distributed_data_parallel_config.py
│   │   │   ├── finalize_model_grads.py
│   │   │   ├── fsdp/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mcore_fsdp_adapter.py
│   │   │   │   └── src/
│   │   │   │       ├── README.md
│   │   │   │       ├── __init__.py
│   │   │   │       ├── megatron_fsdp/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── distributed_data_parallel_config.py
│   │   │   │       │   ├── fully_shard.py
│   │   │   │       │   ├── megatron_fsdp.py
│   │   │   │       │   ├── mixed_precision.py
│   │   │   │       │   ├── package_info.py
│   │   │   │       │   ├── param_and_grad_buffer.py
│   │   │   │       │   ├── uneven_dtensor.py
│   │   │   │       │   └── utils.py
│   │   │   │       └── pyproject.toml
│   │   │   ├── param_and_grad_buffer.py
│   │   │   ├── reduce_scatter_with_fp32_accumulation.py
│   │   │   ├── torch_fully_sharded_data_parallel.py
│   │   │   └── torch_fully_sharded_data_parallel_config.py
│   │   ├── energy_monitor.py
│   │   ├── enums.py
│   │   ├── export/
│   │   │   ├── __init__.py
│   │   │   ├── data_type.py
│   │   │   ├── export_config.py
│   │   │   ├── model_type.py
│   │   │   └── trtllm/
│   │   │       ├── __init__.py
│   │   │       ├── engine_builder/
│   │   │       │   ├── __init__.py
│   │   │       │   └── trtllm_engine_builder.py
│   │   │       ├── model_to_trllm_mapping/
│   │   │       │   ├── __init__.py
│   │   │       │   └── default_conversion_dict.py
│   │   │       ├── trt_model_config.py
│   │   │       ├── trt_model_type.py
│   │   │       ├── trtllm_helper.py
│   │   │       ├── trtllm_layers.py
│   │   │       └── trtllm_weights_converter/
│   │   │           ├── __init__.py
│   │   │           ├── distributed_trtllm_model_weights_converter.py
│   │   │           ├── single_device_trtllm_model_weights_converter.py
│   │   │           └── utils.py
│   │   ├── extensions/
│   │   │   ├── TransformerEngineMixedPrecision.md
│   │   │   ├── __init__.py
│   │   │   ├── kitchen.py
│   │   │   ├── transformer_engine.py
│   │   │   └── transformer_engine_spec_provider.py
│   │   ├── fp4_utils.py
│   │   ├── fp8_utils.py
│   │   ├── full_cuda_graph.py
│   │   ├── fusions/
│   │   │   ├── __init__.py
│   │   │   ├── fused_bias_dropout.py
│   │   │   ├── fused_bias_geglu.py
│   │   │   ├── fused_bias_gelu.py
│   │   │   ├── fused_bias_swiglu.py
│   │   │   ├── fused_cross_entropy.py
│   │   │   ├── fused_indices_converter.py
│   │   │   ├── fused_layer_norm.py
│   │   │   ├── fused_mla_yarn_rope_apply.py
│   │   │   ├── fused_pad_routing_map.py
│   │   │   ├── fused_softmax.py
│   │   │   └── fused_weighted_squared_relu.py
│   │   ├── hyper_comm_grid.py
│   │   ├── inference/
│   │   │   ├── __init__.py
│   │   │   ├── async_stream.py
│   │   │   ├── batch_dimensions_utils.py
│   │   │   ├── common_inference_params.py
│   │   │   ├── communication/
│   │   │   │   └── torch_symm_triton/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── barrier.py
│   │   │   │       ├── collectives.py
│   │   │   │       ├── fused_collectives.py
│   │   │   │       ├── multimem_asm.py
│   │   │   │       └── utils.py
│   │   │   ├── communication_utils.py
│   │   │   ├── config.py
│   │   │   ├── contexts/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention_context/
│   │   │   │   │   ├── mamba_metadata.py
│   │   │   │   │   ├── metadata_base.py
│   │   │   │   │   ├── mha_metadata.py
│   │   │   │   │   └── triton/
│   │   │   │   │       └── tensor_ops.py
│   │   │   │   ├── base_context.py
│   │   │   │   ├── dynamic_context.py
│   │   │   │   ├── fused_kv_append_kernel.py
│   │   │   │   ├── kv_block_allocator.py
│   │   │   │   ├── mamba_slot_allocator.py
│   │   │   │   ├── routing_metadata.py
│   │   │   │   └── static_context.py
│   │   │   ├── data_parallel_inference_coordinator.py
│   │   │   ├── engines/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── abstract_engine.py
│   │   │   │   ├── async_zmq_communicator.py
│   │   │   │   ├── dynamic_engine.py
│   │   │   │   ├── mcore_engine.py
│   │   │   │   └── static_engine.py
│   │   │   ├── headers.py
│   │   │   ├── inference_client.py
│   │   │   ├── inference_request.py
│   │   │   ├── model_inference_wrappers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── abstract_model_inference_wrapper.py
│   │   │   │   ├── gpt/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── gpt_inference_wrapper.py
│   │   │   │   ├── multimodal/
│   │   │   │   │   └── vlm_inference_wrapper.py
│   │   │   │   └── t5/
│   │   │   │       ├── __init__.py
│   │   │   │       └── t5_inference_wrapper.py
│   │   │   ├── moe/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activations.py
│   │   │   │   ├── fused_moe.py
│   │   │   │   ├── pad.py
│   │   │   │   └── permute.py
│   │   │   ├── quantization/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mxfp8_quantize.py
│   │   │   │   ├── mxfp8_tensor.py
│   │   │   │   └── utils.py
│   │   │   ├── sampling_params.py
│   │   │   ├── scheduler.py
│   │   │   ├── symmetric_memory.py
│   │   │   ├── text_generation_controllers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── encoder_decoder_text_generation_controller.py
│   │   │   │   ├── text_generation_controller.py
│   │   │   │   └── vlm_text_generation_controller.py
│   │   │   ├── text_generation_server/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dynamic_text_gen_server/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── endpoints/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── chat_completions.py
│   │   │   │   │   │   ├── common.py
│   │   │   │   │   │   ├── completions.py
│   │   │   │   │   │   └── health.py
│   │   │   │   │   ├── text_generation_server.py
│   │   │   │   │   └── tokenization.py
│   │   │   │   ├── endpoints/
│   │   │   │   │   ├── common.py
│   │   │   │   │   └── completions.py
│   │   │   │   ├── run_mcore_engine.py
│   │   │   │   ├── text_generation_server.py
│   │   │   │   └── tokenization.py
│   │   │   ├── unified_memory.py
│   │   │   └── utils.py
│   │   ├── inference_params.py
│   │   ├── jit.py
│   │   ├── model_parallel_config.py
│   │   ├── models/
│   │   │   ├── T5/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── t5_model.py
│   │   │   │   └── t5_spec.py
│   │   │   ├── __init__.py
│   │   │   ├── backends.py
│   │   │   ├── bert/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bert_layer_specs.py
│   │   │   │   ├── bert_lm_head.py
│   │   │   │   ├── bert_model.py
│   │   │   │   └── pooler.py
│   │   │   ├── common/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── embeddings/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── language_model_embedding.py
│   │   │   │   │   ├── relative_pos_embedding.py
│   │   │   │   │   ├── rope_utils.py
│   │   │   │   │   ├── rotary_pos_embedding.py
│   │   │   │   │   └── yarn_rotary_pos_embedding.py
│   │   │   │   ├── language_module/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── language_module.py
│   │   │   │   ├── model_chunk_schedule_plan.py
│   │   │   │   └── vision_module/
│   │   │   │       ├── __init__.py
│   │   │   │       └── vision_module.py
│   │   │   ├── gpt/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── experimental_attention_variant_module_specs.py
│   │   │   │   ├── fine_grained_callables.py
│   │   │   │   ├── gpt_layer_specs.py
│   │   │   │   ├── gpt_model.py
│   │   │   │   ├── heterogeneous/
│   │   │   │   │   └── heterogeneous_layer_specs.py
│   │   │   │   └── moe_module_specs.py
│   │   │   ├── huggingface/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── clip_model.py
│   │   │   │   ├── module.py
│   │   │   │   └── qwen_model.py
│   │   │   ├── mamba/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mamba_layer_specs.py
│   │   │   │   └── mamba_model.py
│   │   │   ├── mimo/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── config/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── base_configs.py
│   │   │   │   ├── model/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── base.py
│   │   │   │   ├── partition/
│   │   │   │   │   └── utils.py
│   │   │   │   └── submodules/
│   │   │   │       ├── audio.py
│   │   │   │       ├── base.py
│   │   │   │       └── vision.py
│   │   │   ├── multimodal/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── context_parallel.py
│   │   │   │   ├── llava_model.py
│   │   │   │   └── llava_spec.py
│   │   │   └── vision/
│   │   │       ├── __init__.py
│   │   │       ├── clip_vit_model.py
│   │   │       ├── multimodal_projector.py
│   │   │       ├── radio.py
│   │   │       └── vit_layer_specs.py
│   │   ├── msc_utils.py
│   │   ├── nccl_allocator.py
│   │   ├── num_microbatches_calculator.py
│   │   ├── optimizer/
│   │   │   ├── __init__.py
│   │   │   ├── clip_grads.py
│   │   │   ├── cpu_offloading/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   └── hybrid_optimizer.py
│   │   │   ├── distrib_optimizer.py
│   │   │   ├── grad_scaler.py
│   │   │   ├── layer_wise_optimizer.py
│   │   │   ├── muon.py
│   │   │   ├── optimizer.py
│   │   │   ├── optimizer_config.py
│   │   │   └── qk_clip.py
│   │   ├── optimizer_param_scheduler.py
│   │   ├── package_info.py
│   │   ├── packed_seq_params.py
│   │   ├── parallel_state.py
│   │   ├── pipeline_parallel/
│   │   │   ├── __init__.py
│   │   │   ├── bridge_communicator.py
│   │   │   ├── combined_1f1b.py
│   │   │   ├── fine_grained_activation_offload.py
│   │   │   ├── hybrid_cp_schedule.py
│   │   │   ├── multimodule_communicator.py
│   │   │   ├── p2p_communication.py
│   │   │   ├── schedules.py
│   │   │   └── utils.py
│   │   ├── post_training/
│   │   │   ├── __init__.py
│   │   │   └── modelopt/
│   │   │       ├── __init__.py
│   │   │       ├── gpt/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── model_specs.py
│   │   │       │   └── state_dict_hooks.py
│   │   │       ├── layers.py
│   │   │       └── mamba/
│   │   │           ├── __init__.py
│   │   │           └── model_specs.py
│   │   ├── process_groups_config.py
│   │   ├── quantization/
│   │   │   ├── __init__.py
│   │   │   ├── quant_config.py
│   │   │   └── utils.py
│   │   ├── requirements.txt
│   │   ├── rerun_state_machine.py
│   │   ├── resharding/
│   │   │   ├── __init__.py
│   │   │   ├── copy_services/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── gloo_copy_service.py
│   │   │   │   ├── nccl_copy_service.py
│   │   │   │   └── nvshmem_copy_service.py
│   │   │   ├── execution.py
│   │   │   ├── nvshmem_copy_service/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── compat.py
│   │   │   │   ├── core/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── gpu_resource_manager.py
│   │   │   │   │   ├── kernel_launcher.py
│   │   │   │   │   └── pipeline_executor.py
│   │   │   │   ├── kernels/
│   │   │   │   │   └── chunked_kernel.cu
│   │   │   │   ├── logger.py
│   │   │   │   ├── memory/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── double_buffer_manager.py
│   │   │   │   │   └── tensor_pointer_utils.py
│   │   │   │   ├── nvshmem_types.py
│   │   │   │   ├── planning/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── communication_scheduler.py
│   │   │   │   │   ├── gpu_execution_planner.py
│   │   │   │   │   ├── task_segmenter.py
│   │   │   │   │   └── workload_packer.py
│   │   │   │   ├── service.py
│   │   │   │   └── validation.py
│   │   │   ├── planner.py
│   │   │   ├── refit.py
│   │   │   ├── transforms.py
│   │   │   └── utils.py
│   │   ├── safe_globals.py
│   │   ├── ssm/
│   │   │   ├── __init__.py
│   │   │   ├── gated_delta_net.py
│   │   │   ├── mamba_block.py
│   │   │   ├── mamba_context_parallel.py
│   │   │   ├── mamba_hybrid_layer_allocation.py
│   │   │   ├── mamba_layer.py
│   │   │   ├── mamba_mixer.py
│   │   │   ├── mlp_layer.py
│   │   │   ├── ops/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── causal_conv1d_triton.py
│   │   │   │   ├── causal_conv1d_varlen.py
│   │   │   │   ├── determinism.py
│   │   │   │   ├── mamba_ssm.py
│   │   │   │   ├── ssd_bmm.py
│   │   │   │   ├── ssd_chunk_scan.py
│   │   │   │   ├── ssd_chunk_state.py
│   │   │   │   ├── ssd_combined.py
│   │   │   │   └── ssd_state_passing.py
│   │   │   └── triton_cache_manager.py
│   │   ├── tensor_parallel/
│   │   │   ├── __init__.py
│   │   │   ├── cross_entropy.py
│   │   │   ├── data.py
│   │   │   ├── inference_layers.py
│   │   │   ├── layers.py
│   │   │   ├── mappings.py
│   │   │   ├── random.py
│   │   │   └── utils.py
│   │   ├── timers.py
│   │   ├── tokenizers/
│   │   │   ├── __init__.py
│   │   │   ├── base_tokenizer.py
│   │   │   ├── megatron_tokenizer.py
│   │   │   ├── text/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── libraries/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── abstract_tokenizer.py
│   │   │   │   │   ├── bytelevel_tokenizer.py
│   │   │   │   │   ├── chat_template.py
│   │   │   │   │   ├── huggingface_tokenizer.py
│   │   │   │   │   ├── megatron_hf_tokenizer.py
│   │   │   │   │   ├── null_tokenizer.py
│   │   │   │   │   ├── sentencepiece_tokenizer.py
│   │   │   │   │   ├── sft_tokenizer.py
│   │   │   │   │   └── tiktoken_tokenizer.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── bert_tokenizer.py
│   │   │   │   │   ├── default_tokenizer.py
│   │   │   │   │   ├── gpt_tokenizer.py
│   │   │   │   │   ├── mamba_tokenizer.py
│   │   │   │   │   └── t5_tokenizer.py
│   │   │   │   ├── parsers/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base_parser.py
│   │   │   │   │   ├── deepseek_r1_reasoning_parser.py
│   │   │   │   │   └── qwen3_coder_tool_parser.py
│   │   │   │   └── text_tokenizer.py
│   │   │   ├── utils/
│   │   │   │   └── build_tokenizer.py
│   │   │   └── vision/
│   │   │       ├── __init__.py
│   │   │       ├── libraries/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── multimodal_tokenizer.py
│   │   │       │   └── null_multimodal_tokenizer.py
│   │   │       ├── models/
│   │   │       │   ├── __init__.py
│   │   │       │   └── default_tokenizer.py
│   │   │       └── vision_tokenizer.py
│   │   ├── transformer/
│   │   │   ├── __init__.py
│   │   │   ├── attention.py
│   │   │   ├── cuda_graphs.py
│   │   │   ├── custom_layers/
│   │   │   │   ├── __init__.py
│   │   │   │   └── batch_invariant_kernels.py
│   │   │   ├── dot_product_attention.py
│   │   │   ├── enums.py
│   │   │   ├── experimental_attention_variant/
│   │   │   │   ├── absorbed_mla.py
│   │   │   │   └── dsa.py
│   │   │   ├── fsdp_dtensor_checkpoint.py
│   │   │   ├── heterogeneous/
│   │   │   │   ├── heterogeneous_config.py
│   │   │   │   └── linear_replacements.py
│   │   │   ├── identity_op.py
│   │   │   ├── mlp.py
│   │   │   ├── module.py
│   │   │   ├── moe/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── experts.py
│   │   │   │   ├── fused_a2a.py
│   │   │   │   ├── moe_layer.py
│   │   │   │   ├── moe_utils.py
│   │   │   │   ├── router.py
│   │   │   │   ├── router_replay.py
│   │   │   │   ├── shared_experts.py
│   │   │   │   ├── token_dispatcher.py
│   │   │   │   ├── token_dispatcher_inference.py
│   │   │   │   └── upcycling_utils.py
│   │   │   ├── multi_latent_attention.py
│   │   │   ├── multi_token_prediction.py
│   │   │   ├── pipeline_parallel_layer_layout.py
│   │   │   ├── spec_utils.py
│   │   │   ├── torch_layer_norm.py
│   │   │   ├── torch_norm.py
│   │   │   ├── transformer_block.py
│   │   │   ├── transformer_config.py
│   │   │   ├── transformer_layer.py
│   │   │   └── utils.py
│   │   ├── typed_torch.py
│   │   └── utils.py
│   ├── inference/
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── legacy/
│   │   ├── fp16_deprecated/
│   │   │   └── loss_scaler.py
│   │   ├── fused_kernels/
│   │   │   ├── __init__.py
│   │   │   ├── compat.h
│   │   │   ├── tests/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_fused_kernels.py
│   │   │   └── type_shim.h
│   │   └── model/
│   │       ├── __init__.py
│   │       ├── bert_model.py
│   │       ├── biencoder_model.py
│   │       ├── classification.py
│   │       ├── enums.py
│   │       ├── fused_bias_gelu.py
│   │       ├── fused_layer_norm.py
│   │       ├── fused_softmax.py
│   │       ├── gpt_model.py
│   │       ├── language_model.py
│   │       ├── module.py
│   │       ├── multiple_choice.py
│   │       ├── realm_model.py
│   │       ├── rms_norm.py
│   │       ├── t5_model.py
│   │       ├── transformer.py
│   │       ├── utils.py
│   │       └── vision/
│   │           ├── classification.py
│   │           ├── dino.py
│   │           ├── esvit_swin_backbone.py
│   │           ├── inpainting.py
│   │           ├── knn_monitor.py
│   │           ├── mit_backbone.py
│   │           ├── swin_backbone.py
│   │           ├── utils.py
│   │           └── vit_backbone.py
│   ├── post_training/
│   │   ├── __init__.py
│   │   ├── arguments.py
│   │   ├── checkpointing.py
│   │   ├── generate.py
│   │   ├── loss_func.py
│   │   ├── model_builder.py
│   │   ├── non_loss_data_func.py
│   │   └── utils.py
│   ├── rl/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── agent/
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── huggingface_dataset_agent.py
│   │   │   ├── pass_at_evaluation_agent.py
│   │   │   ├── remote_agent.py
│   │   │   ├── reward_only_agent.py
│   │   │   └── weighted_multi_task.py
│   │   ├── inference/
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── inference_interface.py
│   │   │   └── megatron.py
│   │   ├── logging.py
│   │   ├── parallel_utils.py
│   │   ├── rl_utils.py
│   │   ├── sequence_packing_utils.py
│   │   └── server/
│   │       ├── __init__.py
│   │       ├── agent/
│   │       │   ├── __init__.py
│   │       │   └── fastapi_env_server.py
│   │       ├── api.py
│   │       └── inference/
│   │           ├── __init__.py
│   │           └── inference_interface_server.py
│   └── training/
│       ├── __init__.py
│       ├── argument_utils.py
│       ├── arguments.py
│       ├── async_utils.py
│       ├── checkpointing.py
│       ├── config/
│       │   ├── __init__.py
│       │   ├── common_config.py
│       │   ├── resilience_config.py
│       │   └── training_config.py
│       ├── datasets/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── data_samplers.py
│       │   ├── fim_dataset.py
│       │   └── sft_dataset.py
│       ├── dgrad_logging.py
│       ├── dist_signal_handler.py
│       ├── ft_integration.py
│       ├── global_vars.py
│       ├── initialize.py
│       ├── inprocess_restart.py
│       ├── log_handler.py
│       ├── one_logger_utils.py
│       ├── theoretical_memory_usage.py
│       ├── training.py
│       ├── utils.py
│       ├── wandb_utils.py
│       └── yaml_arguments.py
├── model_provider.py
├── pretrain_bert.py
├── pretrain_gpt.py
├── pretrain_mamba.py
├── pretrain_t5.py
├── pretrain_vlm.py
├── pyproject.toml
├── scripts/
│   └── check_api_backwards_compatibility.py
├── setup.py
├── tasks/
│   ├── data_utils.py
│   ├── eval_utils.py
│   └── finetune_utils.py
├── tests/
│   ├── README.md
│   ├── __init__.py
│   ├── functional_tests/
│   │   ├── __init__.py
│   │   ├── python_test_utils/
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── compute_golden_statistics.py
│   │   │   ├── conftest.py
│   │   │   ├── get_test_results_from_tensorboard_logs.py
│   │   │   ├── test_grpo_training_loop.py
│   │   │   ├── test_inference_regular_pipeline.py
│   │   │   ├── test_optimizer_grads_match.py
│   │   │   ├── test_pretraining_regular_pipeline.py
│   │   │   └── test_pretraining_resume_checkpoint_pipeline.py
│   │   ├── shell_test_utils/
│   │   │   ├── _run_training.sh
│   │   │   ├── run_batch_ci_tests.sh
│   │   │   ├── run_ci_test.sh
│   │   │   └── start_interactive_job.sh
│   │   └── test_cases/
│   │       ├── bert/
│   │       │   ├── bert_mcore_tp1_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp1_pp4_vp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_frozen_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_local_spec/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_resume_torch_dist_local_spec/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp4_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── bert_release_sm/
│   │       │       ├── golden_values_dev_dgx_gb200.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── common/
│   │       │   ├── ckpt_converter/
│   │       │   │   ├── __main__.py
│   │       │   │   └── model_config.yaml
│   │       │   └── moe_perf/
│   │       │       ├── __main__.py
│   │       │       ├── baseline.json
│   │       │       └── test_cases.py
│   │       ├── gpt/
│   │       │   ├── gpt3_15b_8t_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_gb200/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_sm_gb200/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_7b_tp1_pp4_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_7b_tp4_pp1_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_disable/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_enable/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_persistent_1/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_persistent_2/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_reshard/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_resume/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_resume_check_grads/
│   │       │   │   ├── README.md
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_transient/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_mup/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_uniform_full_recompute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_rope_embeddings/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_sequence_parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_gdn/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/
│   │       │   │   └── golden_values_dev_dgxh100_dgxc.json
│   │       │   ├── gpt3_mcore_te_tp2_pp2_mla/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2_fp16/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp4/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp4_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_uninstall_te/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1_resume_torch/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_weekly_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── tp_comm_overlap_cfg.yaml
│   │       │   ├── gpt3_weekly_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/
│   │       │   │   ├── cuda_graphs.py
│   │       │   │   ├── cuda_graphs.sh
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_basic_function/
│   │       │   │   ├── env_config.yaml
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_throughput/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_throughput_github/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/
│   │       │   │   ├── README.md
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── test_prompts.jsonl
│   │       │   ├── gpt_static_inference_tp1_pp1_583m_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── gpt_static_inference_tp1_pp1_583m_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_a100.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── gpt-nemo/
│   │       │   ├── bert-nemo_340m_mr_mbs2_gbs32_mcore_te_tp2_pp2_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gemma2-nemo_2b_mr_mbs1_gbs8_mcore_te_tp4_pp1_cp1_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── llama3-nemo_8b_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp2_dgx_a100_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── llama3-nemo_8b_mr_mbs4_gbs64_mcore_te_tp1_pp1_cp2_dgx_a100_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral-nemo_8x7b_mr_mbs1_gbs8_mcore_te_tp2_pp1_ep2_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   └── t5-nemo_220m_mr_mbs4_gbs64_te_tp1_pp1_1N8G/
│   │       │       └── model_config.yaml
│   │       ├── hybrid/
│   │       │   ├── hybrid_dynamic_inference_tp1_pp1_dp8_583m/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp2_vpp2_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_static_inference_tp1_pp1_2B_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── hybrid_static_inference_tp1_pp1_2B_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── mimo/
│   │       │   ├── mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8_seq_packing/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── mimo_vlm_pretrain_convergence_tp1_pp1_cp2_dp8/
│   │       │       ├── golden_values_dev.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── mixtral/
│   │       │   ├── deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release/
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x22b_tp2pp8ep8vpp1_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x7b_alltoall_tp2pp4ep4_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── mixtral_8x7b_tp1pp4ep8vpp8_release/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       ├── golden_values_lts_dgx_a100.json
│   │       │       └── model_config.yaml
│   │       ├── moe/
│   │       │   ├── deepseek_proxy_fsdp_ep2_fsdp2/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseek_proxy_fsdp_ep2_fsdp2_1node/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq_suspend_resume/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── prompts.json
│   │       │   ├── gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── multimodal-llava/
│   │       │   ├── multimodal_llava_mcore_te_tp1_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── multimodal_llava_mcore_te_tp4_sp_cp2/
│   │       │       ├── golden_values_dev_dgx_a100.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       ├── golden_values_lts_dgx_a100.json
│   │       │       └── model_config.yaml
│   │       └── t5/
│   │           ├── t5_11b_mcore_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp1_pp1_vp1_resume_torch/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_a100_2nd.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp2_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp2_pp1_vp1_sequence_parallel/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp4_pp1_resume_torch_dist/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp1_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp1_pp1_vp1_resume_torch/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_a100_2nd.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp2_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp4_pp1_resume_torch_dist/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_release/
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_release_sm/
│   │           │   ├── golden_values_dev_dgx_gb200.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_weekly_mcore_te_tp2_pp1_vp1/
│   │           │   └── golden_values_lts_dgx_a100.json
│   │           └── t5_weekly_mcore_te_tp2_pp1_vp1_sequence_parallel/
│   │               └── golden_values_lts_dgx_a100.json
│   ├── test_utils/
│   │   ├── python_scripts/
│   │   │   ├── approve_merge_gate.py
│   │   │   ├── auto_reminder.py
│   │   │   ├── auto_reminder_github.py
│   │   │   ├── check_status_of_main.py
│   │   │   ├── dashboard.py
│   │   │   ├── download_coverage_results.py
│   │   │   ├── download_golden_values.py
│   │   │   ├── download_unit_tests_dataset.py
│   │   │   ├── generate_jet_trigger_job.py
│   │   │   ├── generate_local_jobs.py
│   │   │   ├── launch_jet_workload.py
│   │   │   ├── launch_nemo_run_workload.py
│   │   │   ├── notify.py
│   │   │   ├── recipe_parser.py
│   │   │   ├── swap_pr_labels.py
│   │   │   └── wait_for_resources.py
│   │   └── recipes/
│   │       ├── _build-mcore-dev.yaml
│   │       ├── _build-mcore-lts.yaml
│   │       ├── _build-nemo.yaml
│   │       ├── gb200/
│   │       │   ├── gpt.yaml
│   │       │   ├── moe-1node.yaml
│   │       │   ├── moe.yaml
│   │       │   └── unit-tests.yaml
│   │       └── h100/
│   │           ├── bert.yaml
│   │           ├── ckpt_converter.yaml
│   │           ├── gpt-dynamic-inference-cuda-graphs.yaml
│   │           ├── gpt-dynamic-inference-with-coordinator.yaml
│   │           ├── gpt-dynamic-inference.yaml
│   │           ├── gpt-grads.yaml
│   │           ├── gpt-grpo.yaml
│   │           ├── gpt-nemo.yaml
│   │           ├── gpt-static-inference.yaml
│   │           ├── gpt.yaml
│   │           ├── mamba-dynamic-inference.yaml
│   │           ├── mamba-static-inference.yaml
│   │           ├── mamba.yaml
│   │           ├── mimo.yaml
│   │           ├── module_performance.yaml
│   │           ├── moe-dynamic-inference-with-coordinator.yaml
│   │           ├── moe-dynamic-inference.yaml
│   │           ├── moe-grpo.yaml
│   │           ├── moe-static-inference.yaml
│   │           ├── moe.yaml
│   │           ├── multimodal-llava.yaml
│   │           ├── t5.yaml
│   │           └── unit-tests.yaml
│   └── unit_tests/
│       ├── __init__.py
│       ├── a2a_overlap/
│       │   ├── test_cuda_graphed_schedule_chunk_1f1b.py
│       │   ├── test_schedule_chunk_1f1b.py
│       │   ├── test_schedule_layer_1f1b.py
│       │   └── utils.py
│       ├── conftest.py
│       ├── data/
│       │   ├── __init__.py
│       │   ├── test_bin_reader.py
│       │   ├── test_builder.py
│       │   ├── test_fim_dataset.py
│       │   ├── test_gpt_dataset.py
│       │   ├── test_multimodal_dataset.py
│       │   ├── test_preprocess_data.py
│       │   └── test_preprocess_mmdata.py
│       ├── dist_checkpointing/
│       │   ├── __init__.py
│       │   ├── conftest.py
│       │   ├── models/
│       │   │   ├── __init__.py
│       │   │   ├── common.py
│       │   │   ├── test_bert_model.py
│       │   │   ├── test_gpt_model.py
│       │   │   ├── test_mamba.py
│       │   │   ├── test_mlp_glu.py
│       │   │   ├── test_moe_experts.py
│       │   │   └── test_t5_model.py
│       │   ├── test_async_save.py
│       │   ├── test_checkpointable.py
│       │   ├── test_fp8.py
│       │   ├── test_fully_parallel.py
│       │   ├── test_global_metadata_reuse.py
│       │   ├── test_layer_wise_optimizer.py
│       │   ├── test_local.py
│       │   ├── test_mapping.py
│       │   ├── test_msc.py
│       │   ├── test_nonpersistent.py
│       │   ├── test_optimizer.py
│       │   ├── test_pipeline_parallel_layout.py
│       │   ├── test_replication.py
│       │   ├── test_safe_globals.py
│       │   ├── test_serialization.py
│       │   ├── test_strict.py
│       │   ├── test_torch_dist.py
│       │   └── utils.py
│       ├── distributed/
│       │   ├── megatron_fsdp/
│       │   │   ├── test_mcore_fully_sharded_data_parallel.py
│       │   │   ├── test_mfsdp_fully_shard.py
│       │   │   └── utils.py
│       │   ├── test_distributed_data_parallel.py
│       │   ├── test_finalize_model_grads.py
│       │   ├── test_grad_reduce_for_replicated_embedder.py
│       │   ├── test_grad_sync_with_expert_parallel.py
│       │   ├── test_param_and_grad_buffer.py
│       │   ├── test_reduce_scatter_with_fp32_accumulation.py
│       │   └── test_torch_fully_sharded_parallel.py
│       ├── export/
│       │   └── trtllm/
│       │       ├── __init__.py
│       │       ├── test_distributed_fp8.py
│       │       ├── test_single_device_fp8.py
│       │       ├── test_trtllm_distributed_gpu_converter.py
│       │       ├── test_trtllm_helper.py
│       │       ├── test_trtllm_layers.py
│       │       └── test_trtllm_single_device_converter.py
│       ├── extension/
│       │   └── test_kitchen_sdpa.py
│       ├── find_test_cases.py
│       ├── fusions/
│       │   ├── test_bias_dropout_fusion.py
│       │   ├── test_mla_yarn_rope_apply.py
│       │   ├── test_rmsnorm_residual_fusion.py
│       │   ├── test_swiglu_fusion.py
│       │   ├── test_torch_softmax.py
│       │   └── test_weighted_squared_relu_fusion.py
│       ├── inference/
│       │   ├── __init__.py
│       │   ├── contexts/
│       │   │   ├── attention_metadata/
│       │   │   │   ├── test_mamba_metadata.py
│       │   │   │   └── test_tensor_ops.py
│       │   │   ├── test_dynamic_context.py
│       │   │   └── test_dynamic_prefix_caching.py
│       │   ├── engines/
│       │   │   ├── __init__.py
│       │   │   ├── test_dynamic_engine.py
│       │   │   ├── test_dynamic_events.py
│       │   │   ├── test_mamba_prefix_caching_e2e.py
│       │   │   └── test_static_engine.py
│       │   ├── model_inference_wrappers/
│       │   │   ├── __init__.py
│       │   │   ├── gpt/
│       │   │   │   └── test_gpt_inference_wrapper.py
│       │   │   └── t5/
│       │   │       └── test_t5_inference_wrapper.py
│       │   ├── test_batch_dimension_utils.py
│       │   ├── test_common_inference_params.py
│       │   ├── test_communication_utils.py
│       │   ├── test_data_parallel_inference_coordinator.py
│       │   ├── test_dynamic_prefix_caching_coordinator.py
│       │   ├── test_flash_decode.py
│       │   ├── test_inference_config.py
│       │   ├── test_inference_utils.py
│       │   ├── test_moe_inference.py
│       │   ├── test_moe_permute.py
│       │   ├── test_mxfp8_utils.py
│       │   ├── test_scheduler.py
│       │   ├── test_stop_words.py
│       │   ├── test_wandb_logging.py
│       │   └── text_generation_controllers/
│       │       ├── __init__.py
│       │       ├── test_encoder_decoder_text_generation_controller.py
│       │       ├── test_text_generation_controller.py
│       │       └── test_vlm_text_generation_controller.py
│       ├── models/
│       │   ├── __init__.py
│       │   ├── test_base_embedding.py
│       │   ├── test_bert_model.py
│       │   ├── test_clip_vit_model.py
│       │   ├── test_gpt_model.py
│       │   ├── test_gpt_model_batch_invariant.py
│       │   ├── test_gpt_model_quantization.py
│       │   ├── test_heterogeneous_gpt_model.py
│       │   ├── test_llava_model.py
│       │   ├── test_mamba_model.py
│       │   ├── test_mamba_moe_model.py
│       │   ├── test_mimo_audio_submodules.py
│       │   ├── test_mimo_embedding_alignment.py
│       │   ├── test_mimo_model.py
│       │   ├── test_mimo_partition.py
│       │   ├── test_mimo_submodules.py
│       │   ├── test_multimodal_projector.py
│       │   ├── test_radio_model.py
│       │   └── test_t5_model.py
│       ├── optimizer/
│       │   ├── __init__.py
│       │   └── test_optimizer_config.py
│       ├── pipeline_parallel/
│       │   ├── __init__.py
│       │   ├── test_bridge_communicator.py
│       │   ├── test_fine_grained_activation_offloading.py
│       │   ├── test_helpers.py
│       │   ├── test_multimodule_communicator.py
│       │   ├── test_multimodule_schedules.py
│       │   ├── test_pipeline_layout.py
│       │   └── test_schedules.py
│       ├── post_training/
│       │   ├── __init__.py
│       │   ├── test_modelopt_model_builder.py
│       │   └── test_modelopt_module_spec.py
│       ├── resharding/
│       │   ├── test_communication_scheduler.py
│       │   ├── test_dp_balancing.py
│       │   ├── test_model_swap.py
│       │   ├── test_mxfp8_refit.py
│       │   ├── test_task_segmenter.py
│       │   └── test_workload_packer.py
│       ├── rl/
│       │   ├── test_grouped_rollouts.py
│       │   ├── test_rl_batch_invariant.py
│       │   ├── test_rl_utils.py
│       │   └── test_sequence_packing_utils.py
│       ├── run_ci_test.sh
│       ├── ssm/
│       │   ├── ops/
│       │   │   ├── test_causal_conv1d_varlen.py
│       │   │   ├── test_ops_init.py
│       │   │   ├── test_ssd_bmm.py
│       │   │   ├── test_ssd_chunk_scan.py
│       │   │   ├── test_ssd_chunk_state.py
│       │   │   ├── test_ssd_combined.py
│       │   │   ├── test_ssd_state_passing.py
│       │   │   └── test_ssm_kernel.py
│       │   ├── test_causal_conv1d_triton.py
│       │   ├── test_gated_delta_net.py
│       │   ├── test_mamba_block.py
│       │   ├── test_mamba_context_parallel.py
│       │   ├── test_mamba_hybrid_layer_allocation.py
│       │   ├── test_mamba_layer.py
│       │   └── test_mamba_mixer.py
│       ├── tensor_parallel/
│       │   ├── __init__.py
│       │   ├── test_cross_entropy.py
│       │   ├── test_data.py
│       │   ├── test_initialization.py
│       │   ├── test_layers.py
│       │   ├── test_mappings.py
│       │   ├── test_random.py
│       │   └── test_tensor_parallel_utils.py
│       ├── test_api_backwards_compat_setup.py
│       ├── test_argument_utils.py
│       ├── test_basic.py
│       ├── test_checkpointing.py
│       ├── test_fp8_param.py
│       ├── test_fp8_utils.py
│       ├── test_hyper_comm_grid.py
│       ├── test_imports.py
│       ├── test_inference.py
│       ├── test_layer_wise_optimizer.py
│       ├── test_lion_optimizer.py
│       ├── test_local_multi_tensor_fns.py
│       ├── test_model_configs.py
│       ├── test_muon_optimizer.py
│       ├── test_nccl_allocator.py
│       ├── test_num_microbatches_calculator.py
│       ├── test_optimizer.py
│       ├── test_optimizer_cpu_offloading.py
│       ├── test_optimizer_param_scheduler.py
│       ├── test_parallel_state.py
│       ├── test_process_groups_config.py
│       ├── test_training.py
│       ├── test_typed_torch.py
│       ├── test_utilities.py
│       ├── test_utils.py
│       ├── tokenizers/
│       │   └── test_tokenizer.py
│       ├── transformer/
│       │   ├── __init__.py
│       │   ├── experimental_attention_variant/
│       │   │   ├── test_absorbed_mla.py
│       │   │   └── test_attention_variant_dsa.py
│       │   ├── moe/
│       │   │   ├── __init__.py
│       │   │   ├── conftest.py
│       │   │   ├── test_a2a_token_dispatcher.py
│       │   │   ├── test_aux_loss.py
│       │   │   ├── test_grouped_mlp.py
│       │   │   ├── test_latent_moe_layer.py
│       │   │   ├── test_moe_layer.py
│       │   │   ├── test_moe_layer_discrepancy.py
│       │   │   ├── test_multihot_indices_converter.py
│       │   │   ├── test_router_replay.py
│       │   │   ├── test_routers.py
│       │   │   ├── test_sequential_mlp.py
│       │   │   ├── test_shared_experts.py
│       │   │   ├── test_token_dispatcher.py
│       │   │   └── test_upcycling.py
│       │   ├── test_attention.py
│       │   ├── test_attention_no_rope.py
│       │   ├── test_attention_packed_seq.py
│       │   ├── test_core_attention.py
│       │   ├── test_cuda_graphs.py
│       │   ├── test_full_cuda_graph.py
│       │   ├── test_mlp.py
│       │   ├── test_module.py
│       │   ├── test_multi_latent_attention.py
│       │   ├── test_multi_token_prediction.py
│       │   ├── test_mup.py
│       │   ├── test_quantization_config.py
│       │   ├── test_relative_attention.py
│       │   ├── test_rope.py
│       │   ├── test_spec_customization.py
│       │   ├── test_submodule_callables.py
│       │   ├── test_te_layers_batch_invariant.py
│       │   ├── test_thd_correctness.py
│       │   ├── test_transformer_block.py
│       │   ├── test_transformer_block_custom_pgs.py
│       │   ├── test_transformer_layer.py
│       │   ├── test_utils.py
│       │   └── test_vision_cuda_graphs.py
│       └── utils/
│           └── test_experimental_log_once.py
├── tools/
│   ├── __init__.py
│   ├── autoformat.sh
│   ├── bert_embedding/
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── embed.py
│   │   ├── external_libs.py
│   │   └── huggingface.py
│   ├── build_sequences_per_dataset.py
│   ├── check_copyright.py
│   ├── checkpoint/
│   │   ├── checkpoint_inspector.py
│   │   ├── convert.py
│   │   ├── hybrid_conversion.py
│   │   ├── loader_base.py
│   │   ├── loader_core.py
│   │   ├── loader_legacy.py
│   │   ├── loader_llama_mistral.py
│   │   ├── loader_llava.py
│   │   ├── loader_mixtral_hf.py
│   │   ├── saver_base.py
│   │   ├── saver_core.py
│   │   ├── saver_hf_llava.py
│   │   ├── saver_legacy.py
│   │   ├── saver_llava.py
│   │   ├── schema_base.py
│   │   ├── schema_core.py
│   │   ├── schema_hf.py
│   │   └── utils.py
│   ├── copyright.sh
│   ├── linter.py
│   ├── merge_datasets.py
│   ├── preprocess_data.py
│   ├── preprocess_data_nmt.py
│   ├── preprocess_mmdata.py
│   ├── report_theoretical_memory.py
│   ├── run_dynamic_text_generation_server.py
│   ├── run_inference_performance_test.py
│   ├── run_mamba_text_generation_server.py
│   ├── run_mamba_text_generation_server_completions.py
│   ├── run_text_generation_server.py
│   ├── run_vlm_text_generation.py
│   ├── text_generation_cli.py
│   ├── trigger_internal_ci.md
│   ├── trigger_internal_ci.py
│   ├── upgrade_dependencies.sh
│   └── wait_daemon.sh
└── train_rl.py

Download .txt

Showing preview only (519K chars total). Download the full file or copy to clipboard to get everything.

SYMBOL INDEX (5871 symbols across 514 files)

FILE: .github/scripts/oncall_manager.py
  function get_headers (line 37) | def get_headers():
  function get_repo_info (line 52) | def get_repo_info():
  function get_team_members (line 61) | def get_team_members(org, team_slug):
  function get_user_email (line 85) | def get_user_email(username):
  function get_slack_client (line 150) | def get_slack_client():
  function get_slack_user_id (line 158) | def get_slack_user_id(slack_client, email):
  function get_slack_usergroup_id (line 176) | def get_slack_usergroup_id(slack_client, handle):
  function update_slack_usergroup (line 192) | def update_slack_usergroup(new_oncall_username, old_members_usernames):
  function load_schedule (line 238) | def load_schedule():
  function save_schedule (line 255) | def save_schedule(schedule):
  function update_active_oncall_team (line 260) | def update_active_oncall_team(org, new_oncall):
  function rotate_schedule (line 289) | def rotate_schedule(repo_owner, dry_run=False):
  function get_last_wednesday (line 339) | def get_last_wednesday():
  function ensure_schedule_filled (line 345) | def ensure_schedule_filled(schedule, repo_owner):
  function assign_reviewer (line 393) | def assign_reviewer(pr_number):
  function main (line 408) | def main():

FILE: .github/scripts/sync_team_usergroups.py
  function get_headers (line 45) | def get_headers():
  function get_org (line 61) | def get_org():
  function github_team_to_slack_usergroup (line 67) | def github_team_to_slack_usergroup(team_slug):
  function get_child_teams (line 100) | def get_child_teams(org, parent_team_slug):
  function get_team_members (line 139) | def get_team_members(org, team_slug):
  function get_user_email (line 167) | def get_user_email(username):
  function get_slack_client (line 233) | def get_slack_client():
  function get_slack_user_id (line 242) | def get_slack_user_id(slack_client, email):
  function fetch_all_usergroups (line 261) | def fetch_all_usergroups(slack_client):
  function get_slack_usergroup_id (line 291) | def get_slack_usergroup_id(slack_client, handle):
  function github_team_to_usergroup_name (line 301) | def github_team_to_usergroup_name(team_slug):
  function create_slack_usergroup (line 312) | def create_slack_usergroup(slack_client, handle, team_slug):
  function sync_team_to_usergroup (line 356) | def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False):
  function get_team_to_usergroup_mapping (line 447) | def get_team_to_usergroup_mapping(parent_team_slug):
  function sync_all_teams (line 464) | def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None):
  function main (line 520) | def main():

FILE: .gitlab/scripts/check_imports.py
  class ImportChecker (line 31) | class ImportChecker:
    method __init__ (line 34) | def __init__(self, package_name: str = "megatron.core", verbose: bool ...
    method should_skip_module (line 59) | def should_skip_module(self, module_name: str) -> bool:
    method discover_modules (line 66) | def discover_modules(self, package_path: str) -> List[str]:
    method import_module (line 102) | def import_module(self, module_name: str) -> Tuple[str, str]:
    method check_all_imports (line 123) | def check_all_imports(self):
  function main (line 200) | def main(package_name: str):

FILE: docs/add_copyright_header.py
  function main (line 15) | def main():

FILE: docs/autodoc2_docstrings_parser.py
  class NapoleonParser (line 20) | class NapoleonParser(MystParser):
    method parse (line 23) | def parse(self, input_string: str, document: nodes.document) -> None:

FILE: examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py
  function get_corpus_scores (line 24) | def get_corpus_scores(lines):
  function main (line 37) | def main():

FILE: examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py
  class PerspectiveApiScorer (line 24) | class PerspectiveApiScorer:
    method __init__ (line 31) | def __init__(self):
    method get_scores (line 44) | def get_scores(self, input_text: str, requested_attributes: Optional[L...
  function test (line 73) | def test():
  function split_lines (line 79) | def split_lines(lines, split):
  function get_score (line 88) | def get_score(line):
  function get_scores (line 118) | def get_scores(lines):
  function get_annotated_datasets (line 150) | def get_annotated_datasets(lines, threads=10):
  function main (line 160) | def main():

FILE: examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py
  function model_provider (line 28) | def model_provider(pre_process=True, post_process=True):
  function get_batch (line 41) | def get_batch(data_iterator):
  function loss_func (line 72) | def loss_func(loss_mask, output_tensor):
  function forward_step (line 83) | def forward_step(data_iterator, model):
  function train_valid_test_datasets_provider (line 100) | def train_valid_test_datasets_provider(train_val_test_num_samples):
  function add_validation_args (line 141) | def add_validation_args(parser):

FILE: examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py
  function model_provider (line 29) | def model_provider(pre_process=True, post_process=True) -> Union[GPTMode...
  function add_text_generate_args (line 93) | def add_text_generate_args(parser):
  function generate_samples_unconditional (line 119) | def generate_samples_unconditional(model):
  function generate_samples_conditional (line 156) | def generate_samples_conditional(model):
  function generate_and_write_samples_unconditional (line 209) | def generate_and_write_samples_unconditional(model):
  function generate_and_write_samples_conditional (line 218) | def generate_and_write_samples_conditional(model):
  function main (line 232) | def main():

FILE: examples/academic_paper_scripts/detxoify_lm/perspective_api.py
  class PerspectiveApiScorer (line 24) | class PerspectiveApiScorer:
    method __init__ (line 31) | def __init__(self):
    method get_scores (line 44) | def get_scores(self, input_text: str, requested_attributes: Optional[L...
  function test (line 73) | def test():
  function get_score (line 80) | def get_score(x):
  function main (line 92) | def main():

FILE: examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
  function initialize_distributed (line 17) | def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_...
  function model_provider (line 29) | def model_provider():
  function load_distributed_checkpoint (line 49) | def load_distributed_checkpoint(checkpoint_path, gpt_model):

FILE: examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
  function initialize_distributed (line 18) | def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_...
  function model_provider (line 30) | def model_provider():
  function load_distributed_checkpoint (line 50) | def load_distributed_checkpoint(checkpoint_path, gpt_model):

FILE: examples/gptoss/01_convert_from_hf.py
  function _parse_args (line 10) | def _parse_args():

FILE: examples/gptoss/03_convert_to_hf.py
  function _parse_args (line 10) | def _parse_args():

FILE: examples/inference/gpt/gpt_dynamic_inference.py
  function run_inference (line 58) | def run_inference(
  function main (line 279) | def main():

FILE: examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
  function suspend_resume_cycle (line 33) | async def suspend_resume_cycle(client, engine, args, futures):
  function main (line 49) | async def main(

FILE: examples/inference/gpt/gpt_static_inference.py
  function add_static_inference_args (line 37) | def add_static_inference_args(parser):
  function get_inference_engine (line 55) | def get_inference_engine(args: Namespace, model: MegatronModule) -> Stat...
  function generate (line 84) | async def generate(
  function main (line 119) | def main():

FILE: examples/inference/gpt/utils.py
  function get_default_sampling_params (line 23) | def get_default_sampling_params(termination_id: int = None):
  function get_curr_time (line 34) | def get_curr_time() -> float:
  class Request (line 42) | class Request:
    method __init__ (line 57) | def __init__(
    method __str__ (line 81) | def __str__(self) -> str:
  function get_time_offsets (line 91) | def get_time_offsets(
  function get_cli_requests (line 136) | def get_cli_requests(
  function get_synthetic_requests (line 153) | def get_synthetic_requests(
  function get_requests_from_file (line 188) | def get_requests_from_file(
  function build_requests (line 230) | def build_requests(
  function get_model_size_str (line 244) | def get_model_size_str(model):
  function build_dynamic_engine_setup_prefix (line 253) | def build_dynamic_engine_setup_prefix(
  function get_global_peak_memory_stats_bytes (line 315) | def get_global_peak_memory_stats_bytes() -> dict:

FILE: examples/inference/t5/simple_t5_batch_inference.py
  function add_text_generate_args (line 38) | def add_text_generate_args(parser):
  function get_inference_engine (line 70) | def get_inference_engine(args: Namespace, model: MegatronModule) -> Abst...
  function main (line 102) | def main():

FILE: examples/mimo/avlm_inference.py
  function init_distributed (line 23) | def init_distributed(tp_size: int = 1, pp_size: int = 1):
  function get_input_data (line 32) | def get_input_data(
  function main (line 129) | def main():
  function load_distributed_checkpoint (line 213) | def load_distributed_checkpoint(model: torch.nn.Module, ckpt_dir: str):

FILE: examples/mimo/configs/llava_avlm.py
  function get_llava_projection_config (line 25) | def get_llava_projection_config(
  function get_vicuna_language_layer_spec (line 46) | def get_vicuna_language_layer_spec() -> ModuleSpec:
  function get_llava_projection_layer_spec (line 50) | def get_llava_projection_layer_spec() -> ModuleSpec:

FILE: examples/mimo/configs/llava_vlm.py
  function get_vicuna_language_model_config (line 24) | def get_vicuna_language_model_config(
  function get_llava_projection_config (line 82) | def get_llava_projection_config(
  function get_vicuna_language_layer_spec (line 103) | def get_vicuna_language_layer_spec() -> ModuleSpec:
  function get_llava_projection_layer_spec (line 107) | def get_llava_projection_layer_spec() -> ModuleSpec:

FILE: examples/mimo/configs/mock.py
  function get_mock_language_model_config (line 28) | def get_mock_language_model_config(config: Optional[TransformerConfig] =...
  function get_mock_vision_model_config (line 47) | def get_mock_vision_model_config(config: Optional[TransformerConfig] = N...
  function get_mock_projection_config (line 76) | def get_mock_projection_config(hidden_size: int = 128) -> TransformerCon...
  function get_mock_language_layer_spec (line 97) | def get_mock_language_layer_spec():
  function get_mock_vision_layer_spec (line 107) | def get_mock_vision_layer_spec():
  function get_mock_projection_layer_spec (line 120) | def get_mock_projection_layer_spec():

FILE: examples/mimo/data/avlm_sample_loader.py
  function sample_loader (line 3) | def sample_loader(raw: dict) -> dict:
  function part_filter (line 85) | def part_filter(part: str) -> bool:

FILE: examples/mimo/data/energon_avlm_task_encoder.py
  class ConversationTemplateConfig (line 50) | class ConversationTemplateConfig:
  class LlavaConversationTemplateConfig (line 56) | class LlavaConversationTemplateConfig(ConversationTemplateConfig):
  class VisionAudioQASample (line 64) | class VisionAudioQASample(VQASample):
  class AVLMModelType (line 74) | class AVLMModelType(Enum):
  class AVLMTaskEncoder (line 78) | class AVLMTaskEncoder(
    method __init__ (line 86) | def __init__(
    method apply_prompt_template (line 100) | def apply_prompt_template(self, input_text: VisionAudioQASample):
    method _find_pattern_indices (line 157) | def _find_pattern_indices(
    method encode_sample (line 169) | def encode_sample(self, sample: VisionAudioQASample):
    method batch (line 289) | def batch(self, samples: List[Dict]) -> Dict:
    method encode_batch_avlm_clip_whisper_llava (line 314) | def encode_batch_avlm_clip_whisper_llava(self, batch_data: Dict) -> Dict:
    method encode_batch (line 351) | def encode_batch(self, batch_data: Dict) -> dict:
  function llava_avlm_dataloader_provider (line 358) | def llava_avlm_dataloader_provider(train_val_test_num_samples):
  class KeyProcessor (line 445) | class KeyProcessor(Protocol):
    method __call__ (line 448) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor:  # pra...
  class StackProcessor (line 452) | class StackProcessor:
    method __init__ (line 455) | def __init__(self, dim: int = 0):
    method __call__ (line 458) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor:
  class PaddingProcessor (line 462) | class PaddingProcessor:
    method __init__ (line 465) | def __init__(self, pad_value: int, batch_first: bool = True):
    method __call__ (line 469) | def __call__(self, values: List[torch.Tensor]) -> torch.Tensor:

FILE: examples/mimo/data/energon_vlm_task_encoder.py
  class ConversationTemplateConfig (line 44) | class ConversationTemplateConfig:
  class LlavaConversationTemplateConfig (line 51) | class LlavaConversationTemplateConfig(ConversationTemplateConfig):
  class ModelType (line 57) | class ModelType(Enum):
  function predict_seq_len_with_padding (line 61) | def predict_seq_len_with_padding(instance_tokens: torch.Tensor, pad_to_m...
  function group_samples (line 73) | def group_samples(samples: List[Dict[str, torch.Tensor]],
  class VLMTaskEncoder (line 110) | class VLMTaskEncoder(
    method __init__ (line 118) | def __init__(
    method apply_prompt_template (line 145) | def apply_prompt_template(self, input_text: VQASample):
    method _find_pattern_indices (line 202) | def _find_pattern_indices(
    method select_samples_to_pack (line 213) | def select_samples_to_pack(self, samples: List[Dict[str, torch.Tensor]...
    method pack_selected_samples (line 237) | def pack_selected_samples(self, samples: List[Dict[str, torch.Tensor]]...
    method encode_sample (line 335) | def encode_sample(self, sample: VQASample):
    method batch (line 385) | def batch(self, samples: List[Dict]) -> Dict:
    method encode_batch_vlm_clip_llava (line 437) | def encode_batch_vlm_clip_llava(self, batch_data: Dict) -> Dict:
    method encode_batch_vlm_clip_llava_video (line 469) | def encode_batch_vlm_clip_llava_video(self, batch_data: Dict) -> Dict:
    method encode_batch (line 494) | def encode_batch(self, batch_data: Dict) -> dict:
  function llava_vlm_dataloader_provider (line 502) | def llava_vlm_dataloader_provider(train_val_test_num_samples, max_seq_le...
  class KeyProcessor (line 573) | class KeyProcessor(Protocol):
    method __call__ (line 576) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...
  class StackProcessor (line 580) | class StackProcessor:
    method __init__ (line 583) | def __init__(self, dim: int = 0):
    method __call__ (line 586) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...
  class PaddingProcessor (line 594) | class PaddingProcessor:
    method __init__ (line 597) | def __init__(self, pad_value: int, batch_first: bool = True):
    method _pad_and_stack (line 601) | def _pad_and_stack(self, tensors: List[torch.Tensor], max_len: int, pa...
    method __call__ (line 616) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...
  class PackingKwargsProcessor (line 624) | class PackingKwargsProcessor:
    method __call__ (line 627) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...
  class GenericStackProcessor (line 633) | class GenericStackProcessor:
    method __init__ (line 635) | def __init__(self, dim: int = 0):
    method __call__ (line 638) | def __call__(self, values: List[torch.Tensor], max_len: Optional[int] ...

FILE: examples/mimo/data/mock.py
  function create_mock_image (line 15) | def create_mock_image(image_size: int = 336) -> torch.Tensor:
  function create_mock_caption (line 28) | def create_mock_caption() -> str:
  class MockVLMDataset (line 38) | class MockVLMDataset(Dataset):
    method __init__ (line 41) | def __init__(
    method __len__ (line 81) | def __len__(self) -> int:
    method __getitem__ (line 85) | def __getitem__(self, idx: int) -> Dict:
    method _mock_tokenize (line 134) | def _mock_tokenize(self) -> torch.Tensor:
  function get_mock_vlm_dataloader (line 165) | def get_mock_vlm_dataloader(
  function _collate_fn (line 211) | def _collate_fn(batch: List[Dict]) -> Dict[str, torch.Tensor]:
  function train_valid_test_datasets_provider (line 240) | def train_valid_test_datasets_provider(train_val_test_num_samples):

FILE: examples/mimo/data/prepare_video_llava_data.py
  function _extract_archives (line 11) | def _extract_archives(root: str):
  function convert_llava_video_to_wds (line 23) | def convert_llava_video_to_wds(dataset_root: str, shard_size: int = 8000):

FILE: examples/mimo/data/utils/calculate_audio_tokens.py
  function calculate_num_mel_frames (line 18) | def calculate_num_mel_frames(audio_length, sample_rate, window_stride, w...
  function calculate_num_audio_tokens (line 44) | def calculate_num_audio_tokens(audio_tensor, model_name):

FILE: examples/mimo/model_providers/hf_clip_encoder.py
  class HFCLIPEncoderWrapper (line 10) | class HFCLIPEncoderWrapper(torch.nn.Module):
    method __init__ (line 13) | def __init__(self, feature_layer_index=-2, is_video_input: bool = False):
    method forward (line 30) | def forward(self, pixel_values: torch.Tensor):

FILE: examples/mimo/model_providers/hf_whisper_encoder.py
  class HFWhisperEncoderWrapper (line 6) | class HFWhisperEncoderWrapper(torch.nn.Module):
    method __init__ (line 9) | def __init__(self, model_name: str):
    method forward (line 13) | def forward(self, input_features, seq_lengths=None):

FILE: examples/mimo/model_providers/llava_avlm.py
  function model_provider_llava_avlm (line 31) | def model_provider_llava_avlm(

FILE: examples/mimo/model_providers/llava_vlm.py
  function model_provider_llava_vlm (line 29) | def model_provider_llava_vlm(

FILE: examples/mimo/model_providers/mock.py
  function model_provider_mock_vlm_single_encoder (line 28) | def model_provider_mock_vlm_single_encoder(

FILE: examples/mimo/train.py
  function add_mimo_args (line 52) | def add_mimo_args(parser):
  function get_batch (line 86) | def get_batch(data_iterator: Iterator[Dict[str, Any]]):
  function loss_func (line 139) | def loss_func(loss_mask, output_tensor):
  function forward_step (line 176) | def forward_step(data_iterator, model):
  function train_valid_test_datasets_provider (line 193) | def train_valid_test_datasets_provider(*provider_args, **provider_kwargs):
  function model_provider (line 219) | def model_provider(

FILE: examples/mimo/utils/data_helpers.py
  function flatten (line 15) | def flatten(
  function regroup (line 32) | def regroup(flat: List[Tuple[Tuple[str, ...], torch.Tensor]]) -> Dict[st...
  function broadcast_nested_data_batch (line 43) | def broadcast_nested_data_batch(nested_dict: Dict[str, Any]) -> Dict[str...

FILE: examples/mimo/utils/logging.py
  function print_mimo_structure (line 9) | def print_mimo_structure(model):

FILE: examples/mimo/utils/model_helpers.py
  function load_submodule_ckpt (line 10) | def load_submodule_ckpt(module: torch.nn.Module, ckpt_dir: str):

FILE: examples/multimodal/combine_state_dicts.py
  function combine (line 15) | def combine(input_files, module_prefixes, output_files):

FILE: examples/multimodal/config.py
  function get_language_model_config (line 9) | def get_language_model_config(config):
  function get_vision_model_config (line 179) | def get_vision_model_config(config, apply_query_key_layer_scaling):
  function get_vision_projection_config (line 334) | def get_vision_projection_config(config, hidden_size):
  class EvaluationConfig (line 393) | class EvaluationConfig:

FILE: examples/multimodal/dataloader_provider.py
  function datasets_provider (line 27) | def datasets_provider(task_encoder,worker_config=None):
  function is_first_or_last_stage (line 71) | def is_first_or_last_stage(pp_size):
  function is_dataloader_rank (line 84) | def is_dataloader_rank():
  function train_valid_test_dataloaders_provider (line 95) | def train_valid_test_dataloaders_provider(train_val_test_num_samples, ta...
  class EnergonDataloader (line 152) | class EnergonDataloader:
    method __init__ (line 154) | def __init__(self, dataloader):
    method __next__ (line 158) | def __next__(self):
    method __iter__ (line 161) | def __iter__(self):
    method save_state (line 164) | def save_state(self):
  function cyclic_iter (line 168) | def cyclic_iter(iter):

FILE: examples/multimodal/dataset_helpers.py
  class ImageTaskSample (line 36) | class ImageTaskSample(Sample):
  class ImageTaskSamplePacked (line 50) | class ImageTaskSamplePacked(Sample):
  class ImageTaskBatchPacked (line 72) | class ImageTaskBatchPacked(Batch):
  function search_for_fit (line 95) | def search_for_fit(numbers: List[int], capacity: int) -> int:
  function greedy_knapsack (line 103) | def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: ...
  class TaskEncoder (line 145) | class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatc...
    method __init__ (line 148) | def __init__(
    method _get_total_seq_length (line 195) | def _get_total_seq_length(self, input_ids, num_tiles):
    method _truncate_for_packing (line 202) | def _truncate_for_packing(self, input_ids, target, num_tiles):
    method encode_sample (line 219) | def encode_sample(self, sample: Union[CaptioningSample, OCRSample, VQA...
    method encode_captioning (line 247) | def encode_captioning(self, sample: CaptioningSample):
    method encode_llava_pretrain (line 293) | def encode_llava_pretrain(self, sample: VQASample):
    method encode_sample_list (line 327) | def encode_sample_list(self, samples: SampleListSample):
    method encode_llava_sft (line 347) | def encode_llava_sft(self, sample: Union[SimilarityInterleavedSample, ...
    method target_has_trainable_tokens (line 533) | def target_has_trainable_tokens(self, input_ids, num_tiles, target):
    method replace_value_with_repetition (line 552) | def replace_value_with_repetition(self, arr, token_to_replace, num_rep...
    method encode_any_single_turn_vqa (line 581) | def encode_any_single_turn_vqa(self, sample):
    method combined_ocr_encoder (line 663) | def combined_ocr_encoder(self, sample, task_type):
    method encode_pdf_prompt (line 703) | def encode_pdf_prompt(self, sample: OCRSample) -> ImageTaskSample:
    method encode_ocr_ref_prompt (line 724) | def encode_ocr_ref_prompt(self, sample: OCRSample) -> ImageTaskSample:
    method bbox_coord_to_label (line 758) | def bbox_coord_to_label(self, text, bbox):
    method encode_ocr_prompt (line 772) | def encode_ocr_prompt(self, sample: OCRSample) -> ImageTaskSample:
    method batch (line 791) | def batch(self, samples: List[Union[ImageTaskSample, ImageTaskSamplePa...
    method encode_batch (line 864) | def encode_batch(self, batch: ImageTaskBatchPacked) -> dict:
    method select_samples_to_pack (line 869) | def select_samples_to_pack(self, samples: List[ImageTaskSample]) -> Li...
    method pack_selected_samples (line 882) | def pack_selected_samples(self, samples: List[ImageTaskSample]) -> Lis...
  function print_error_handler (line 945) | def print_error_handler(exc: Exception, key: Optional[str]):
  function format_multichoice_question (line 953) | def format_multichoice_question(question, multichoice_options):
  function format_multichoice_answer (line 964) | def format_multichoice_answer(idx):

FILE: examples/multimodal/energon_util.py
  class SampleListSample (line 10) | class SampleListSample(Sample):
  class OfflineTargetAspectRatioSample (line 21) | class OfflineTargetAspectRatioSample(Sample):

FILE: examples/multimodal/evaluation/evaluate_ai2d.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function ai2d_eval (line 38) | def ai2d_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_chartqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function chartqa_eval (line 35) | def chartqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_coco.py
  function convert_to_coco_format (line 10) | def convert_to_coco_format(input_path):
  function coco_captioning_eval (line 40) | def coco_captioning_eval(input_path, groundtruth_file):

FILE: examples/multimodal/evaluation/evaluate_infovqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function infovqa_eval (line 35) | def infovqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_mathvista.py
  function merge_input_files (line 10) | def merge_input_files(input_path):
  function extra_processing (line 36) | def extra_processing(text):
  function extract_answer (line 60) | def extract_answer(text):
  function compute_mathvista_accuracy (line 74) | def compute_mathvista_accuracy(result_file):
  function mathvista_eval (line 108) | def mathvista_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_mmmu.py
  function get_input_output_paths (line 21) | def get_input_output_paths(input_path, task):
  function extract_answer (line 38) | def extract_answer(text):
  function convert_to_mmmu_format (line 55) | def convert_to_mmmu_format(input_path):
  function mmmu_eval (line 87) | def mmmu_eval(input_path, groundtruth_path):
  function main (line 113) | def main():

FILE: examples/multimodal/evaluation/evaluate_ocrbench.py
  function merge_input_files (line 7) | def merge_input_files(input_path):
  function compute_ocrbench_score (line 33) | def compute_ocrbench_score(result_file):
  function ocrbench_eval (line 123) | def ocrbench_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_ocrbench_v2.py
  function convert_to_ocrbench_v2_format (line 10) | def convert_to_ocrbench_v2_format(input_path, groundtruth_path):
  function ocrbench_v2_eval (line 37) | def ocrbench_v2_eval(input_path, groundtruth_path, output_path):
  function main (line 71) | def main():

FILE: examples/multimodal/evaluation/evaluate_rd_tablebench.py
  function convert_to_rdtablebench_format (line 22) | def convert_to_rdtablebench_format(input_path):
  function rdtablebench_eval (line 42) | def rdtablebench_eval(input_path):
  function main (line 67) | def main():

FILE: examples/multimodal/evaluation/evaluate_realworldqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function realworldqa_eval (line 32) | def realworldqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_spdocvqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function spdocvqa_eval (line 35) | def spdocvqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_textvqa.py
  function merge_input_files (line 8) | def merge_input_files(input_path):
  function textvqa_eval (line 38) | def textvqa_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_video_motionbench.py
  function merge_input_files (line 9) | def merge_input_files(input_path):
  function motionbench_eval (line 33) | def motionbench_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_video_mvbench.py
  function merge_input_files (line 7) | def merge_input_files(input_path):
  function check_ans (line 36) | def check_ans(pred, gt):
  function create_result_dict (line 53) | def create_result_dict(result_list):
  function combine_all_res (line 83) | def combine_all_res(acc_dict):
  function mvbench_eval (line 98) | def mvbench_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_video_phys_game_bench.py
  function merge_input_files (line 7) | def merge_input_files(input_path):
  function check_ans (line 35) | def check_ans(pred, gt):
  function compute_all_acc (line 52) | def compute_all_acc(result_list):
  function phys_game_bench_eval (line 83) | def phys_game_bench_eval(input_path):

FILE: examples/multimodal/evaluation/evaluate_vqav2.py
  function levenshtein_distance (line 11) | def levenshtein_distance(s1: str, s2: str) -> int:
  function normalized_levenshtein_distance (line 29) | def normalized_levenshtein_distance(s1: str, s2: str) -> float:
  function similarity_function (line 34) | def similarity_function(prediction: str, gold_label: str, threshold: flo...
  function anls_score (line 38) | def anls_score(
  function merge_input_files (line 58) | def merge_input_files(input_path):
  function is_number (line 85) | def is_number(n: str):
  function compute_vqa_accuracy (line 94) | def compute_vqa_accuracy(result_file, task):
  function vqav2_eval (line 148) | def vqav2_eval(input_path):

FILE: examples/multimodal/evaluation/evaluation_datasets.py
  function _get_partition_bounds (line 17) | def _get_partition_bounds(
  class VQADataset (line 28) | class VQADataset(torch.utils.data.Dataset):
    method __init__ (line 31) | def __init__(
    method __len__ (line 69) | def __len__(self):
    method __getitem__ (line 72) | def __getitem__(self, idx):
  class CaptioningDataset (line 110) | class CaptioningDataset(torch.utils.data.Dataset):
    method __init__ (line 113) | def __init__(
    method __len__ (line 150) | def __len__(self):
    method __getitem__ (line 153) | def __getitem__(self, idx):
  class MMMUDataset (line 179) | class MMMUDataset(torch.utils.data.Dataset):
    method __init__ (line 182) | def __init__(
    method __len__ (line 255) | def __len__(self):
    method process_image_tag (line 258) | def process_image_tag(self, q):
    method __getitem__ (line 307) | def __getitem__(self, idx):
  class VideoMMEDataset (line 452) | class VideoMMEDataset(torch.utils.data.Dataset):
    method __init__ (line 455) | def __init__(
    method __len__ (line 500) | def __len__(self):
    method __getitem__ (line 503) | def __getitem__(self, idx):
  class OCRBenchDataset (line 553) | class OCRBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 556) | def __init__(
    method __len__ (line 587) | def __len__(self):
    method __getitem__ (line 590) | def __getitem__(self, idx):
  class MathVistaDataset (line 621) | class MathVistaDataset(torch.utils.data.Dataset):
    method __init__ (line 624) | def __init__(
    method __len__ (line 665) | def __len__(self):
    method __getitem__ (line 668) | def __getitem__(self, idx):
  class AI2DDataset (line 719) | class AI2DDataset(torch.utils.data.Dataset):
    method __init__ (line 722) | def __init__(
    method __len__ (line 756) | def __len__(self):
    method __getitem__ (line 759) | def __getitem__(self, idx):
  class RDTableBenchDataset (line 787) | class RDTableBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 788) | def __init__(
    method __len__ (line 828) | def __len__(self):
    method __getitem__ (line 831) | def __getitem__(self, idx):
  class RealworldQADataset (line 865) | class RealworldQADataset(torch.utils.data.Dataset):
    method __init__ (line 866) | def __init__(
    method __len__ (line 899) | def __len__(self):
    method __getitem__ (line 902) | def __getitem__(self, idx):
  class MotionBenchDataset (line 952) | class MotionBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 953) | def __init__(
    method __len__ (line 1007) | def __len__(self):
    method __getitem__ (line 1010) | def __getitem__(self, idx):
  class PhysGameBenchDataset (line 1057) | class PhysGameBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 1058) | def __init__(
    method __len__ (line 1104) | def __len__(self):
    method _qa_template (line 1107) | def _qa_template(self, data):
    method __getitem__ (line 1116) | def __getitem__(self, idx):
  class MVBenchDataset (line 1167) | class MVBenchDataset(torch.utils.data.Dataset):
    method __init__ (line 1168) | def __init__(
    method __len__ (line 1245) | def __len__(self):
    method get_index (line 1248) | def get_index(self, bound, fps, max_frame, first_idx=0):
    method qa_template (line 1262) | def qa_template(self, data):
    method read_frame (line 1276) | def read_frame(self, video_path, bound=None, fps=2):
    method read_video_ours (line 1285) | def read_video_ours(self, video_path, bound=None):
    method __getitem__ (line 1299) | def __getitem__(self, idx):
  class ExampleInferenceDataset (line 1342) | class ExampleInferenceDataset(torch.utils.data.Dataset):
    method __init__ (line 1343) | def __init__(
    method __len__ (line 1372) | def __len__(self):
    method __getitem__ (line 1375) | def __getitem__(self, idx):
  function get_evaluation_dataset (line 1408) | def get_evaluation_dataset(

FILE: examples/multimodal/evaluation/mmmu_utils.py
  function load_yaml (line 58) | def load_yaml(file_path):
  function parse_img_path (line 68) | def parse_img_path(text):
  function process_single_sample (line 73) | def process_single_sample(data):
  function construct_prompt (line 98) | def construct_prompt(sample, config):
  function parse_multi_choice_response (line 151) | def parse_multi_choice_response(response, all_choices, index2ans):
  function check_is_number (line 206) | def check_is_number(string):
  function normalize_str (line 218) | def normalize_str(string):
  function extract_numbers (line 243) | def extract_numbers(string):
  function parse_open_response (line 266) | def parse_open_response(response):
  function eval_multi_choice (line 321) | def eval_multi_choice(gold_i, pred_i):
  function eval_open (line 338) | def eval_open(gold_i, pred_i):
  function evaluate (line 367) | def evaluate(samples):
  function calculate_ins_level_acc (line 393) | def calculate_ins_level_acc(results: Dict):
  function mmmu_main_eval (line 405) | def mmmu_main_eval(output_dict, task_cfg):

FILE: examples/multimodal/image_processing.py
  function find_closest_aspect_ratio (line 31) | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height...
  function find_closest_area_weighted_aspect_ratio (line 47) | def find_closest_area_weighted_aspect_ratio(aspect_ratio, target_ratios,...
  class ImageTransform (line 65) | class ImageTransform:
    method __init__ (line 68) | def __init__(self, input_size, vision_model_type):
    method __call__ (line 72) | def __call__(self, img, img_h, img_w, use_tiling=False, max_num_tiles=...
  function dynamic_preprocess (line 88) | def dynamic_preprocess(
  function _build_transform (line 131) | def _build_transform(input_size, vision_model_type):

FILE: examples/multimodal/layer_scaling.py
  function _bias_dropout_add_func_layer_scaling (line 10) | def _bias_dropout_add_func_layer_scaling(ls, x_with_bias, residual, prob...
  function bias_dropout_add_unfused_layer_scaling (line 24) | def bias_dropout_add_unfused_layer_scaling(ls, training):
  function get_bias_dropout_add_layer_scaling (line 33) | def get_bias_dropout_add_layer_scaling(ls, training, fused):
  class LayerScalingTransformerLayer (line 40) | class LayerScalingTransformerLayer(TransformerLayer):
    method __init__ (line 42) | def __init__(self, *args, **kwargs):

FILE: examples/multimodal/layer_specs.py
  function get_layer_spec (line 54) | def get_layer_spec(is_vit, normalization) -> ModuleSpec:
  function get_layer_spec_te (line 98) | def get_layer_spec_te(is_vit=False, padding=False) -> ModuleSpec:
  function get_mamba_layer_spec_te (line 128) | def get_mamba_layer_spec_te(padding=False) -> ModuleSpec:
  function get_mlp_module_spec (line 187) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
  function get_norm_mlp_module_spec_te (line 198) | def get_norm_mlp_module_spec_te() -> ModuleSpec:

FILE: examples/multimodal/model.py
  function model_provider (line 18) | def model_provider(
  function _get_tile_tags (line 238) | def _get_tile_tags(args, tokenizer):

FILE: examples/multimodal/model_converter/clip_converter.py
  function convert (line 10) | def convert(download_root, output_path, tensor_parallel_size, use_te):

FILE: examples/multimodal/model_converter/internvit_converter.py
  function convert (line 8) | def convert(model_name, output_path, tensor_parallel_size, use_te):

FILE: examples/multimodal/model_converter/radio_converter.py
  function convert_radio_h (line 7) | def convert_radio_h(output_path, tensor_parallel_size, use_te, version):
  function convert_radio_g (line 127) | def convert_radio_g(output_path, tensor_parallel_size, use_te, version):
  function convert (line 279) | def convert(output_path, tensor_parallel_size, use_te, model_type, versi...

FILE: examples/multimodal/model_converter/siglip_converter.py
  function convert (line 8) | def convert(output_path, tensor_parallel_size, use_te):

FILE: examples/multimodal/model_converter/vision_model_tester.py
  function run_mcore_vision (line 24) | def run_mcore_vision(model_path):
  function run_hf_vision (line 74) | def run_hf_vision(model_name):
  function main (line 89) | def main(mcore_model, hf_model):

FILE: examples/multimodal/multimodal_args.py
  function add_multimodal_extra_args (line 5) | def add_multimodal_extra_args(parser):

FILE: examples/multimodal/nvlm/internvit.py
  class InternViTRMSNorm (line 61) | class InternViTRMSNorm(MegatronModule):
    method __init__ (line 63) | def __init__(
    method _norm (line 91) | def _norm(self, x, var):
    method forward (line 97) | def forward(self, x: torch.Tensor) -> torch.Tensor:
    method _gather_var (line 115) | def _gather_var(self, input_, max_dim):
    method sharded_state_dict (line 150) | def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}):
  function get_mlp_module_spec (line 163) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
  class InternViTSelfAttention (line 175) | class InternViTSelfAttention(SelfAttention):
    method __init__ (line 177) | def __init__(
  class InternViTTEDotProductAttention (line 214) | class InternViTTEDotProductAttention(TEDotProductAttention):
    method forward (line 218) | def forward(self, *args, **kwargs):
  function get_internvit_layer_spec (line 237) | def get_internvit_layer_spec(use_te) -> ModuleSpec:
  function get_internvit300M_layer_spec (line 263) | def get_internvit300M_layer_spec(use_te) -> ModuleSpec:

FILE: examples/multimodal/nvlm/pp_checkpoint_converter.py
  function split (line 14) | def split(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_l...
  function combine (line 82) | def combine(input_dir, base_output_dir, input_pp, output_pp, num_tp, num...

FILE: examples/multimodal/radio/radio_g.py
  function get_mlp_module_spec (line 54) | def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
  function get_norm_mlp_module_spec_te (line 65) | def get_norm_mlp_module_spec_te() -> ModuleSpec:
  function get_radio_g_layer_spec (line 75) | def get_radio_g_layer_spec(normalization) -> ModuleSpec:
  function get_radio_g_layer_spec_te (line 115) | def get_radio_g_layer_spec_te() -> ModuleSpec:

FILE: examples/multimodal/run_text_generation.py
  function is_first_rank (line 46) | def is_first_rank():
  function add_text_generation_args (line 54) | def add_text_generation_args(parser):
  function get_evaluation_dataloader (line 107) | def get_evaluation_dataloader(
  function generate_samples (line 156) | def generate_samples(model, config: EvaluationConfig, print_output):
  function get_evaluation_configs (line 365) | def get_evaluation_configs(config_path=None) -> Dict[str, EvaluationConf...
  function get_output_path (line 424) | def get_output_path(config, dp_rank):
  function generate_and_write_samples (line 439) | def generate_and_write_samples(model, config, print_output=True):
  class VLMForwardStep (line 457) | class VLMForwardStep(ForwardStep):
    method __init__ (line 460) | def __init__(
    method _forward (line 482) | def _forward(self, tokens, position_ids, attention_mask):
    method __call__ (line 493) | def __call__(self, tokens, position_ids, attention_mask):
  function get_conversation (line 536) | def get_conversation(task, question, metadata=None):
  function get_prompt_and_generated (line 648) | def get_prompt_and_generated(prompt_and_generation, prompt_format):
  function run_eval (line 690) | def run_eval(config, iteration=None):
  function run_evaluation_loop (line 804) | def run_evaluation_loop(model, configs, output_dir_override=None, iterat...
  function eval_tasks (line 843) | def eval_tasks():

FILE: examples/multimodal/train.py
  function get_batch (line 33) | def get_batch(data_iterator, image_token_index, img_seq_len):
  function get_ltor_masks_and_position_ids (line 152) | def get_ltor_masks_and_position_ids(input_ids, target, pad_token):
  function get_mask_start_and_end_idx (line 168) | def get_mask_start_and_end_idx(arr):
  function scaled_loss_func (line 193) | def scaled_loss_func(loss_mask, output_tensor):
  function loss_func (line 241) | def loss_func(loss_mask, output_tensor):
  function forward_step (line 254) | def forward_step(data_iterator, model: LLaVAModel):
  function llava_embedding_ranks (line 300) | def llava_embedding_ranks(pp_ranks):
  function llava_position_embedding_ranks (line 313) | def llava_position_embedding_ranks(pp_ranks):
  function run_online_eval (line 326) | def run_online_eval(model):
  function write_eval_to_tensorboard (line 353) | def write_eval_to_tensorboard(data, iteration, writer, walltime=None):
  function write_online_eval_to_tensorboard (line 363) | def write_online_eval_to_tensorboard(data, iteration, writer, walltime=N...

FILE: examples/post_training/modelopt/convert_model.py
  function add_convert_args (line 39) | def add_convert_args(parser):
  function get_model (line 73) | def get_model(model_provider_func, model_type=ModelType.encoder_or_decod...
  function check_arguments (line 91) | def check_arguments():

FILE: examples/post_training/modelopt/export.py
  function add_modelopt_export_args (line 27) | def add_modelopt_export_args(parser):

FILE: examples/post_training/modelopt/finetune.py
  function add_finetune_args (line 37) | def add_finetune_args(parser):
  function get_eos_id (line 46) | def get_eos_id():
  class OfflineDataset (line 66) | class OfflineDataset(torch.utils.data.Dataset):
    method __init__ (line 67) | def __init__(self, data_dir: str, num_samples):
    method __len__ (line 77) | def __len__(self):
    method __getitem__ (line 80) | def __getitem__(self, idx):
  class SFTDataset (line 86) | class SFTDataset(torch.utils.data.Dataset):
    method _wildcard_get (line 112) | def _wildcard_get(cls, directory: Dict[str, Any], name: str, default_v...
    method __init__ (line 120) | def __init__(
    method __len__ (line 195) | def __len__(self):
    method __getitem__ (line 198) | def __getitem__(self, idx):
    method _process_and_pack_example (line 232) | def _process_and_pack_example(self):
    method _process_example (line 260) | def _process_example(self, example: Dict[str, Any]):
    method _to_conversation (line 305) | def _to_conversation(cls, question, response):
    method _sharegpt_to_openai_conversations (line 311) | def _sharegpt_to_openai_conversations(cls, data):
    method _special_to_openai_conversations (line 330) | def _special_to_openai_conversations(cls, data):
  function train_valid_test_sft_datasets_provider (line 335) | def train_valid_test_sft_datasets_provider(train_val_test_num_samples):
  function get_batch (line 377) | def get_batch(data_iterator):
  function non_loss_data_func (line 444) | def non_loss_data_func(model: GPTModel):
  function forward_step (line 455) | def forward_step(data_iterator, model: GPTModel):

FILE: examples/post_training/modelopt/generate.py
  function add_generate_args (line 28) | def add_generate_args(parser):
  function check_arguments (line 41) | def check_arguments():
  function mtbench_to_oai_chat (line 53) | def mtbench_to_oai_chat(example):
  function get_conversations (line 62) | def get_conversations(example):

FILE: examples/post_training/modelopt/mmlu.py
  function add_mmlu_args (line 32) | def add_mmlu_args(parser):
  function get_all_subjects (line 45) | def get_all_subjects():
  function format_example (line 108) | def format_example(example, include_answer: bool = True):
  function generate_prompt (line 120) | def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_...

FILE: examples/post_training/modelopt/offline_feature_extract.py
  function add_extract_args (line 23) | def add_extract_args(parser):
  function extract_feature (line 32) | def extract_feature(dataset, model, output_dir, idx_start, idx_end):

FILE: examples/post_training/modelopt/prune.py
  function add_prune_args (line 43) | def add_prune_args(parser):
  function check_arguments (line 125) | def check_arguments(args):
  function get_calib_dataloader (line 132) | def get_calib_dataloader(calib_size=1024, max_sequence_length=512):
  function get_params (line 142) | def get_params(model):
  function _custom_prompt_forward_loop_func (line 187) | def _custom_prompt_forward_loop_func(model):
  function _hf_dataset_forword_loop_func (line 202) | def _hf_dataset_forword_loop_func(model):

FILE: examples/post_training/modelopt/quantize.py
  function add_text_generate_ptq_args (line 76) | def add_text_generate_ptq_args(parser):
  function check_arguments (line 145) | def check_arguments():
  function _is_first_layers (line 157) | def _is_first_layers(name: str, num_layers: int = 1, num_layers_to_disab...
  function _is_last_layers (line 167) | def _is_last_layers(name: str, num_layers: int = 1, num_layers_to_disabl...
  function get_first_layers_disabled_config (line 177) | def get_first_layers_disabled_config(config, num_layers: int = 1, num_la...
  function get_last_layers_disabled_config (line 195) | def get_last_layers_disabled_config(config, num_layers: int = 1, num_lay...
  function get_modelopt_torch_quantization_config (line 213) | def get_modelopt_torch_quantization_config():
  function get_calib_dataloader (line 270) | def get_calib_dataloader(
  function _custom_prompt_forward_loop_func (line 382) | def _custom_prompt_forward_loop_func(model):
  function _dataset_forward_loop_func (line 397) | def _dataset_forward_loop_func(model):

FILE: examples/post_training/modelopt/validate.py
  function add_ar_validation_args (line 27) | def add_ar_validation_args(parser):
  function check_arguments (line 59) | def check_arguments():
  function get_current_memory_info (line 71) | def get_current_memory_info():
  function report_current_memory_info (line 82) | def report_current_memory_info():

FILE: examples/rl/benchmark_refit.py
  function add_benchmark_args (line 24) | def add_benchmark_args(parser):
  function model_provider (line 51) | def model_provider(pre_process=True, post_process=True, parallel_output=...
  function create_refit_service (line 67) | def create_refit_service(method):
  function print_config_summary (line 79) | def print_config_summary(args, src_config, dst_config, world_size, mode):
  function run_benchmark (line 94) | def run_benchmark(src_model, dst_model, refit_service, num_warmup, num_i...
  function print_results (line 129) | def print_results(timings):
  function benchmark_collocated (line 145) | def benchmark_collocated():
  function benchmark_non_collocated (line 220) | def benchmark_non_collocated():
  function main (line 316) | def main():

FILE: examples/rl/environments/countdown/countdown.py
  function extract_solution (line 6) | def extract_solution(solution_str: str, remove_prompt: bool = False):
  function validate_equation (line 28) | def validate_equation(equation_str, available_numbers):
  function evaluate_equation (line 44) | def evaluate_equation(equation_str):
  function compute_score (line 59) | def compute_score(solution_str, ground_truth, method='strict', format_sc...

FILE: examples/rl/environments/countdown/countdown_agent.py
  class CountdownAgent (line 12) | class CountdownAgent(RewardOnlyAgent, HFDatasetAgent):
    method make_prefix (line 15) | def make_prefix(self, target, nums) -> str:
    method get_dataset (line 20) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 29) | async def evaluation_prompts(
    method get_prompt (line 38) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 43) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/aime_agent.py
  class AIMEAgent (line 15) | class AIMEAgent(MathAgent):
    method get_dataset (line 18) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 22) | async def evaluation_prompts(
    method get_prompt (line 34) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 44) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/bigmath_agent.py
  class BigMathAgent (line 16) | class BigMathAgent(MathAgent):
    method get_dataset (line 19) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 22) | async def evaluation_prompts(
    method get_prompt (line 31) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 37) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/dapo_agent.py
  class DAPOAgent (line 15) | class DAPOAgent(MathAgent):
    method reformat_datum (line 18) | def reformat_datum(self, datum: dict) -> dict:
    method get_dataset (line 30) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 33) | async def evaluation_prompts(
    method get_prompt (line 43) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 50) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/gsm8k_agent.py
  class GSM8KAgent (line 25) | class GSM8KAgent(MathAgent):
    method __init__ (line 26) | def __init__(self,
    method reformat_datum (line 41) | def reformat_datum(self, datum: dict) -> dict:
    method get_dataset (line 48) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 51) | async def evaluation_prompts(
    method get_prompt (line 60) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 67) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/rl/environments/math/math_agent.py
  class MathAgent (line 23) | class MathAgent(RewardOnlyAgent):
    method __init__ (line 24) | def __init__(self,
    method compute_score (line 49) | def compute_score(self, response: str, golden: dict, golden_key: str =...
    method make_prefix (line 120) | def make_prefix(self, problem_key: str = "problem", **kwargs) -> str:

FILE: examples/rl/environments/math/openmath_agent.py
  class OpenMathInstructAgent (line 16) | class OpenMathInstructAgent(MathAgent):
    method get_dataset (line 19) | def get_dataset(self, validation: bool = False):
    method evaluation_prompts (line 22) | async def evaluation_prompts(
    method get_prompt (line 31) | async def get_prompt(self, validation=False) -> tuple[str, dict]:
    method get_reward (line 37) | async def get_reward(self, response, golden: dict) -> float:

FILE: examples/run_simple_mcore_train_loop.py
  function initialize_distributed (line 32) | def initialize_distributed(
  function model_provider (line 60) | def model_provider() -> GPTModel:
  function get_train_data_iterator (line 85) | def get_train_data_iterator() -> Iterator:
  function forward_step_func (line 123) | def forward_step_func(
  function save_distributed_checkpoint (line 163) | def save_distributed_checkpoint(
  function load_distributed_checkpoint (line 183) | def load_distributed_checkpoint(

FILE: gpt_builders.py
  function gpt_builder (line 28) | def gpt_builder(args, pre_process, post_process, vp_stage=None, config=N...
  function _get_transformer_layer_spec (line 116) | def _get_transformer_layer_spec(use_te, config):

FILE: mamba_builders.py
  function mamba_builder (line 12) | def mamba_builder(args, pre_process, post_process, vp_stage=None, config...

FILE: megatron/core/_rank_utils.py
  function safe_get_rank (line 12) | def safe_get_rank() -> int:
  function log_single_rank (line 31) | def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, *...

FILE: megatron/core/activations.py
  function squared_relu (line 9) | def squared_relu(x: torch.Tensor) -> torch.Tensor:
  function quick_gelu (line 15) | def quick_gelu(x: torch.Tensor) -> torch.Tensor:
  function fast_gelu (line 21) | def fast_gelu(x: torch.Tensor) -> torch.Tensor:

FILE: megatron/core/config.py
  function set_experimental_flag (line 6) | def set_experimental_flag(flag: bool):
  function is_experimental_enabled (line 12) | def is_experimental_enabled():

FILE: megatron/core/config_logger.py
  function get_config_logger_path (line 25) | def get_config_logger_path(config):
  function has_config_logger_enabled (line 30) | def has_config_logger_enabled(config):
  function get_path_count (line 40) | def get_path_count(path):
  function get_path_with_count (line 52) | def get_path_with_count(path):
  class JSONEncoderWithMcoreTypes (line 59) | class JSONEncoderWithMcoreTypes(json.JSONEncoder):
    method default (line 64) | def default(self, o):
  function log_config_to_disk (line 97) | def log_config_to_disk(config, dict_data, prefix='', rank_str=''):

FILE: megatron/core/datasets/bert_dataset.py
  class BERTMaskedWordPieceDatasetConfig (line 17) | class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
    method __post_init__ (line 23) | def __post_init__(self) -> None:
  class BERTMaskedWordPieceDataset (line 30) | class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
    method __init__ (line 44) | def __init__(
    method _key_config_attributes (line 64) | def _key_config_attributes() -> List[str]:
    method __getitem__ (line 74) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
    method _get_token_mask (line 173) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState...

FILE: megatron/core/datasets/blended_dataset.py
  class BlendedDataset (line 24) | class BlendedDataset(torch.utils.data.Dataset):
    method __init__ (line 41) | def __init__(
    method __len__ (line 88) | def __len__(self) -> int:
    method __getitem__ (line 97) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
    method _build_indices (line 110) | def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:

FILE: megatron/core/datasets/blended_megatron_dataset_builder.py
  class BlendedMegatronDatasetBuilder (line 29) | class BlendedMegatronDatasetBuilder(object):
    method __init__ (line 45) | def __init__(
    method build (line 77) | def build(self) -> List[Optional[TopLevelDataset]]:
    method _build_blended_dataset_splits (line 136) | def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDatas...
    method _build_megatron_datasets_parallel (line 331) | def _build_megatron_datasets_parallel(
    method _build_megatron_dataset_splits (line 416) | def _build_megatron_dataset_splits(
    method build_generic_dataset (line 491) | def build_generic_dataset(
  function _get_size_per_split_per_dataset (line 553) | def _get_size_per_split_per_dataset(

FILE: megatron/core/datasets/blended_megatron_dataset_config.py
  class BlendedMegatronDatasetConfig (line 16) | class BlendedMegatronDatasetConfig:
    method __post_init__ (line 99) | def __post_init__(self) -> None:
  function parse_and_normalize_split (line 155) | def parse_and_normalize_split(split: str) -> List[float]:
  function convert_split_vector_to_split_matrix (line 175) | def convert_split_vector_to_split_matrix(

FILE: megatron/core/datasets/data_schedule.py
  class HybridCPDataLoaderWrapper (line 12) | class HybridCPDataLoaderWrapper:
    method __init__ (line 28) | def __init__(
    method __iter__ (line 51) | def __iter__(self):
    method get_global_seqlens (line 55) | def get_global_seqlens(self, subsample_seqlens: torch.Tensor) -> List[...
    method get_global_id_seqlens (line 105) | def get_global_id_seqlens(self, num_local_subsamples, offsets, seqlens...
    method _gid_to_src_rank (line 126) | def _gid_to_src_rank(self, gid: int, offsets: List[int]) -> int:
    method reroute_samples_to_hdp_ranks (line 136) | def reroute_samples_to_hdp_ranks(
    method unpack_batch (line 245) | def unpack_batch(self, batch):
    method __next__ (line 267) | def __next__(self) -> Any:

FILE: megatron/core/datasets/gpt_dataset.py
  class GPTDatasetConfig (line 25) | class GPTDatasetConfig(BlendedMegatronDatasetConfig):
    method __post_init__ (line 79) | def __post_init__(self) -> None:
  class GPTDataset (line 101) | class GPTDataset(MegatronDataset):
    method __init__ (line 119) | def __init__(
    method numel_low_level_dataset (line 148) | def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
    method build_low_level_dataset (line 163) | def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfi...
    method __len__ (line 196) | def __len__(self) -> int:
    method __getitem__ (line 225) | def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]:
    method _query_document_sample_shuffle_indices (line 298) | def _query_document_sample_shuffle_indices(
    method _build_document_sample_shuffle_indices (line 381) | def _build_document_sample_shuffle_indices(
    method _get_num_tokens_per_epoch (line 609) | def _get_num_tokens_per_epoch(self) -> int:
    method _get_num_epochs (line 617) | def _get_num_epochs(self, num_tokens_per_epoch: int) -> int:
  function _build_document_index (line 640) | def _build_document_index(
  function _build_shuffle_index (line 674) | def _build_shuffle_index(
  function _get_ltor_masks_and_position_ids (line 706) | def _get_ltor_masks_and_position_ids(
  class MockGPTLowLevelDataset (line 783) | class MockGPTLowLevelDataset:
    method __init__ (line 803) | def __init__(self, tokenizer: MegatronTokenizerBase) -> None:
    method __len__ (line 811) | def __len__(self) -> int:
    method __getitem__ (line 814) | def __getitem__(self, idx: int) -> numpy.number:
    method get (line 821) | def get(self, idx: int, offset: int = 0, length: Optional[int] = None)...
  class MockGPTDataset (line 839) | class MockGPTDataset(GPTDataset):
    method __init__ (line 857) | def __init__(
    method numel_low_level_dataset (line 878) | def numel_low_level_dataset(low_level_dataset: MockGPTLowLevelDataset)...
    method build_low_level_dataset (line 890) | def build_low_level_dataset(  # type: ignore[override]

FILE: megatron/core/datasets/helpers.cpp
  function build_exhaustive_blending_indices (line 22) | void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_ind...
  function build_blending_indices (line 77) | void build_blending_indices(py::array_t<int16_t> &dataset_index,
  function build_sample_idx (line 145) | py::array_t<T> build_sample_idx(
  function get_target_sample_len (line 251) | inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
  function build_mapping_impl (line 269) | py::array build_mapping_impl(const py::array_t<int64_t> &docs_,
  function build_mapping (line 529) | py::array build_mapping(const py::array_t<int64_t> &docs_,
  function build_blocks_mapping_impl (line 567) | py::array build_blocks_mapping_impl(const py::array_t<int64_t> &docs_,
  function build_blocks_mapping (line 808) | py::array build_blocks_mapping(const py::array_t<int64_t> &docs_,
  function PYBIND11_MODULE (line 841) | PYBIND11_MODULE(helpers_cpp, m)

FILE: megatron/core/datasets/helpers.py
  function build_sample_idx (line 12) | def build_sample_idx(

FILE: megatron/core/datasets/indexed_dataset.py
  class DType (line 50) | class DType(Enum):
    method code_from_dtype (line 63) | def code_from_dtype(cls, value: Type[numpy.number]) -> int:
    method dtype_from_code (line 75) | def dtype_from_code(cls, value: int) -> Type[numpy.number]:
    method size (line 87) | def size(key: Union[int, Type[numpy.number]]) -> int:
    method optimal_dtype (line 107) | def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]:
  class _IndexWriter (line 122) | class _IndexWriter(object):
    method __init__ (line 131) | def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None:
    method __enter__ (line 135) | def __enter__(self) -> "_IndexWriter":
    method __exit__ (line 154) | def __exit__(
    method write (line 175) | def write(
    method _sequence_pointers (line 213) | def _sequence_pointers(
  class _IndexReader (line 233) | class _IndexReader(object):
    method __init__ (line 246) | def __init__(
    method __del__ (line 336) | def __del__(self) -> None:
    method __len__ (line 342) | def __len__(self) -> int:
    method __getitem__ (line 351) | def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Opt...
  class _BinReader (line 368) | class _BinReader(ABC):
    method read (line 372) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
  class _MMapBinReader (line 389) | class _MMapBinReader(_BinReader):
    method __init__ (line 396) | def __init__(self, bin_path: str) -> None:
    method read (line 405) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
    method __del__ (line 421) | def __del__(self) -> None:
  class _FileBinReader (line 431) | class _FileBinReader(_BinReader):
    method __init__ (line 438) | def __init__(
    method read (line 447) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
  class _S3BinReader (line 500) | class _S3BinReader(_BinReader):
    method __init__ (line 513) | def __init__(self, bin_path: str, object_storage_config: ObjectStorage...
    method _extract_from_cache (line 523) | def _extract_from_cache(self, offset: int, size: int) -> bytes:
    method read (line 532) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
    method __del__ (line 581) | def __del__(self) -> None:
  class _MultiStorageClientBinReader (line 586) | class _MultiStorageClientBinReader(_BinReader):
    method __init__ (line 595) | def __init__(self, bin_path: str, object_storage_config: ObjectStorage...
    method read (line 599) | def read(self, dtype: Type[numpy.number], count: int, offset: int) -> ...
  class IndexedDataset (line 611) | class IndexedDataset(torch.utils.data.Dataset):
    method __init__ (line 634) | def __init__(
    method initialize (line 678) | def initialize(
    method __getstate__ (line 736) | def __getstate__(self) -> Tuple[str, bool, bool, Optional[ObjectStorag...
    method __setstate__ (line 752) | def __setstate__(self, state: Tuple[str, bool, bool, Optional[ObjectSt...
    method __del__ (line 777) | def __del__(self) -> None:
    method __len__ (line 782) | def __len__(self) -> int:
    method __getitem__ (line 790) | def __getitem__(
    method get (line 843) | def get(
    method sequence_lengths (line 872) | def sequence_lengths(self) -> numpy.ndarray:
    method document_indices (line 881) | def document_indices(self) -> numpy.ndarray:
    method get_document_indices (line 889) | def get_document_indices(self) -> numpy.ndarray:
    method set_document_indices (line 899) | def set_document_indices(self, document_indices: numpy.ndarray) -> None:
    method sequence_modes (line 910) | def sequence_modes(self) -> numpy.ndarray:
    method exists (line 920) | def exists(path_prefix: str) -> bool:
  class IndexedDatasetBuilder (line 937) | class IndexedDatasetBuilder(object):
    method __init__ (line 948) | def __init__(
    method add_item (line 965) | def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None:
    method add_document (line 979) | def add_document(
    method end_document (line 999) | def end_document(self) -> None:
    method add_index (line 1003) | def add_index(self, path_prefix: str) -> None:
    method finalize (line 1029) | def finalize(self, idx_path: str) -> None:
  function get_idx_path (line 1040) | def get_idx_path(path_prefix: str) -> str:
  function get_bin_path (line 1052) | def get_bin_path(path_prefix: str) -> str:

FILE: megatron/core/datasets/masked_dataset.py
  class MaskedWordPieceDatasetConfig (line 23) | class MaskedWordPieceDatasetConfig(BlendedMegatronDatasetConfig):
    method __post_init__ (line 49) | def __post_init__(self) -> None:
  class MaskedWordPieceDataset (line 76) | class MaskedWordPieceDataset(MegatronDataset):
    method __init__ (line 102) | def __init__(
    method numel_low_level_dataset (line 116) | def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
    method build_low_level_dataset (line 128) | def build_low_level_dataset(
    method _key_config_attributes (line 144) | def _key_config_attributes() -> List[str]:
    method __len__ (line 160) | def __len__(self) -> int:
    method _build_sample_index (line 163) | def _build_sample_index(
    method _create_masked_lm_predictions (line 247) | def _create_masked_lm_predictions(
    method _get_token_mask (line 440) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState...

FILE: megatron/core/datasets/megatron_dataset.py
  class MegatronDataset (line 23) | class MegatronDataset(ABC, torch.utils.data.Dataset):
    method __init__ (line 41) | def __init__(
    method numel_low_level_dataset (line 117) | def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int:
    method build_low_level_dataset (line 134) | def build_low_level_dataset(
    method _key_config_attributes (line 155) | def _key_config_attributes() -> List[str]:
    method __len__ (line 167) | def __len__(self) -> int:
    method __getitem__ (line 176) | def __getitem__(self, idx: int) -> Dict[str, Union[torch.Tensor, numpy...

FILE: megatron/core/datasets/multimodal_dataset.py
  class MultimodalDatasetConfig (line 12) | class MultimodalDatasetConfig(GPTDatasetConfig):
    method __post_init__ (line 28) | def __post_init__(self) -> None:
  class MockMultimodalDataset (line 35) | class MockMultimodalDataset(MockGPTDataset):
    method __getitem__ (line 42) | def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:

FILE: megatron/core/datasets/object_storage_utils.py
  class ObjectStorageConfig (line 21) | class ObjectStorageConfig:
  class S3Client (line 46) | class S3Client(Protocol):
    method download_file (line 49) | def download_file(self, Bucket: str, Key: str, Filename: str) -> None:
    method upload_file (line 53) | def upload_file(self, Filename: str, Bucket: str, Key: str) -> None:
    method head_object (line 57) | def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]:
    method get_object (line 61) | def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, A...
    method close (line 65) | def close(self) -> None:
  function _remove_s3_prefix (line 70) | def _remove_s3_prefix(path: str) -> str:
  function _is_s3_path (line 82) | def _is_s3_path(path: str) -> bool:
  function _remove_msc_prefix (line 94) | def _remove_msc_prefix(path: str) -> str:
  function _is_msc_path (line 107) | def _is_msc_path(path: str) -> bool:
  function _s3_download_file (line 119) | def _s3_download_file(client: S3Client, s3_path: str, local_path: str) -...
  function _s3_object_exists (line 135) | def _s3_object_exists(client: S3Client, path: str) -> bool:
  function is_object_storage_path (line 158) | def is_object_storage_path(path: str) -> bool:
  function get_index_cache_path (line 170) | def get_index_cache_path(idx_path: str, object_storage_config: ObjectSto...
  function parse_s3_path (line 195) | def parse_s3_path(path: str) -> Tuple[str, str]:
  function get_object_storage_access (line 215) | def get_object_storage_access(path: str) -> str:
  function dataset_exists (line 220) | def dataset_exists(path_prefix: str, idx_path: str, bin_path: str) -> bool:
  function cache_index_file (line 243) | def cache_index_file(remote_path: str, local_path: str) -> None:

FILE: megatron/core/datasets/t5_dataset.py
  class T5MaskedWordPieceDatasetConfig (line 22) | class T5MaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
    method __post_init__ (line 36) | def __post_init__(self) -> None:
  class T5MaskedWordPieceDataset (line 48) | class T5MaskedWordPieceDataset(MaskedWordPieceDataset):
    method __init__ (line 67) | def __init__(
    method _key_config_attributes (line 85) | def _key_config_attributes() -> List[str]:
    method _build_b1ss_attention_mask (line 96) | def _build_b1ss_attention_mask(
    method config_attention_mask (line 128) | def config_attention_mask(
    method __getitem__ (line 225) | def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
    method _get_token_mask (line 329) | def _get_token_mask(self, numpy_random_state: numpy.random.RandomState...

FILE: megatron/core/datasets/utils.py
  class Split (line 14) | class Split(Enum):
  function compile_helpers (line 20) | def compile_helpers():
  function normalize (line 33) | def normalize(weights: List[float]) -> List[float]:
  function get_blend_from_list (line 49) | def get_blend_from_list(

FILE: megatron/core/dist_checkpointing/core.py
  class CheckpointingException (line 15) | class CheckpointingException(Exception):
  class CheckpointingConfig (line 22) | class CheckpointingConfig:
  function check_is_distributed_checkpoint (line 38) | def check_is_distributed_checkpoint(checkpoint_dir):
  function maybe_load_config (line 50) | def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConf...
  function save_config (line 76) | def save_config(config: CheckpointingConfig, checkpoint_dir: str):

FILE: megatron/core/dist_checkpointing/dict_utils.py
  function extract_matching_values (line 18) | def extract_matching_values(
  function diff (line 69) | def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
  function inspect_types (line 138) | def inspect_types(x: Any, prefix: Tuple = (), indent: int = 4):
  function nested_values (line 166) | def nested_values(x: Union[dict, list]):
  function nested_items_iter (line 176) | def nested_items_iter(x: Union[dict, list]):
  function dict_map (line 186) | def dict_map(f: Callable, d: dict):
  function dict_map_with_key (line 192) | def dict_map_with_key(f: Callable, d: dict):
  function dict_list_map_inplace (line 198) | def dict_list_map_inplace(f: Callable[[U], V], x: Union[Dict, List, U]):
  function dict_list_map_outplace (line 210) | def dict_list_map_outplace(f: Callable[[U], V], x: Union[Dict, List, U])...
  function merge (line 220) | def merge(x1: Union[dict, list], x2: Union[dict, list], key: Tuple[Union...
  function map_reduce (line 244) | def map_reduce(

FILE: megatron/core/dist_checkpointing/exchange_utils.py
  function is_float8tensor (line 32) | def is_float8tensor(tensor: torch.Tensor) -> bool:
  class ShardDistribution (line 40) | class ShardDistribution(NamedTuple):
  function _shard_size (line 63) | def _shard_size(sh_ten: ShardedTensor):
  function _get_empty_tensor_for_exchange (line 69) | def _get_empty_tensor_for_exchange(
  function distribute_shards_to_ranks (line 118) | def distribute_shards_to_ranks(
  function determine_main_replica_uniform_distribution (line 174) | def determine_main_replica_uniform_distribution(
  function exchange_loaded_tensors_gather_rounds (line 257) | def exchange_loaded_tensors_gather_rounds(
  function exchange_loaded_tensors_gather_object (line 375) | def exchange_loaded_tensors_gather_object(
  function exchange_loaded_objects_gather_object (line 421) | def exchange_loaded_objects_gather_object(
  function exchange_loaded_tensors_broadcast (line 454) | def exchange_loaded_tensors_broadcast(
  function exchange_by_distribution (line 538) | def exchange_by_distribution(

FILE: megatron/core/dist_checkpointing/mapping.py
  class ShardedBase (line 34) | class ShardedBase(ABC):
    method validate_metadata_integrity (line 42) | def validate_metadata_integrity(self):
    method without_data (line 46) | def without_data(self) -> "ShardedBase":
  class ShardedTensor (line 52) | class ShardedTensor(ShardedBase):
    method __post_init__ (line 93) | def __post_init__(self):
    method validate_metadata_integrity (line 96) | def validate_metadata_integrity(self) -> None:
    method has_regular_grid (line 137) | def has_regular_grid(self):
    method global_slice (line 141) | def global_slice(self) -> Tuple[Union[int, slice], ...]:
    method local_chunk_offset_in_global (line 159) | def local_chunk_offset_in_global(self) -> Tuple[int, ...]:
    method max_allowed_chunks (line 172) | def max_allowed_chunks(self) -> Tuple[int, ...]:
    method without_data (line 186) | def without_data(self):
    method from_rank_offsets (line 190) | def from_rank_offsets(
    method init_data (line 247) | def init_data(self, device: Union[str, torch.device], init_fn=torch.em...
    method narrow (line 262) | def narrow(self, dim: int, start: int, length: int) -> List["ShardedTe...
  function is_main_replica (line 322) | def is_main_replica(replica_id: ReplicaId):
  class LocalNonpersistentObject (line 342) | class LocalNonpersistentObject:
    method __init__ (line 351) | def __init__(self, obj):
    method unwrap (line 354) | def unwrap(self):
  class ShardedObject (line 360) | class ShardedObject(ShardedBase):
    method __post_init__ (line 384) | def __post_init__(self):
    method validate_metadata_integrity (line 387) | def validate_metadata_integrity(self):
    method without_data (line 393) | def without_data(self):
    method unique_key (line 397) | def unique_key(self):
    method __str__ (line 405) | def __str__(self):
    method empty_from_unique_key (line 409) | def empty_from_unique_key(cls, unique_key, replica_id: ReplicaId = 0) ...
  class ShardedTensorFactory (line 438) | class ShardedTensorFactory(ShardedBase):
    method build (line 471) | def build(self):
    method validate_metadata_integrity (line 475) | def validate_metadata_integrity(self):
    method without_data (line 479) | def without_data(self):
  function apply_factories (line 483) | def apply_factories(sharded_state_dict: ShardedStateDict):
  function apply_factory_merges (line 502) | def apply_factory_merges(

FILE: megatron/core/dist_checkpointing/optimizer.py
  function get_optim_param_to_id_map (line 35) | def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Param...
  function get_param_id_to_sharded_param_map (line 45) | def get_param_id_to_sharded_param_map(
  function make_sharded_optimizer_tensor (line 83) | def make_sharded_optimizer_tensor(
  function optim_state_to_sharding_state (line 111) | def optim_state_to_sharding_state(

FILE: megatron/core/dist_checkpointing/serialization.py
  function load (line 61) | def load(
  function load_common_state_dict (line 174) | def load_common_state_dict(checkpoint_dir: Union[str, Path]) -> StateDict:
  function load_tensors_metadata (line 196) | def load_tensors_metadata(
  function load_sharded_metadata (line 227) | def load_sharded_metadata(
  function load_plain_tensors (line 270) | def load_plain_tensors(checkpoint_dir: str) -> StateDict:
  function load_content_metadata (line 287) | def load_content_metadata(
  function remove_sharded_tensors (line 308) | def remove_sharded_tensors(checkpoint_dir: str, key_prefix: str):
  function save (line 314) | def save(
  function get_default_save_sharded_strategy (line 442) | def get_default_save_sharded_strategy(
  function get_default_save_common_strategy (line 449) | def get_default_save_common_strategy(
  function get_default_load_sharded_strategy (line 456) | def get_default_load_sharded_strategy(

FILE: megatron/core/dist_checkpointing/state_dict_utils.py
  function save_preprocess (line 20) | def save_preprocess(
  function load_preprocess (line 62) | def load_preprocess(sharded_state_dict: ShardedStateDict):
  function filter_out_empty_flatten_tensor (line 96) | def filter_out_empty_flatten_tensor(sharded_state_dict: Union[dict, list]):

FILE: megatron/core/dist_checkpointing/strategies/async_utils.py
  function _set_process_qos (line 28) | def _set_process_qos(cpu_priority: int, io_priority: Optional[int]) -> N...
  function _disable_gc (line 85) | def _disable_gc():
  class AsyncRequest (line 97) | class AsyncRequest(NamedTuple):
    method add_finalize_fn (line 123) | def add_finalize_fn(self, fn: Callable) -> None:
    method execute_sync (line 137) | def execute_sync(self) -> None:
    method freeze (line 163) | def freeze(self) -> 'AsyncRequest':
  class AsyncCaller (line 173) | class AsyncCaller(ABC):
    method schedule_async_call (line 180) | def schedule_async_call(self, async_req: AsyncRequest) -> None:
    method is_current_async_call_done (line 193) | def is_current_async_call_done(self, blocking: bool, no_dist: bool) ->...
    method sync_all_async_calls (line 213) | def sync_all_async_calls(self, is_alive: int) -> bool:
    method close (line 228) | def close(self, abort=False):
    method __del__ (line 232) | def __del__(self):
  class TemporalAsyncCaller (line 236) | class TemporalAsyncCaller(AsyncCaller):
    method __init__ (line 242) | def __init__(self):
    method schedule_async_call (line 247) | def schedule_async_call(self, async_req: AsyncRequest) -> None:
    method is_current_async_call_done (line 283) | def is_current_async_call_done(self, blocking: bool = False, no_dist: ...
    method close (line 315) | def close(self, abort=False):
    method __del__ (line 343) | def __del__(self):
  class PersistentAsyncCaller (line 347) | class PersistentAsyncCaller(AsyncCaller):
    method __init__ (line 358) | def __init__(self):
    method _get_process (line 365) | def _get_process(
    method schedule_async_call (line 395) | def schedule_async_call(self, async_req: AsyncRequest) -> None:
    method is_current_async_call_done (line 435) | def is_current_async_call_done(self, blocking: bool = False, no_dist: ...
    method close (line 488) | def close(self, abort=False):
    method __del__ (line 517) | def __del__(self):
    method async_loop (line 522) | def async_loop(
  class _ActiveAsyncRequest (line 599) | class _ActiveAsyncRequest(NamedTuple):
  class AsyncCallsQueue (line 614) | class AsyncCallsQueue:
    method __init__ (line 623) | def __init__(self, persistent: bool = False):
    method _get_async_caller (line 628) | def _get_async_caller(self):
    method warmup_persistent_caller (line 636) | def warmup_persistent_caller(
    method schedule_async_request (line 646) | def schedule_async_request(self, async_request: AsyncRequest) -> int:
    method maybe_finalize_async_calls (line 670) | def maybe_finalize_async_calls(self, blocking=False, no_dist=False) ->...
    method get_num_unfinalized_calls (line 707) | def get_num_unfinalized_calls(self):
    method close (line 711) | def close(self, abort=False):

FILE: megatron/core/dist_checkpointing/strategies/base.py
  class StrategyAction (line 15) | class StrategyAction(Enum):
  function get_default_strategy (line 29) | def get_default_strategy(action: StrategyAction, backend: str, version: ...
  function register_default_strategy (line 50) | def register_default_strategy(
  class LoadStrategyBase (line 67) | class LoadStrategyBase(ABC):
    method check_backend_compatibility (line 72) | def check_backend_compatibility(self, loaded_backend):
    method check_version_compatibility (line 77) | def check_version_compatibility(self, loaded_version):
    method can_handle_sharded_objects (line 82) | def can_handle_sharded_objects(self):
  class SaveStrategyBase (line 87) | class SaveStrategyBase(ABC):
    method __init__ (line 91) | def __init__(self, backend: str, version: int):
    method can_handle_sharded_objects (line 96) | def can_handle_sharded_objects(self):
    method __str__ (line 100) | def __str__(self):
  class LoadCommonStrategy (line 104) | class LoadCommonStrategy(LoadStrategyBase):
    method load_common (line 108) | def load_common(self, checkpoint_dir: Union[str, Path]):
    method load_sharded_objects (line 113) | def load_sharded_objects(
    method load_sharded_metadata (line 119) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]) -> S...
  class LoadShardedStrategy (line 126) | class LoadShardedStrategy(LoadStrategyBase):
    method load (line 130) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U...
    method load_tensors_metadata (line 135) | def load_tensors_metadata(self, checkpoint_dir: Union[str, Path]):
    method load_sharded_metadata (line 149) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]):
    method remove_sharded_tensors (line 164) | def remove_sharded_tensors(self, checkpoint_dir: Union[str, Path], key...
  class SaveCommonStrategy (line 169) | class SaveCommonStrategy(SaveStrategyBase):
    method save_common (line 173) | def save_common(self, common_state_dict: StateDict, checkpoint_dir: Un...
    method save_sharded_objects (line 177) | def save_sharded_objects(
  class SaveShardedStrategy (line 184) | class SaveShardedStrategy(SaveStrategyBase):
    method save (line 188) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U...
  class AsyncSaveShardedStrategy (line 193) | class AsyncSaveShardedStrategy(SaveShardedStrategy):
    method async_save (line 197) | def async_save(
    method save (line 212) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: U...

FILE: megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py
  class CachedMetadataFileSystemReader (line 11) | class CachedMetadataFileSystemReader(FileSystemReader):
    method __init__ (line 24) | def __init__(self, path: Union[str, os.PathLike], cache_metadata: bool...
    method read_metadata (line 34) | def read_metadata(self) -> Metadata:
    method clear_metadata_cache (line 49) | def clear_metadata_cache(cls):

FILE: megatron/core/dist_checkpointing/strategies/checkpointable.py
  class CheckpointableShardedTensor (line 15) | class CheckpointableShardedTensor(torch.Tensor):
    method __new__ (line 21) | def __new__(cls, data: torch.Tensor, sh_ten: ShardedTensor):
    method __init__ (line 24) | def __init__(self, data: torch.Tensor, sh_ten: ShardedTensor):
    method __create_write_items__ (line 28) | def __create_write_items__(
    method __create_chunk_list__ (line 59) | def __create_chunk_list__(self) -> list[ChunkStorageMetadata]:
    method __get_tensor_shard__ (line 71) | def __get_tensor_shard__(self, index: MetadataIndex) -> torch.Tensor:
    method from_sh_ten (line 83) | def from_sh_ten(cls, sh_ten: ShardedTensor) -> 'CheckpointableShardedT...
    method __torch_dispatch__ (line 96) | def __torch_dispatch__(cls, func, types, args, kwargs=None):
    method __repr__ (line 103) | def __repr__(self):
  class LocalShardsContainer (line 107) | class LocalShardsContainer(torch.Tensor):
    method __new__ (line 117) | def __new__(cls, local_shards: list[torch.Tensor]) -> "LocalShardsCont...
    method __init__ (line 122) | def __init__(self, local_shards: list[torch.Tensor]):
    method __torch_dispatch__ (line 129) | def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
    method __create_write_items__ (line 136) | def __create_write_items__(
    method __create_chunk_list__ (line 155) | def __create_chunk_list__(self) -> list[ChunkStorageMetadata]:
    method __get_tensor_shard__ (line 165) | def __get_tensor_shard__(self, index: MetadataIndex) -> torch.Tensor:
    method __repr__ (line 195) | def __repr__(self):

FILE: megatron/core/dist_checkpointing/strategies/common.py
  function register_default_common_strategies (line 29) | def register_default_common_strategies():
  class TorchCommonSaveStrategy (line 37) | class TorchCommonSaveStrategy(SaveCommonStrategy):
    method save_common (line 40) | def save_common(self, common_state_dict: StateDict, checkpoint_dir: Un...
    method save_sharded_objects (line 50) | def save_sharded_objects(
    method can_handle_sharded_objects (line 66) | def can_handle_sharded_objects(self):
  class TorchCommonLoadStrategy (line 71) | class TorchCommonLoadStrategy(LoadCommonStrategy):
    method load_common (line 74) | def load_common(self, checkpoint_dir: Union[str, Path]):
    method load_sharded_objects (line 100) | def load_sharded_objects(
    method load_sharded_metadata (line 153) | def load_sharded_metadata(self, checkpoint_dir: Union[str, Path]) -> S...
    method can_handle_sharded_objects (line 185) | def can_handle_sharded_objects(self):
    method check_backend_compatibility (line 189) | def check_backend_compatibility(self, loaded_version):
    method check_version_compatibility (line 192) | def check_version_compatibility(self, loaded_version):

FILE: megatron/core/dist_checkpointing/strategies/filesystem_async.py
  function get_write_results_queue (line 53) | def get_write_results_queue(mp_mode: str = 'spawn') -> mp.Queue:
  class FileSystemWriterAsync (line 69) | class FileSystemWriterAsync(FileSystemWriter):
    method __init__ (line 90) | def __init__(
    method prepare_write_data (line 114) | def prepare_write_data(self, plan: SavePlan, planner: SavePlanner) -> ...
    method get_save_function_and_args (line 201) | def get_save_function_and_args(self) -> Tuple[Optional[Callable], Opti...
    method preload_tensors (line 222) | def preload_tensors(write_buckets: List[WriteBucket], non_blocking=Tru...
    method write_preloaded_data_multithread (line 248) | def write_preloaded_data_multithread(
    method write_preloaded_data (line 359) | def write_preloaded_data(
    method write_data (line 438) | def write_data(self, plan: SavePlan, planner: SavePlanner) -> Future[L...
    method retrieve_write_results (line 442) | def retrieve_write_results(self) -> Union[List[WriteResult], WRAPPED_E...
    method prepare_decentralized_global_plan (line 478) | def prepare_decentralized_global_plan(self, local_plan: SavePlan) -> S...
    method finish (line 493) | def finish(self, metadata: Metadata, results: List[List[WriteResult]])...
    method prepare_local_plan (line 518) | def prepare_local_plan(self, plan: SavePlan) -> SavePlan:
    method checkpoint_id (line 532) | def checkpoint_id(self) -> Union[str, os.PathLike]:
    method validate_checkpoint_id (line 539) | def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]...
  function _split_by_size_and_type (line 554) | def _split_by_size_and_type(bins: int, items: List[WriteItem]) -> List[L...
  function _split_by_separation_hint (line 600) | def _split_by_separation_hint(
  function _item_size (line 631) | def _item_size(item: WriteItem) -> int:
  function _process_memory (line 653) | def _process_memory() -> int:

FILE: megatron/core/dist_checkpointing/strategies/fully_parallel.py
  class FullyParallelSaveStrategyWrapper (line 48) | class FullyParallelSaveStrategyWrapper(AsyncSaveShardedStrategy):
    method __init__ (line 73) | def __init__(
    method async_save (line 88) | def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_...
    method save (line 96) | def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P...
    method apply_saving_parallelization (line 100) | def apply_saving_parallelization(self, sharded_state_dict: ShardedStat...
    method can_handle_sharded_objects (line 137) | def can_handle_sharded_objects(self):
  class FullyParallelLoadStrategyWrapper (line 141) | class FullyParallelLoadStrategyWrapper(LoadShardedStrategy):
    method __init__ (line 167) | def __init__(
    method load (line 188) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P...
    method _defer_loading_sharded_objects (line 293) | def _defer_loading_sharded_objects(
    method _defer_loading_sharded_tensors (line 304) | def _defer_loading_sharded_tensors(
    method fill_in_deferred_sharded_objects (line 317) | def fill_in_deferred_sharded_objects(
    method fill_in_deferred_sharded_tensors (line 336) | def fill_in_deferred_sharded_tensors(
    method apply_loading_parallelization (line 354) | def apply_loading_parallelization(
    method can_handle_sharded_objects (line 392) | def can_handle_sharded_objects(self):
    method load_tensors_metadata (line 395) | def load_tensors_metadata(self, checkpoint_dir: Path):
    method load_sharded_metadata (line 398) | def load_sharded_metadata(self, checkpoint_dir: Path):
    method check_backend_compatibility (line 401) | def check_backend_compatibility(self, loaded_version):
    method check_version_compatibility (line 404) | def check_version_compatibility(self, loaded_version):
  function distribute_main_replicas_with_precomputed_distribution (line 408) | def distribute_main_replicas_with_precomputed_distribution(
  function _defer_loading_sharded_items (line 465) | def _defer_loading_sharded_items(
  function _fill_in_deferred_sharded_items (line 502) | def _fill_in_deferred_sharded_items(

FILE: megatron/core/dist_checkpointing/strategies/state_dict_saver.py
  function _compare_dataclasses (line 27) | def _compare_dataclasses(obj1, obj2):
  function save_state_dict_async_plan (line 41) | def save_state_dict_async_plan(
  function verify_global_md_reuse (line 171) | def verify_global_md_reuse(
  function save_state_dict_async_finalize (line 213) | def save_state_dict_async_finalize(

FILE: megatron/core/dist_checkpointing/strategies/torch.py
  class MCoreMetadata (line 87) | class MCoreMetadata:
  class MCoreSavePlan (line 94) | class MCoreSavePlan:
  function register_default_torch_strategies (line 100) | def register_default_torch_strategies():
  function flatten_state_dict (line 113) | def flatten_state_dict(
  function sharded_tensor_to_torch_sharded_tensor (line 141) | def sharded_tensor_to_torch_sharded_tensor(
  function mcore_to_pyt_state_dict (line 248) | def mcore_to_pyt_state_dict(
  function _unwrap_pyt_sharded_tensor (line 338) | def _unwrap_pyt_sharded_tensor(
  function _replace_state_dict_keys_with_sharded_keys (line 363) | def _replace_state_dict_keys_with_sharded_keys(
  function _replace_sharded_keys_with_state_dict_keys (line 380) | def _replace_sharded_keys_with_state_dict_keys(
  function _restore_dict_types (line 395) | def _restore_dict_types(x: Union[dict, list, Any], keys_template: Union[...
  class MCoreSavePlanner (line 410) | class MCoreSavePlanner(DefaultSavePlanner):
    method __init__ (line 421) | def __init__(
    method create_local_plan (line 442) | def create_local_plan(self) -> SavePlan:
    method create_decentralized_global_plan (line 462) | def create_decentralized_global_plan(self, local_plan: SavePlan) -> Sa...
    method transform_object (line 479) | def transform_object(self, write_item: WriteItem, object: Any):
  class MCoreLoadPlanner (line 484) | class MCoreLoadPlanner(DefaultLoadPlanner):
    method __init__ (line 491) | def __init__(
    method _validate_global_shapes (line 503) | def _validate_global_shapes(self, metadata, sharded_tensors):
    method _temporarily_bypass_shape_validation (line 521) | def _temporarily_bypass_shape_validation(self):
    method create_local_plan (line 545) | def create_local_plan(self) -> LoadPlan:
    method resolve_tensor (line 554) | def resolve_tensor(self, read_item: ReadItem):
    method commit_tensor (line 576) | def commit_tensor(self, read_item: ReadItem, tensor: torch.Tensor) -> ...
  class TorchDistSaveShardedStrategy (line 589) | class TorchDistSaveShardedStrategy(AsyncSaveShardedStrategy):
    method __init__ (line 597) | def __init__(
    method async_save (line 643) | def async_save(
    method _get_save_and_finalize_callbacks (line 745) | def _get_save_and_finalize_callbacks(self, writer, save_state_dict_ret...
    method can_handle_sharded_objects (line 754) | def can_handle_sharded_objects(self):
  function _get_filesystem_reader (line 758) | def _get_filesystem_reader(
  class TorchDistLoadShardedStrategy (line 771) | class TorchDistLoadShardedStrategy(LoadShardedStrategy):
    method __init__ (line 774) | def __init__(self, cache_metadata: bool = False):
    method load (line 779) | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: P...
    method load_tensors_metadata (line 835) | def load_tensors_metadata(self, checkpoint_dir: Path, metadata: Metada...
    method load_sharded_metadata (line 853) | def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateD...
    method remove_sharded_tensors (line 868) | def remove_sharded_tensors(self, checkpoint_dir: str, key_prefix: str):
    method can_handle_sharded_objects (line 952) | def can_handle_sharded_objects(self):
    method check_backend_compatibility (line 955) | def check_backend_compatibility(self, loaded_version):
    method check_version_compatibility (line 958) | def check_version_compatibility(self, loaded_version):

FILE: megatron/core/dist_checkpointing/tensor_aware_state_dict.py
  class MCoreTensorAwareStateDict (line 48) | class MCoreTensorAwareStateDict(TensorAwareStateDict):
    method _validate_params (line 61) | def _validate_params(algo):
    method _get_distribution (line 68) | def _get_distribution(
    method _remove_redundant_data (line 86) | def _remove_redundant_data(
    method from_state_dict (line 101) | def from_state_dict(
    method is_hollow (line 155) | def is_hollow(self):
    method _sharded_tensors (line 162) | def _sharded_tensors(self):
    method tensors (line 179) | def tensors(self) -> Iterator[torch.Tensor]:
    method common_state_dict (line 187) | def common_state_dict(self) -> Dict:
    method pop_tensors (line 193) | def pop_tensors(self) -> List[torch.Tensor]:
    method insert_tensors (line 213) | def insert_tensors(self, tensor_data: Iterable[torch.Tensor]):
    method init_tensors (line 230) | def init_tensors(self):
    method copy_tensors_to_cpu (line 245) | def copy_tensors_to_cpu(self, non_blocking=False):
    method restore_tensor_device (line 264) | def restore_tensor_device(self, non_blocking=True):
    method _insert_sharded_data (line 276) | def _insert_sharded_data(
    method to_state_dict (line 325) | def to_state_dict(

FILE: megatron/core/dist_checkpointing/utils.py
  function zip_strict (line 25) | def zip_strict(*args):
  function _sharded_tensor_shard_id (line 37) | def _sharded_tensor_shard_id(sharded_tensor: ShardedTensor) -> _ShardId:
  function _sharded_object_id (line 55) | def _sharded_object_id(sharded_object: ShardedObject) -> _ShardId:
  function extract_sharded_tensors (line 68) | def extract_sharded_tensors(
  function extract_sharded_tensors_and_factories (line 86) | def extract_sharded_tensors_and_factories(
  function extract_sharded_tensors_or_nonpersistent (line 107) | def extract_sharded_tensors_or_nonpersistent(
  function extract_sharded_base (line 129) | def extract_sharded_base(
  function extract_nonpersistent (line 145) | def extract_nonpersistent(
  function add_prefix_for_sharding (line 165) | def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix...
  function replace_prefix_for_sharding (line 184) | def replace_prefix_for_sharding(
  function apply_prefix_mapping (line 210) | def apply_prefix_mapping(sharded_state_dict: ShardedStateDict, prefix_ma...
  function force_all_tensors_to_non_fp8 (line 236) | def force_all_tensors_to_non_fp8(sharded_state_dict: ShardedStateDict):
  function logger_stack (line 255) | def logger_stack(name: Optional[str] = None, current_logger: Optional[lo...
  function debug_time (line 293) | def debug_time(
  function debug_msg (line 318) | def debug_msg(msg: str):
  function _clean_metadata_for_serialization (line 335) | def _clean_metadata_for_serialization(metadata: dict) -> dict:

FILE: megatron/core/dist_checkpointing/validation.py
  class StrictHandling (line 44) | class StrictHandling(Enum):
    method requires_explicit_ckpt_mismatch_check (line 86) | def requires_explicit_ckpt_mismatch_check(val: "StrictHandling") -> bool:
    method requires_global_app_metadata (line 91) | def requires_global_app_metadata(val: "StrictHandling") -> bool:
    method requires_returning_mismatch_keys (line 101) | def requires_returning_mismatch_keys(val: "StrictHandling") -> bool:
  function parse_strict_flag (line 106) | def parse_strict_flag(strict: Union[str, StrictHandling]) -> StrictHandl...
  function validate_integrity_and_strict_load (line 124) | def validate_integrity_and_strict_load(
  function verify_checkpoint_and_load_strategy (line 202) | def verify_checkpoint_and_load_strategy(
  function adjust_non_strict_load (line 268) | def adjust_non_strict_load(
  function _determine_missing_and_unexpected_keys (line 289) | def _determine_missing_and_unexpected_keys(
  function maybe_report_missing_and_unexpected_keys (line 337) | def maybe_report_missing_and_unexpected_keys(
  function _validate_common_state_dict (line 381) | def _validate_common_state_dict(common_state_dict: CommonStateDict) -> N...
  function validate_sharding_integrity (line 415) | def validate_sharding_integrity(
  function _validate_sharding_for_key (line 458) | def _validate_sharding_for_key(
  function _compute_shards_access (line 500) | def _compute_shards_access(rank_sharding):
  function _validate_objects_for_key (line 510) | def _validate_objects_for_key(sharded_objects: List[ShardedObject]) -> L...
  function determine_global_metadata (line 530) | def determine_global_metadata(
  function validate_sharded_objects_handling (line 547) | def validate_sharded_objects_handling(

FILE: megatron/core/distributed/data_parallel_base.py
  class _BaseDataParallel (line 11) | class _BaseDataParallel(MegatronModule):
    method __init__ (line 14) | def __init__(self, config: TransformerConfig, module: torch.nn.Module):
    method forward (line 18) | def forward(self, *inputs, **kwargs):
    method no_sync (line 25) | def no_sync(self):
    method start_grad_sync (line 34) | def start_grad_sync(self, *unused):
    method scale_gradients (line 45) | def scale_gradients(self, scaling_factor: float) -> None:
    method finish_grad_sync (line 49) | def finish_grad_sync(self):
    method zero_grad_buffer (line 60) | def zero_grad_buffer(self):
    method broadcast_params (line 67) | def broadcast_params(self):
    method state_dict (line 73) | def state_dict(self, prefix='', keep_vars=False, destination=None):
    method state_dict_for_save_checkpoint (line 84) | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
    method load_state_dict (line 90) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/core/distributed/distributed_data_parallel.py
  class DistributedDataParallel (line 22) | class DistributedDataParallel(_BaseDataParallel):
    method __init__ (line 41) | def __init__(
    method enable_forward_pre_hook (line 359) | def enable_forward_pre_hook(self):
    method disable_forward_pre_hook (line 371) | def disable_forward_pre_hook(self, param_sync: bool = True):
    method _make_forward_pre_hook (line 388) | def _make_forward_pre_hook(self):
    method _make_backward_post_hook (line 424) | def _make_backward_post_hook(self, param: torch.nn.Parameter):
    method no_sync (line 455) | def no_sync(self):
    method start_param_sync (line 467) | def start_param_sync(self, *unused, force_sync: bool = False, force_di...
    method start_grad_sync (line 525) | def start_grad_sync(self, *unused):
    method finish_grad_sync (line 537) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
    method free_overlap_buffers (line 549) | def free_overlap_buffers(self):
    method scale_gradients (line 554) | def scale_gradients(self, scaling_factor: float):
    method zero_grad_buffer (line 559) | def zero_grad_buffer(self):
    method broadcast_params (line 575) | def broadcast_params(self):
    method offload_grad_buffers (line 592) | def offload_grad_buffers(self, synchronize: bool = True, empty_cache: ...
    method restore_grad_buffers (line 613) | def restore_grad_buffers(self, synchronize: bool = True) -> None:

FILE: megatron/core/distributed/distributed_data_parallel_config.py
  class DistributedDataParallelConfig (line 10) | class DistributedDataParallelConfig:
    method __post_init__ (line 194) | def __post_init__(self):

FILE: megatron/core/distributed/finalize_model_grads.py
  function _get_main_grad_attr (line 34) | def _get_main_grad_attr(param: torch.nn.Parameter):
  function _unshard_if_dtensor (line 40) | def _unshard_if_dtensor(tensor: Union[torch.Tensor, "DTensor"]) -> torch...
  function _reshard_if_dtensor (line 60) | def _reshard_if_dtensor(
  function _allreduce_conditional_embedding_grads (line 89) | def _allreduce_conditional_embedding_grads(
  function _get_shared_word_embedding_weight (line 132) | def _get_shared_word_embedding_weight(
  function _get_position_embedding_weight (line 151) | def _get_position_embedding_weight(model_module: torch.nn.Module) -> tor...
  function _allreduce_word_embedding_grads (line 164) | def _allreduce_word_embedding_grads(
  function _allreduce_embedding_grad (line 204) | def _allreduce_embedding_grad(
  function _allreduce_position_embedding_grads (line 262) | def _allreduce_position_embedding_grads(
  function reset_model_temporary_tensors (line 278) | def reset_model_temporary_tensors(config: TransformerConfig, model: List...
  function _update_router_expert_bias (line 293) | def _update_router_expert_bias(model: List[torch.nn.Module], config: Tra...
  function _allreduce_non_tensor_model_parallel_grads (line 322) | def _allreduce_non_tensor_model_parallel_grads(
  function finalize_model_grads (line 400) | def finalize_model_grads(

FILE: megatron/core/distributed/fsdp/mcore_fsdp_adapter.py
  class FullyShardedDataParallel (line 62) | class FullyShardedDataParallel(_BaseDataParallel):
    method __init__ (line 67) | def __init__(
    method load_state_dict (line 163) | def load_state_dict(self, state_dict, strict=True):
    method _fix_tensor_parallel_attributes (line 185) | def _fix_tensor_parallel_attributes(self, module):
    method _init_dist_index (line 223) | def _init_dist_index(self, pg_collection):
    method stop_communication (line 344) | def stop_communication(self):
    method sync_rng_states_across_tp_group (line 351) | def sync_rng_states_across_tp_group(self):
  function _get_hsdp_tp_mesh (line 366) | def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group, ep_siz...
  function _get_dp_tp_mesh (line 435) | def _get_dp_tp_mesh(dp_cp_group, tp_group, ep_size=1):
  function _check_mesh_ranks_and_group_ranks_are_consistent (line 477) | def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_r...
  function _get_rng_state_dict (line 491) | def _get_rng_state_dict():
  function _load_rng_state_dict (line 502) | def _load_rng_state_dict(rng_state_dict):

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py
  class DistributedDataParallelConfig (line 10) | class DistributedDataParallelConfig:
    method __post_init__ (line 148) | def __post_init__(self):

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py
  class ShardingStrategy (line 43) | class ShardingStrategy(IntEnum):
  function experimental_api (line 63) | def experimental_api(func: Callable) -> Callable:
  function fully_shard_model (line 75) | def fully_shard_model(
  function fully_shard_optimizer (line 408) | def fully_shard_optimizer(
  function fully_shard (line 614) | def fully_shard(

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py
  class TrainingState (line 61) | class TrainingState(Enum):
  class MegatronFSDP (line 76) | class MegatronFSDP(torch.nn.Module):
    method __init__ (line 171) | def __init__(
    method _check_module_parameter_types (line 319) | def _check_module_parameter_types(self):
    method _init_fsdp_param_and_grad_buffer (line 333) | def _init_fsdp_param_and_grad_buffer(self):
    method _import_class_from_path (line 415) | def _import_class_from_path(self, class_path: str):
    method all_gather_and_wait_parameters_ready (line 422) | def all_gather_and_wait_parameters_ready(
    method _register_fsdp_hooks (line 477) | def _register_fsdp_hooks(self, root_module):
    method no_sync (line 1059) | def no_sync(self):
    method sync (line 1073) | def sync(self):
    method set_model_auto_sync (line 1084) | def set_model_auto_sync(self, sync_model: bool = True):
    method get_distributed_index (line 1127) | def get_distributed_index(self) -> FSDPDistributedIndex:
    method mixed_precision_context (line 1135) | def mixed_precision_context(self, mixed_precision_policy: MixedPrecisi...
    method reset_mixed_precision_policy (line 1147) | def reset_mixed_precision_policy(self, mixed_precision_policy: MixedPr...
    method start_param_sync (line 1167) | def start_param_sync(self, *unused, force_sync: bool = False, force_di...
    method start_grad_sync (line 1205) | def start_grad_sync(self, *unused):
    method synchronize_param_gather (line 1222) | def synchronize_param_gather(self):
    method synchronize_gradient_reduce (line 1229) | def synchronize_gradient_reduce(self):
    method attach_grad_to_optimizer_state (line 1242) | def attach_grad_to_optimizer_state(self):
    method finish_grad_sync (line 1249) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
    method _replace_param_with_distributed_if_needed (line 1281) | def _replace_param_with_distributed_if_needed(self):
    method _replace_param_with_raw_if_needed (line 1300) | def _replace_param_with_raw_if_needed(self):
    method _reestablish_shared_weights (line 1314) | def _reestablish_shared_weights(self, old_params, new_params):
    method scale_gradients (line 1348) | def scale_gradients(self, scaling_factor: float):
    method zero_grad_buffer (line 1352) | def zero_grad_buffer(self):
    method install_optimized_model_weights (line 1362) | def install_optimized_model_weights(self):
    method broadcast_params (line 1369) | def broadcast_params(self):
    method forward (line 1385) | def forward(self, *inputs, **kwargs):
  class RegisterFSDPBackwardFunction (line 1396) | class RegisterFSDPBackwardFunction(torch.autograd.Function):
    method forward (line 1404) | def forward(ctx, post_backward, *inputs: torch.Tensor):
    method backward (line 1412) | def backward(ctx, *grads: torch.Tensor):
  function _replace_module_parameter (line 1420) | def _replace_module_parameter(module, name, new_param):

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py
  function local_multi_tensor_applier (line 121) | def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args):
  function local_multi_tensor_scale (line 125) | def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale):
  function _multi_tensor_copy_this_to_that (line 133) | def _multi_tensor_copy_this_to_that(
  function is_te_min_version (line 162) | def is_te_min_version(vers, check_equality=True):
  function is_float8tensor (line 173) | def is_float8tensor(tensor: torch.Tensor) -> bool:
  function is_blockwise_float8tensor (line 178) | def is_blockwise_float8tensor(tensor: torch.Tensor) -> bool:
  function fp8_need_transpose_data (line 183) | def fp8_need_transpose_data(tensor: torch.Tensor) -> bool:
  function fp8_need_transpose_data_for_meta_device_init (line 188) | def fp8_need_transpose_data_for_meta_device_init(module: TransformerEngi...
  function fp8_discard_transpose_cache (line 193) | def fp8_discard_transpose_cache(tensor: torch.Tensor) -> None:
  function fp8_create_transpose_cache (line 204) | def fp8_create_transpose_cache(tensors: List[torch.Tensor]) -> None:
  function _fp8_create_transpose_cache_fallback (line 212) | def _fp8_create_transpose_cache_fallback(tensors: List[torch.Tensor]) ->...
  function fp8_set_raw_data (line 223) | def fp8_set_raw_data(tensor: torch.Tensor, data: torch.Tensor, set_trans...
  function fp8_get_raw_data (line 244) | def fp8_get_raw_data(tensor: torch.Tensor, get_transpose: bool = False) ...
  function fp8_dequantize (line 257) | def fp8_dequantize(tensor: torch.Tensor) -> torch.Tensor:
  function fp8_quantize (line 266) | def fp8_quantize(
  function _fp8_quantize_fallback (line 288) | def _fp8_quantize_fallback(
  function get_quantized_model_init_context_cls (line 353) | def get_quantized_model_init_context_cls():
  class MixedPrecisionPolicy (line 366) | class MixedPrecisionPolicy:

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
  function _p_assert (line 107) | def _p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> ...
  function _alloc_storage (line 118) | def _alloc_storage(tensor: torch.Tensor, size: torch.Size) -> None:
  function _free_storage (line 138) | def _free_storage(tensor: torch.Tensor):
  class MultiGroupUBRAllocator (line 170) | class MultiGroupUBRAllocator:
    method __init__ (line 200) | def __init__(self, pool, groups):  # torch.cuda.MemPool  # torch.distr...
    method __enter__ (line 206) | def __enter__(self):
    method __exit__ (line 218) | def __exit__(self, *args):
  class BucketingPolicy (line 232) | class BucketingPolicy:
  function _pad (line 252) | def _pad(number_to_be_padded: int, divisor: int) -> int:
  function build_data_parallel_buffer_index (line 256) | def build_data_parallel_buffer_index(
  function _get_dp_buffer_shard_bucket_index (line 384) | def _get_dp_buffer_shard_bucket_index(
  class Bucket (line 444) | class Bucket:
  class TemporaryBucketAllocator (line 461) | class TemporaryBucketAllocator:
    method __init__ (line 497) | def __init__(self):
    method allocate (line 500) | def allocate(
    method free (line 515) | def free(self, bucket_id: int):
  class StorageResizeBasedBucketAllocator (line 524) | class StorageResizeBasedBucketAllocator(TemporaryBucketAllocator):
    method __init__ (line 530) | def __init__(self):
    method allocate (line 533) | def allocate(
    method free (line 550) | def free(self, bucket_id: int):
  class RotaryBucketAllocator (line 558) | class RotaryBucketAllocator(TemporaryBucketAllocator):
    method __init__ (line 589) | def __init__(self, name: str):
    method allocate (line 596) | def allocate(
    method _get_gbuf_name (line 630) | def _get_gbuf_name(self, buffer_id: int):
    method free (line 633) | def free(self, bucket_id: int):
  class FixedPoolAllocator (line 642) | class FixedPoolAllocator(TemporaryBucketAllocator):
    method __init__ (line 652) | def __init__(
    method _is_two_bucket_group_equal (line 729) | def _is_two_bucket_group_equal(self, group_a, group_b):
    method allocate (line 743) | def allocate(
    method _get_gbuf_name (line 806) | def _get_gbuf_name(self, buf_group_id: int, bucket_index: int):
    method free (line 809) | def free(self, bucket_id: int):
  class DataParallelBuffer (line 828) | class DataParallelBuffer:
    method __init__ (line 846) | def __init__(
    method init_data (line 932) | def init_data(self, data: torch.Tensor):
    method fetch_bucket (line 942) | def fetch_bucket(
    method allocate_bucket_storage (line 987) | def allocate_bucket_storage(
    method free_bucket_storage (line 1043) | def free_bucket_storage(self):
    method reset_param_main_grad (line 1049) | def reset_param_main_grad(self):
    method _get_item_slice_in_shard (line 1058) | def _get_item_slice_in_shard(self, item_id: int) -> Tuple[int, int]:
    method locate_item_in_global_item (line 1106) | def locate_item_in_global_item(self, item_id: int) -> Tuple[int, int]:
    method _get_item_local_shard_index (line 1127) | def _get_item_local_shard_index(self, item_id: int) -> Tuple[int, int]:
    method _get_item_local_index (line 1171) | def _get_item_local_index(self, item_id: int) -> Tuple[int, int]:
    method set_item (line 1188) | def set_item(self, item_id: int, item_data: torch.Tensor) -> None:
    method get_item (line 1223) | def get_item(self, item_id: int, only_shard: bool = False) -> torch.Te...
    method get_item_from_bucket (line 1257) | def get_item_from_bucket(self, bucket: Bucket, item_id: int):
    method get_shard_from_bucket (line 1268) | def get_shard_from_bucket(self, bucket: Bucket):
    method get_shard_from_local_buffer (line 1278) | def get_shard_from_local_buffer(self) -> torch.Tensor:
  class ParameterGroup (line 1290) | class ParameterGroup:
  function _get_parameter_groups (line 1349) | def _get_parameter_groups(
  class ParamAndGradBuffer (line 1583) | class ParamAndGradBuffer:
    method __init__ (line 1625) | def __init__(
    method get_mem_alloc_context (line 1770) | def get_mem_alloc_context(self, groups=None, symmetric=True):
    method manual_buffer_registration (line 1834) | def manual_buffer_registration(self):
    method _log_parameter_groups (line 1869) | def _log_parameter_groups(self):
    method _init_each_parameter_group_buffers (line 1918) | def _init_each_parameter_group_buffers(self, meta_device_init_fp8_para...
    method _reset_parameters (line 2615) | def _reset_parameters(self, old_params, new_params):
    method scale_gradients (line 2659) | def scale_gradients(self, scaling_factor: float) -> None:
    method zero_grad (line 2667) | def zero_grad(self):
    method _init_distributed_params (line 2685) | def _init_distributed_params(self):
    method _init_optimizer_named_parameters (line 2756) | def _init_optimizer_named_parameters(self) -> List[Tuple[str, torch.nn...
    method update_main_grads (line 2813) | def update_main_grads(self):
    method num_buckets (line 2879) | def num_buckets(self):
    method copy_main_weights_to_model_weights (line 2884) | def copy_main_weights_to_model_weights(self):
    method copy_model_weights_to_main_weights (line 3055) | def copy_model_weights_to_main_weights(self):
    method all_gather_parameters (line 3073) | def all_gather_parameters(self, async_op: bool = True):
    method reduce_scatter_gradients (line 3104) | def reduce_scatter_gradients(self, async_op: bool = True):
    method all_reduce_gradients (line 3140) | def all_reduce_gradients(self, async_op: bool = False):
  class BucketStatus (line 3176) | class BucketStatus(Enum):
  class GradReducePipeline (line 3191) | class GradReducePipeline:
    method __init__ (line 3196) | def __init__(
    method num_buckets (line 3227) | def num_buckets(self):
    method reset (line 3231) | def reset(self):
    method reduce_gradients (line 3251) | def reduce_gradients(
    method wait_for_previous_grad_reduce (line 3295) | def wait_for_previous_grad_reduce(
    method _enforce_double_buffer_limit (line 3327) | def _enforce_double_buffer_limit(self, add_buckets):
    method get_ready_bucket_group_for_reduction (line 3349) | def get_ready_bucket_group_for_reduction(self, bucket_id: int) -> Opti...
    method get_fsdp_buffer (line 3375) | def get_fsdp_buffer(self, bucket_id: int) -> DataParallelBuffer:
    method _bucket_group_gradient_reduce (line 3382) | def _bucket_group_gradient_reduce(
  class PrefetchOrder (line 3634) | class PrefetchOrder(Enum):
  class AllGatherPipeline (line 3647) | class AllGatherPipeline:
    method __init__ (line 3652) | def __init__(
    method get_bucket_key (line 3696) | def get_bucket_key(self, bucket_id, bwd):
    method num_buckets (line 3704) | def num_buckets(self):
    method reset (line 3708) | def reset(self):
    method all_gather_params (line 3737) | def all_gather_params(
    method wait_bucket_ready (line 3922) | def wait_bucket_ready(self, bucket_id, bwd, empty_ok=False):
    method release_bucket (line 3942) | def release_bucket(self, bucket_id, bwd, lazy: bool = False):
    method recycle_unused_buckets (line 3995) | def recycle_unused_buckets(self):
    method get_fsdp_buffer (line 4003) | def get_fsdp_buffer(self, bucket_id: int, bwd=False) -> DataParallelBu...
    method async_bucket_gather (line 4020) | def async_bucket_gather(self, bucket_id, bwd) -> None:
  function gradient_reduce_preprocessing (line 4067) | def gradient_reduce_preprocessing(grad_data, scaling_factor, ddp_config):
  function _check_nan_in_grad (line 4092) | def _check_nan_in_grad(grad: torch.Tensor):
  function check_gpu_memory (line 4104) | def check_gpu_memory(threshold=0.9):
  class ResetParametersContext (line 4134) | class ResetParametersContext:
    method __init__ (line 4139) | def __init__(self, init_param_with_fp8=False, with_cuda_rng_tracker=Fa...
    method __enter__ (line 4143) | def __enter__(self):
    method __exit__ (line 4177) | def __exit__(self, *exc_details):
  function override_sharded_param_methods_with_safety_checks (line 4181) | def override_sharded_param_methods_with_safety_checks(params, all_gather...
  function _dtype_size (line 4221) | def _dtype_size(dtype: torch.dtype) -> int:
  function to_local_if_dtensor (line 4252) | def to_local_if_dtensor(tensor):
  function _get_fsdp_tensor_spec (line 4265) | def _get_fsdp_tensor_spec(
  function make_fsdp_dtensor (line 4341) | def make_fsdp_dtensor(

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
  function gather_and_compute_chunk_metadata (line 31) | def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageM...
  function update_uneven_dtensor_chunk_metadata (line 98) | def update_uneven_dtensor_chunk_metadata(dtensor: DTensor) -> dict:
  function validate_uneven_dtensor (line 141) | def validate_uneven_dtensor(dtensor: DTensor) -> None:
  function filter_unflattened_state_dict (line 208) | def filter_unflattened_state_dict(state_dict, key_chain=[], visit_condit...
  function get_unflattened_state_dict (line 227) | def get_unflattened_state_dict(state_dict, key_chain=[]):
  function preprocess_state_dict_for_uneven_dtensor (line 240) | def preprocess_state_dict_for_uneven_dtensor(state_dict: dict) -> dict:
  function gather_uneven_dtensor_to_full_tensor (line 258) | def gather_uneven_dtensor_to_full_tensor(
  function _assemble_full_tensor_from_uneven_chunks (line 333) | def _assemble_full_tensor_from_uneven_chunks(
  function _intersection (line 402) | def _intersection(s1, s2):
  function _offset_slice (line 411) | def _offset_slice(s, offset):
  function split_dtensor (line 415) | def split_dtensor(

FILE: megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py
  function get_te_version (line 59) | def get_te_version():
  function is_te_min_version (line 76) | def is_te_min_version(vers, check_equality=True):
  function is_submodule (line 88) | def is_submodule(module, parent_module, strict=True):
  function get_mesh_names (line 101) | def get_mesh_names(
  function contains_submesh (line 147) | def contains_submesh(
  function _get_cuda_rng_state (line 162) | def _get_cuda_rng_state(
  function _set_cuda_rng_state (line 193) | def _set_cuda_rng_state(new_state: torch.Tensor, device: int = -1, graph...
  function initialize_rng_tracker (line 235) | def initialize_rng_tracker(
  function get_cuda_rng_tracker (line 427) | def get_cuda_rng_tracker(
  function safe_get_rank (line 437) | def safe_get_rank() -> int:
  function log_single_rank (line 457) | def log_single_rank(logger_: logging.Logger, level: int, msg: str, *args...
  class FSDPDistributedIndex (line 465) | class FSDPDistributedIndex:
    method __init__ (line 474) | def __init__(
    method get_submesh (line 627) | def get_submesh(
    method get_dp_group (line 671) | def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup:
    method get_fsdp_group (line 681) | def get_fsdp_group(
    method get_outer_fsdp_group (line 691) | def get_outer_fsdp_group(self, is_expert_parallel: bool = False) -> Pr...
    method get_root_mesh (line 699) | def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh:
    method get_logical_hybrid_fsdp_rank (line 708) | def get_logical_hybrid_fsdp_rank(self, is_expert_parallel: bool = False):
  class GlobalMemoryBuffer (line 752) | class GlobalMemoryBuffer:
    method __init__ (line 757) | def __init__(self):
    method get_tensor (line 760) | def get_tensor(self, tensor_shape, dtype, name, mem_alloc_context: Opt...
  function get_global_memory_buffer (line 781) | def get_global_memory_buffer():
  function create_updated_function_signature (line 789) | def create_updated_function_signature(original_function, **extended_kwar...
  function is_mcore_tensor_model_parallel (line 813) | def is_mcore_tensor_model_parallel(param: torch.Tensor) -> bool:
  function is_mcore_tensor_parallel_duplicated (line 820) | def is_mcore_tensor_parallel_duplicated(param: torch.Tensor) -> bool:
  function get_mcore_tensor_parallel_partition_dim (line 827) | def get_mcore_tensor_parallel_partition_dim(param: torch.Tensor) -> Opti...

FILE: megatron/core/distributed/param_and_grad_buffer.py
  class BufferType (line 48) | class BufferType(Enum):
  function shard_buffer (line 57) | def shard_buffer(buffer: torch.Tensor, data_parallel_world_size: int):
  class _ParamAndGradBucket (line 69) | class _ParamAndGradBucket:
    method __init__ (line 87) | def __init__(
    method set_layerwise_params_list (line 122) | def set_layerwise_params_list(self, layerwise_params_list: List[List[t...
  class _LayerwiseAllGatherHandle (line 136) | class _LayerwiseAllGatherHandle:
    method __init__ (line 143) | def __init__(self, handles):
    method wait (line 146) | def wait(self):
  class _ParamAndGradBucketGroup (line 153) | class _ParamAndGradBucketGroup:
    method __init__ (line 168) | def __init__(
    method reset (line 242) | def reset(self):
    method check_grads (line 254) | def check_grads(self, check_for_nan_or_inf, check_for_large):
    method start_param_sync (line 292) | def start_param_sync(self, force_sync: bool = False):
    method finish_param_sync (line 427) | def finish_param_sync(self, skip_next_bucket_dispatch: bool = False):
    method start_grad_sync (line 515) | def start_grad_sync(self, force_all_reduce: Optional[bool] = False):
    method finish_grad_sync (line 658) | def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
    method free_overlap_buffers (line 690) | def free_overlap_buffers(self):
    method register_grad_ready (line 705) | def register_grad_ready(
  class _ParamAndGradBuffer (line 730) | class _ParamAndGradBuffer:
    method __init__ (line 752) | def __init__(
    method scale_gradients (line 1060) | def scale_gradients(self, scaling_factor: float) -> None:
    method _get (line 1064) | def _get(self, shape: torch.Size, start_index: int, buffer_type: Buffe...
    method _new_bucket (line 1081) | def _new_bucket(
    method reset (line 1125) | def reset(self):
    method offload_to_cpu (line 1131) | def offload_to_cpu(self, move_params: bool = True, move_grads: bool = ...
    method reload_from_cpu (line 1146) | def reload_from_cpu(self, move_params: bool = True, move_grads: bool =...
  function partition_buckets (line 1164) | def partition_buckets(

FILE: megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py
  class _ReduceScatterWithFP32AccumulationWorkHandle (line 9) | class _ReduceScatterWithFP32AccumulationWorkHandle:
    method __init__ (line 13) | def __init__(
    method wait (line 26) | def wait(self):
  function reduce_scatter_with_fp32_accumulation (line 42) | def reduce_scatter_with_fp32_accumulation(

FILE: megatron/core/distributed/torch_fully_sharded_data_parallel.py
  class TorchFullyShardedDataParallel (line 28) | class TorchFullyShardedDataParallel(_BaseDataParallel):
    method __init__ (line 55) | def __init__(
    method load_state_dict (line 150) | def load_state_dict(self, state_dict, strict=True):

FILE: megatron/core/distributed/torch_fully_sharded_data_parallel_config.py
  class TorchFullyShardedDataParallelConfig (line 10) | class TorchFullyShardedDataParallelConfig(DistributedDataParallelConfig):

FILE: megatron/core/energy_monitor.py
  class EnergyMonitor (line 22) | class EnergyMonitor:
    method __init__ (line 30) | def __init__(self) -> None:
    method setup (line 37) | def setup(self) -> None:
    method shutdown (line 43) | def shutdown(self) -> None:
    method pause (line 48) | def pause(self) -> None:
    method resume (line 54) | def resume(self) -> None:
    method _get_energy (line 59) | def _get_energy(self) -> int:
    method lap (line 66) | def lap(self) -> float:
    method get_total (line 83) | def get_total(self) -> float:

FILE: megatron/core/enums.py
  class ModelType (line 6) | class ModelType(enum.Enum):
  class Fp8Recipe (line 12) | class Fp8Recipe(str, enum.Enum):
  class Fp4Recipe (line 22) | class Fp4Recipe(str, enum.Enum):

FILE: megatron/core/export/export_config.py
  class ExportConfig (line 9) | class ExportConfig:
    method __post_init__ (line 23) | def __post_init__(self):

FILE: megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py
  class TRTLLMEngineBuilder (line 19) | class TRTLLMEngineBuilder:
    method build_and_save_engine (line 23) | def build_and_save_engine(

FILE: megatron/core/export/trtllm/trtllm_helper.py
  class TRTLLMHelper (line 39) | class TRTLLMHelper:
    method __init__ (line 42) | def __init__(
    method _get_trtllm_config (line 110) | def _get_trtllm_config(
    method _load_scaling_factors (line 210) | def _load_scaling_factors(self, model_state_dict: dict) -> dict:
    method get_trtllm_pretrained_config_and_model_weights (line 264) | def get_trtllm_pretrained_config_and_model_weights(
    method _add_scales_to_converter (line 352) | def _add_scales_to_converter(
    method _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting (line 377) | def _get_trtllm_pretrained_config_and_model_weights_in_distributed_set...
    method _get_trtllm_pretrained_config_and_model_weights_list_on_single_device (line 451) | def _get_trtllm_pretrained_config_and_model_weights_list_on_single_dev...
    method build_and_save_engine (line 532) | def build_and_save_engine(

FILE: megatron/core/export/trtllm/trtllm_layers.py
  class TRTLLMLayers (line 8) | class TRTLLMLayers(Enum):
    method return_layer_name_and_number (line 56) | def return_layer_name_and_number(layer_name: str) -> Tuple[str, int]:
    method rename_input_layer_names_to_trtllm_layer_names (line 80) | def rename_input_layer_names_to_trtllm_layer_names(
  function get_layer_name_without_prefix (line 157) | def get_layer_name_without_prefix(layer: TRTLLMLayers) -> str:

FILE: megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py
  function str_dtype_to_torch (line 23) | def str_dtype_to_torch(dtype: DataType):
  class DistributedTRTLLMModelWeightsConverter (line 31) | class DistributedTRTLLMModelWeightsConverter:
    method __init__ (line 37) | def __init__(
    method _add_to_trtllm_model_weights (line 82) | def _add_to_trtllm_model_weights(self, val: torch.Tensor, layer_name: ...
    method _convert_transformer_layer (line 100) | def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor):
    method _convert_non_transformer_layer (line 195) | def _convert_non_transformer_layer(self, model_state_dict: dict, layer...
    method _get_remove_vocab_padding (line 209) | def _get_remove_vocab_padding(self, layer_name, model_state_dict, toke...
    method convert (line 236) | def convert(

FILE: megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py
  function pad_vocab_size (line 26) | def pad_vocab_size(vocab_size: int, tp_size: int):
  function str_dtype_to_torch (line 33) | def str_dtype_to_torch(dtype: DataType):
  class SingleDeviceTRTLLMModelWeightsConverter (line 40) | class SingleDeviceTRTLLMModelWeightsConverter:
    method __init__ (line 43) | def __init__(
    method _convert_non_transformer_layer (line 81) | def _convert_non_transformer_layer(self, model_state_dict: dict, layer...
    method _cast_value (line 95) | def _cast_value(self, val: torch.Tensor, layer_name: str) -> torch.Ten...
    method _convert_transformer_layer (line 114) | def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor):
    method convert (line 332) | def convert(
    method get_padded_vocab_size (line 405) | def get_padded_vocab_size(self) -> int:
    method get_local_model_weights_per_gpu (line 422) | def get_local_model_weights_per_gpu(self, mapping, trtllm_model_config...

FILE: megatron/core/export/trtllm/trtllm_weights_converter/utils.py
  function is_gated_activation (line 6) | def is_gated_activation(helper):

FILE: megatron/core/extensions/transformer_engine.py
  class TransformerEngineConfigType (line 85) | class TransformerEngineConfigType(enum.Enum):
  class TEQuantizationRecipe (line 92) | class TEQuantizationRecipe:
    method parse_from_config (line 128) | def parse_from_config(cls, quant_config: Dict[Any, Any]) -> "TEQuantiz...
    method get_config_keys (line 157) | def get_config_keys(cls) -> Set[str]:
  class TEQuantizationParams (line 163) | class TEQuantizationParams:
    method parse_from_config (line 175) | def parse_from_config(quant_config: QuantizationConfig) -> "TEQuantiza...
  function _get_fp8_autocast_for_quant_recipe (line 208) | def _get_fp8_autocast_for_quant_recipe(qrecipe: TEQuantizationRecipe):
  function _get_fp8_autocast_for_quant_params (line 259) | def _get_fp8_autocast_for_quant_params(qparams: TEQuantizationParams | N...
  function _get_should_context_be_quantized_recipe (line 268) | def _get_should_context_be_quantized_recipe(
  function _get_should_context_be_quantized_params (line 284) | def _get_should_context_be_quantized_params(
  function _get_extra_te_kwargs (line 299) | def _get_extra_te_kwargs(config: TransformerConfig):
  function condition_init_method (line 312) | def condition_init_method(config, init_method):
  function split_te_layernorm_column_parallel_linear (line 317) | def split_te_layernorm_column_parallel_linear(
  class TEActivationOp (line 400) | class TEActivationOp:
    method __new__ (line 406) | def __new__(cls, config: TransformerConfig):
  class TEFusedResidualRMSNorm (line 438) | class TEFusedResidualRMSNorm(te.pytorch.RMSNorm):
    method __init__ (line 453) | def __init__(self, *args, **kwargs):
    method _make_fused_impl (line 458) | def _make_fused_impl(self) -> te.pytorch.ops.Sequential:
    method _register_hooks_on_fused_impl (line 493) | def _register_hooks_on_fused_impl(self, fused_impl: torch.nn.Module) -...
    method forward (line 574) | def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, ...
  class TENorm (line 602) | class TENorm:
    method __new__ (line 617) | def __new__(
  class TELinear (line 661) | class TELinear(te.pytorch.Linear):
    method __init__ (line 676) | def __init__(
    method finish_init (line 849) | def finish_init(self, quantization_config: QuantizationConfig):
    method will_execute_quantized (line 856) | def will_execute_quantized(self, is_context_quantized: bool) -> bool:
    method forward (line 862) | def forward(self, x):
    method sharded_state_dict (line 880) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method backward_dw (line 898) | def backward_dw(self):
  class TELayerNormColumnParallelLinear (line 904) | class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear):
    method __init__ (line 908) | def __init__(
    method finish_init (line 1070) | def finish_init(self, quantization_config: QuantizationConfig):
    method will_execute_quantized (line 1077) | def will_execute_quantized(self, is_context_quantized: bool) -> bool:
    method forward (line 1083) | def forward(self, x):
    method sharded_state_dict (line 1102) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method extra_repr (line 1116) | def extra_repr(self) -> str:
    method backward_dw (line 1125) | def backward_dw(self):
  class TEColumnParallelLinear (line 1131) | class TEColumnParallelLinear(TELinear):
    method __init__ (line 1135) | def __init__(
    method sharded_state_dict (line 1213) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method extra_repr (line 1226) | def extra_repr(self) -> str:
    method backward_dw (line 1235) | def backward_dw(self):
  class TERowParallelLinear (line 1241) | class TERowParallelLinear(TELinear):
    method __init__ (line 1245) | def __init__(
    method sharded_state_dict (line 1317) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
    method extra_repr (line 1330) | def extra_repr(self) -> str:
    method backward_dw (line 1339) | def backward_dw(self):
  class TEDotProductAttention (line 1345) | class TEDotProductAttention(te.pytorch.DotProductAttention):
    method __init__ (line 1356) | def __init__(
    method forward (line 1542) | def forward(
    method sharded_state_dict (line 1645) | def sharded_state_dict(
  class TEGroupedLinear (line 1668) | class TEGroupedLinear(te.pytorch.GroupedLinear):
    method __init__ (line 1677) | def __init__(
    method finish_init (line 1874) | def finish_init(self, quantization_config: QuantizationConfig):
    method will_execute_quantized (line 1881) | def will_execute_quantized(self, is_context_quantized: bool) -> bool:
    method forward (line 1887) | def forward(self, x, m_splits):
    method _encode_extra_state (line 1905) | def _encode_extra_state(self, state):
    method _decode_extra_state (line 1916) | def _decode_extra_state(self, state):
    method _split_extra_state (line 1928) | def _split_extra_state(self, state):
    method _sharded_state_dict_grouped (line 1969) | def _sharded_state_dict_grouped(
    method backward_dw (line 2032) | def backward_dw(self):
  class TEColumnParallelGroupedLinear (line 2040) | class TEColumnParallelGroupedLinear(TEGroupedLinear):
    method __init__ (line 2046) | def __init__(
    method sharded_state_dict (line 2074) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
  class TERowParallelGroupedLinear (line 2086) | class TERowParallelGroupedLinear(TEGroupedLinear):
    method __init__ (line 2092) | def __init__(
    method sharded_state_dict (line 2120) | def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=N...
  class TEFusedMLP (line 2138) | class TEFusedMLP(MLP):
    method __init__ (line 2142) | def __init__(self, *args, **kwargs):
    method _make_fused_impl (line 2148) | def _make_fused_impl(self) -> te.pytorch.ops.Sequential:
    method _make_activation_op (line 2274) | def _make_activation_op(
    method _register_hooks_on_fused_impl (line 2310) | def _register_hooks_on_fused_impl(self, fused_impl: torch.nn.Module) -...
    method forward (line 2396) | def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tens...
  class TEDelayedScaling (line 2423) | class TEDelayedScaling(te.common.recipe.DelayedScaling):
    method __init__ (line 2428) | def __init__(
  class TECudaRNGStatesTracker (line 2459) | class TECudaRNGStatesTracker(te.pytorch.distributed.CudaRNGStatesTracker):
    method __init__ (line 2463) | def __init__(self, is_inference_rng_tracker=False):
    method is_initialized (line 2474) | def is_initialized(self):
    method reset (line 2478) | def reset(self):
    method set_states (line 2483) | def set_states(self, states):
    method add (line 2488) | def add(self, name, seed):
  function te_checkpoint (line 2494) | def te_checkpoint(
  function get_cpu_offload_context (line 2534) | def get_cpu_offload_context(
  function fused_apply_rotary_pos_emb (line 2575) | def fused_apply_rotary_pos_emb(
  function fused_apply_rotary_pos_emb_thd (line 2596) | def fused_apply_rotary_pos_emb_thd(
  function te_parallel_cross_entropy (line 2683) | def te_parallel_cross_entropy(
  function te_general_gemm (line 2711) | def te_general_gemm(
  function set_save_original_input (line 2764) | def set_save_original_input(module):

FILE: megatron/core/extensions/transformer_engine_spec_provider.py
  class _TENormWithResidual (line 31) | class _TENormWithResidual:
    method __new__ (line 34) | def __new__(cls, *args, **kwargs):
  class TESpecProvider (line 38) | class TESpecProvider(BackendSpecProvider):
    method linear (line 41) | def linear(self) -> type:
    method column_parallel_linear (line 45) | def column_parallel_linear(self) -> type:
    method row_parallel_linear (line 49) | def row_parallel_linear(self) -> type:
    method fuse_layernorm_and_linear (line 53) | def fuse_layernorm_and_linear(self) -> bool:
    method column_parallel_layer_norm_linear (line 57) | def column_parallel_layer_norm_linear(self) -> Optional[type]:
    method layer_norm (line 61) | def layer_norm(
    method core_attention (line 73) | def core_attention(self) -> type:
    method grouped_mlp_modules (line 77) | def grouped_mlp_modules(
    method activation_func (line 102) | def activation_func(self) -> TEActivationFunctionBuilder | None:

FILE: megatron/core/fp4_utils.py
  function is_nvfp4tensor (line 46) | def is_nvfp4tensor(tensor: torch.Tensor) -> bool:
  function get_fp4_align_size (line 51) | def get_fp4_align_size(fp4_recipe: Fp4Recipe) -> int:
  function dequantize_fp4_tensor (line 83) | def dequantize_fp4_tensor(fp4_tensor: torch.Tensor) -> torch.Tensor:
  function get_fp4_recipe (line 94) | def get_fp4_recipe(config: TransformerConfig):
  function get_fp4_context (line 122) | def get_fp4_context(config: TransformerConfig, layer_no: int = -1, is_in...
  function get_fp4_recipe (line 167) | def get_fp4_recipe(config: TransformerConfig):
  function get_fp4_context (line 171) | def get_fp4_context(config: TransformerConfig, layer_no: int = -1, is_in...

FILE: megatron/core/fp8_utils.py
  function is_float8tensor (line 96) | def is_float8tensor(tensor: torch.Tensor) -> bool:
  function is_mxfp8tensor (line 108) | def is_mxfp8tensor(tensor: torch.Tensor) -> bool:
  function dequantize_fp8_tensor (line 113) | def dequantize_fp8_tensor(fp8_tensor: torch.Tensor) -> torch.Tensor:
  function _resolve_callable_from_python_import_path (line 121) | def _resolve_callable_from_python_import_path(dotted_path: str):
  function _get_custom_recipe (line 155) | def _get_custom_recipe(quantizer_factory_python_path: str) -> Union[Fp8R...
  function get_fp8_align_size (line 168) | def get_fp8_align_size(fp8_recipe: Fp8Recipe) -> int:
  function is_column_parallel_linear (line 176) | def is_column_parallel_linear(module):
  function is_row_parallel_linear (line 188) | def is_row_parallel_linear(module):
  function _modify_underlying_storage_impl (line 226) | def _modify_underlying_storage_impl(
  function _quantize_param_shard_impl (line 233) | def _quantize_param_shard_impl(
  function _correct_amax_history_if_needed_impl (line 267) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -...
  function _modify_underlying_storage_impl (line 275) | def _modify_underlying_storage_impl(
  function _quantize_param_shard_impl (line 284) | def _quantize_param_shard_impl(
  function _correct_amax_history_if_needed_impl (line 359) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -...
  function _modify_underlying_storage_impl (line 367) | def _modify_underlying_storage_impl(tensor: Float8Tensor, new_raw_data: ...
  function _quantize_param_shard_impl (line 374) | def _quantize_param_shard_impl(
  function _correct_amax_history_if_needed_impl (line 446) | def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -...
  function _modify_underlying_storage_impl (line 461) | def _modify_underlying_storage_impl(*args, **kwargs):
  function _quantize_param_shard_impl (line 464) | def _quantize_param_shard_impl(model_params, *args, **kwargs):
  function _correct_amax_history_if_needed_impl (line 471) | def _correct_amax_history_if_needed_impl(*args, **kwargs):
  function modify_underlying_storage (line 478) | def modify_underlying_storage(tensor: torch.Tensor, new_raw_data: torch....
  function quantize_param_shard (line 484) | def quantize_param_shard(
  function correct_amax_history_if_needed (line 494) | def correct_amax_history_if_needed(model: List[torch.nn.Module]):
  function post_all_gather_processing (line 499) | def post_all_gather_processing(model_params):
  function is_first_last_bf16_layer (line 513) | def is_first_last_bf16_layer(config: TransformerConfig, layer_no: int):
  function get_fp8_recipe (line 536) | def get_fp8_recipe(config: TransformerConfig):
  function get_fp8_context (line 596) | def get_fp8_context(config: TransformerConfig, layer_no: int = -1, is_in...
  function get_fp8_recipe (line 658) | def get_fp8_recipe(config: TransformerConfig):
  function get_fp8_context (line 662) | def get_fp8_context(config: TransformerConfig, layer_no: int = -1, is_in...
  function _wrap_te_linear_for_padding (line 673) | def _wrap_te_linear_for_padding(module: torch.nn.Module):
  function prepare_model_for_fp8_inference (line 757) | def prepare_model_for_fp8_inference(model):
  function prepare_model_for_fp8_inference (line 780) | def prepare_model_for_fp8_inference(model):

FILE: megatron/core/full_cuda_graph.py
  function copy_tensors_in_struct (line 19) | def copy_tensors_in_struct(src):
  function clone_tensors_in_struct (line 33) | def clone_tensors_in_struct(tgt, src):
  class StaticBufferLoader (line 57) | class StaticBufferLoader:
    method __init__ (line 62) | def __init__(self):
    method __call__ (line 65) | def __call__(self, inputs, stage, microbatch):
  class FullCudaGraphWrapper (line 94) | class FullCudaGraphWrapper:
    method __init__ (line 101) | def __init__(self, forward_backward_func, cuda_graph_warmup_steps=1):
    method data_read (line 106) | def data_read(self, data_iterator, model, training, num_microbatches):
    method __call__ (line 139) | def __call__(self, *args, **kwargs):
    method curr_iter (line 192) | def curr_iter(self, stage):
    method next_iter (line 196) | def next_iter(self, stage):

FILE: megatron/core/fusions/fused_bias_dropout.py
  function _bias_dropout_add_func (line 11) | def _bias_dropout_add_func(x_with_bias, residual, prob, training):
  function bias_dropout_add_unfused (line 62) | def bias_dropout_add_unfused(training):
  function bias_dropout_add_fused_train (line 70) | def bias_dropout_add_fused_train(
  function bias_dropout_add_fused_inference (line 77) | def bias_dropout_add_fused_inference(
  function get_bias_dropout_add (line 83) | def get_bias_dropout_add(training, fused):

FILE: megatron/core/fusions/fused_bias_geglu.py
  function geglu (line 17) | def geglu(y):
  function bias_geglu (line 31) | def bias_geglu(bias, y):
  function geglu_back (line 49) | def geglu_back(g, y):
  function bias_geglu_back (line 69) | def bias_geglu_back(g, y, bias):
  class BiasGeGLUFunction (line 84) | class BiasGeGLUFunction(torch.autograd.Function):
    method forward (line 89) | def forward(ctx, input, bias):
    method backward (line 104) | def backward(ctx, grad_output):
  class GeGLUFunction (line 119) | class GeGLUFunction(torch.autograd.Function):
    method forward (line 124) | def forward(ctx, input):
    method backward (line 138) | def backward(ctx, grad_output):
  function bias_geglu_impl (line 153) | def bias_geglu_impl(input, bias):
  function quick_gelu (line 185) | def quick_gelu(y: torch.Tensor) -> torch.Tensor:
  function quick_geglu (line 191) | def quick_geglu(y: torch.Tensor, linear_offset: float = 0.0) -> torch.Te...
  function weighted_quick_geglu (line 206) | def weighted_quick_geglu(
  function quick_geglu_back (line 221) | def quick_geglu_back(g, y, linear_offset: float = 0.0) -> torch.Tensor:
  function weighted_quick_geglu_back (line 240) | def weighted_quick_geglu_back(g, y, weights, linear_offset: float = 0.0):
  function weighted_bias_quick_geglu (line 259) | def weighted_bias_quick_geglu(
  function weighted_bias_quick_geglu_back (line 279) | def weighted_bias_quick_geglu_back(g, y, bias, weights, linear_offset: f...
  class WeightedQuickGeGLUFunction (line 303) | class WeightedQuickGeGLUFunction(torch.autograd.Function):
    method forward (line 307) | def forward(
    method backward (line 333) | def backward(ctx, grad_output):
  class WeightedBiasQuickGeGLUFunction (line 350) | class WeightedBiasQuickGeGLUFunction(torch.autograd.Function):
    method forward (line 354) | def forward(
    method backward (line 387) | def backward(ctx, grad_output):
  function weighted_bias_quick_geglu_impl (line 410) | def weighted_bias_quick_geglu_impl(

FILE: megatron/core/fusions/fused_bias_gelu.py
  function bias_gelu (line 17) | def bias_gelu(bias, y):
  function bias_gelu_back (line 26) | def bias_gelu_back(g, bias, y):
  class GeLUFunction (line 36) | class GeLUFunction(torch.autograd.Function):
    method forward (line 39) | def forward(ctx, input, bias):
    method backward (line 44) | def backward(ctx, grad_output):
    method apply (line 51) | def apply(cls, *args, **kwargs):

FILE: megatron/core/fusions/fused_bias_swiglu.py
  function swiglu (line 16) | def swiglu(y):
  function bias_swiglu (line 30) | def bias_swiglu(y, bias):
  function weighted_swiglu (line 45) | def weighted_swiglu(y, weights):
  function swiglu_back (line 55) | def swiglu_back(g, y):
  function bias_swiglu_back (line 73) | def bias_swiglu_back(g, y, bias):
  function weighted_swiglu_back (line 90) | def weighted_swiglu_back(g, y, weights):
  class BiasSwiGLUFunction (line 100) | class BiasSwiGLUFunction(torch.autograd.Function):
    method forward (line 105) | def forward(ctx, input, bias, fp8_input_store, cpu_offload_input):
    method backward (line 128) | def backward(ctx, grad_output):
  class SwiGLUFunction (line 147) | class SwiGLUFunction(torch.autograd.Function):
    method forward (line 152) | def forward(ctx, input, fp8_input_store, cpu_offload_input):
    method backward (line 173) | def backward(ctx, grad_output):
  class WeightedSwiGLUFunction (line 191) | class WeightedSwiGLUFunction(torch.autograd.Function):
    method forward (line 194) | def forward(ctx, input, weights, fp8_input_store):
    method backward (line 202) | def backward(ctx, grad_output):
  function bias_swiglu_impl (line 209) | def bias_swiglu_impl(input, bias, fp8_input_store=False, cpu_offload_inp...
  function weighted_bias_swiglu_impl (line 239) | def weighted_bias_swiglu_impl(input, bias, weights, fp8_input_store=False):

FILE: megatron/core/fusions/fused_cross_entropy.py
  function calculate_logits_max (line 13) | def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[t...
  function calculate_predicted_logits (line 26) | def calculate_predicted_logits(
  function calculate_cross_entropy_loss (line 48) | def calculate_cross_entropy_loss(
  function calculate_gradients (line 65) | def calculate_gradients(
  class _VocabParallelCrossEntropy (line 87) | class _VocabParallelCrossEntropy(torch.autograd.Function):
    method forward (line 89) | def forward(ctx, vocab_parallel_logits, target, tp_group):
    method backward (line 124) | def backward(ctx, grad_output):
  function fused_vocab_parallel_cross_entropy (line 136) | def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target, tp...

FILE: megatron/core/fusions/fused_indices_converter.py
  function _indices_to_multihot_kernel (line 32) | def _indices_to_multihot_kernel(
  function _multihot_to_indices_kernel (line 112) | def _multihot_to_indices_kernel(
  class IndicesToMultihot (line 176) | class IndicesToMultihot(torch.autograd.Function):
    method forward (line 186) | def forward(ctx, indices, probs_indices, num_of_local_experts):
    method backward (line 239) | def backward(ctx, grad_multihot_indices, grad_probs_in_multihot):
  function fused_indices_to_multihot (line 282) | def fused_indices_to_multihot(indices, probs_indices, num_of_local_exper...

FILE: megatron/core/fusions/fused_layer_norm.py
  class FusedLayerNorm (line 30) | class FusedLayerNorm(torch.nn.Module):
    method __init__ (line 52) | def __init__(
    method reset_parameters (line 122) | def reset_parameters(self):
    method forward (line 131) | def forward(self, input: Tensor) -> Tensor:

FILE: megatron/core/fusions/fused_mla_yarn_rope_apply.py
  function _get_thd_token_idx (line 31) | def _get_thd_token_idx(cu_seqlens, pid_m, seq_num, cp_rank, cp_size):
  function rotary_fwd_q_kernel (line 68) | def rotary_fwd_q_kernel(
  function rotary_bwd_q_kernel (line 148) | def rotary_bwd_q_kernel(
  class ApplyMLARotaryEmbQ (line 210) | class ApplyMLARotaryEmbQ(torch.autograd.Function):
    method forward (line 216) | def forward(
    method backward (line 285) | def backward(ctx, grad):
  function fused_apply_mla_rope_for_q (line 327) | def fused_apply_mla_rope_for_q(
  function rotary_fwd_kv_kernel (line 379) | def rotary_fwd_kv_kernel(
  function rotary_bwd_kv_kernel (line 487) | def rotary_bwd_kv_kernel(
  class ApplyMLARotaryEmbKV (line 581) | class ApplyMLARotaryEmbKV(torch.autograd.Function):
    method forward (line 587) | def forward(
    method backward (line 675) | def backward(ctx, dk, dv):
  function fused_apply_mla_rope_for_kv (line 735) | def fused_apply_mla_rope_for_kv(

FILE: megatron/core/fusions/fused_pad_routing_map.py
  function _pad_routing_map_kernel (line 31) | def _pad_routing_map_kernel(
  function fused_pad_routing_map (line 74) | def fused_pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) ...

FILE: megatron/core/fusions/fused_softmax.py
  class ScaledUpperTriangMaskedSoftmax (line 11) | class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    method forward (line 20) | def forward(ctx, inputs, scale):
    method backward (line 40) | def backward(ctx, output_grads):
  class ScaledMaskedSoftmax (line 60) | class ScaledMaskedSoftmax(torch.autograd.Function):
    method forward (line 69) | def forward(ctx, inputs, mask, scale):
    method backward (line 90) | def backward(ctx, output_grads):
  class ScaledSoftmax (line 108) | class ScaledSoftmax(torch.autograd.Function):
    method forward (line 116) | def forward(ctx, inputs, scale):
    method backward (line 136) | def backward(ctx, output_grads):
  class SoftmaxOne (line 154) | class SoftmaxOne(nn.Module):
    method __init__ (line 161) | def __init__(
    method forward (line 168) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class FusedScaleMaskSoftmax (line 179) | class FusedScaleMaskSoftmax(nn.Module):
    method __init__ (line 193) | def __init__(
    method forward (line 219) | def forward(
    method is_kernel_available (line 238) | def is_kernel_available(self, mask, b, np, sq, sk):
    method forward_fused_softmax (line 272) | def forward_fused_softmax(self, input, mask):
    method forward_torch_softmax (line 299) | def forward_torch_softmax(self, input, mask, softmax_offset=None):
    method get_batch_per_block (line 345) | def get_batch_per_block(sq, sk, b, np):

FILE: megatron/core/fusions/fused_weighted_squared_relu.py
  function weighted_squared_relu (line 14) | def weighted_squared_relu(x: torch.Tensor, weights: torch.Tensor) -> tor...
  function _squared_relu_back (line 32) | def _squared_relu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
  function weighted_squared_relu_back (line 41) | def weighted_squared_relu_back(g: torch.Tensor, x: torch.Tensor, weights...
  class WeightedSquaredReLUFunction (line 60) | class WeightedSquaredReLUFunction(torch.autograd.Function):
    method forward (line 65) | def forward(ctx, input: torch.Tensor, weights: torch.Tensor):
    method backward (line 79) | def backward(ctx, grad_output: torch.Tensor):
  function weighted_squared_relu_impl (line 91) | def weighted_squared_relu_impl(input: torch.Tensor, weights: torch.Tenso...

FILE: megatron/core/hyper_comm_grid.py
  class HyperCommGrid (line 33) | class HyperCommGrid:
    method __init__ (line 82) | def __init__(
    method create_pg (line 120) | def create_pg(self, dims: Union[str, list[str]], **kwargs: Any) -> dis...
    method destroy (line 168) | def destroy(self) -> None:
    method get_pg (line 175) | def get_pg(self, dims: Union[str, list[str]]) -> dist.ProcessGroup:
    method get_rank_enum (line 190) | def get_rank_enum(self, dims: Union[str, list[str]]) -> list[list[int]]:
    method _gen_rank_enum (line 206) | def _gen_rank_enum(self, dims: list[str]) -> list[list[int]]:
    method _order_dims (line 251) | def _order_dims(self, dims: Union[str, list[str]]) -> Tuple[list[str],...

FILE: megatron/core/inference/async_stream.py
  class AsyncStream (line 17) | class AsyncStream:
    method __init__ (line 24) | def __init__(
    method put (line 36) | def put(self, item: Union[InferenceRequest, Exception]) -> None:
    method finish (line 41) | def finish(self, exception: Optional[Union[BaseException, Type[BaseExc...
    method finished (line 51) | def finished(self) -> bool:
    method generator (line 55) | async def generator(self) -> AsyncGenerator[InferenceRequest, None]:
    method _is_raisable (line 70) | def _is_raisable(value: Any):

FILE: megatron/core/inference/batch_dimensions_utils.py
  class InferenceBatchDimensions (line 21) | class InferenceBatchDimensions:
    method __str__ (line 38) | def __str__(self):
    method is_applicable_for_batch_dim (line 44) | def is_applicable_for_batch_dim(
    method is_valid (line 76) | def is_valid(
    method __hash__ (line 112) | def __hash__(self):
    method __eq__ (line 119) | def __eq__(self, other: "InferenceBatchDimensions") -> bool:
    method req_count (line 132) | def req_count(self) -> int:
    method adjust_batch_dims_for_expert_parallelism (line 139) | def adjust_batch_dims_for_expert_parallelism(
  class CUDAGraphBatchDimensionBuilder (line 233) | class CUDAGraphBatchDimensionBuilder:
    method _calculate_cuda_graph_token_counts (line 244) | def _calculate_cuda_graph_token_counts(
    method generate_cuda_graph_batch_dimensions_list (line 318) | def generate_cuda_graph_batch_dimensions_list(
    method match_graph_config (line 508) | def match_graph_config(

FILE: megatron/core/inference/communication/torch_symm_triton/barrier.py
  function _send_signal (line 21) | def _send_signal(addrs, sem: tl.constexpr):
  function _wait_signal (line 43) | def _wait_signal(addrs, sem: tl.constexpr):
  function symm_mem_sync (line 65) | def symm_mem_sync(

FILE: megatron/core/inference/communication/torch_symm_triton/collectives.py
  function _ag_phase (line 30) | def _ag_phase(
  function _multimem_all_gather_kernel (line 68) | def _multimem_all_gather_kernel(
  function _multimem_all_gather_3_kernel (line 95) | def _multimem_all_gather_3_kernel(
  function _multimem_reduce_scatter_kernel (line 159) | def _multimem_reduce_scatter_kernel(
  function _kernel_launch_config (line 212) | def _kernel_launch_config(element_size: int, max_numel: int, world_size:...
  function multimem_all_gather (line 230) | def multimem_all_gather(
  function multimem_all_gather_fused (line 270) | def multimem_all_gather_fused(
  function multimem_reduce_scatter (line 323) | def multimem_reduce_scatter(

FILE: megatron/core/inference/communication/torch_symm_triton/fused_collectives.py
  function unpack_bf16x2 (line 23) | def unpack_bf16x2(x, mask):
  function sum_sq (line 42) | def sum_sq(x, y, z, w, mask):
  function apply_norm (line 74) | def apply_norm(x, y, z, w, wx, wy, wz, ww, rrms, mask):
  function _multimem_reduce_scatter_residual_add_kernel (line 115) | def _multimem_reduce_scatter_residual_add_kernel(
  function fused_multimem_rs_add_norm_ag (line 210) | def fused_multimem_rs_add_norm_ag(

FILE: megatron/core/inference/communication/torch_symm_triton/multimem_asm.py
  function ld_128 (line 21) | def ld_128(ptr, mask, multicast_op: tl.constexpr, reduce_f32: tl.constex...
  function st_128 (line 111) | def st_128(ptr, x, y, z, w, mask, multicast_op):
  function add_v8_bf16_from_u32 (line 181) | def add_v8_bf16_from_u32(
  function asm_rsqrt (line 215) | def asm_rsqrt(x, eps):

FILE: megatron/core/inference/communication/torch_symm_triton/utils.py
  function is_device_nvls_capable (line 20) | def is_device_nvls_capable(device: torch.device) -> bool:
  function are_tensors_nvls_eligible (line 26) | def are_tensors_nvls_eligible(*tensors: torch.Tensor) -> bool:
  function get_tid (line 42) | def get_tid():
  function get_ntid (line 61) | def get_ntid():
  function get_flat_tid (line 80) | def get_flat_tid():
  function get_flat_bid (line 90) | def get_flat_bid():
  function sync_threads (line 101) | def sync_threads():

FILE: megatron/core/inference/communication_utils.py
  function is_pipeline_first_stage (line 10) | def is_pipeline_first_stage(pp_group: ProcessGroup):
  function is_pipeline_last_stage (line 19) | def is_pipeline_last_stage(pp_group: ProcessGroup):
  function _is_cuda (line 28) | def _is_cuda(tensor):
  function _is_cuda_contiguous (line 34) | def _is_cuda_contiguous(tensor):
  function broadcast_from_last_pipeline_stage (line 40) | def broadcast_from_last_pipeline_stage(
  function recv_from_prev_pipeline_rank_ (line 83) | def recv_from_prev_pipeline_rank_(
  function send_to_next_pipeline_rank (line 114) | def send_to_next_pipeline_rank(
  function broadcast_tensor (line 145) | def broadcast_tensor(size, dtype, tensor=None, rank=0, data_parallel=Fal...
  function broadcast_list (line 169) | def broadcast_list(size, dtype, list_values=None, rank=0, data_parallel=...
  function broadcast_int_list (line 190) | def broadcast_int_list(size, int_list=None, rank=0, data_parallel=False):
  function broadcast_float_list (line 202) | def broadcast_float_list(size, float_list=None, rank=0, data_parallel=Fa...

FILE: megatron/core/inference/config.py
  class MambaInferenceStateConfig (line 15) | class MambaInferenceStateConfig:
    method from_model (line 46) | def from_model(
  class PrefixCachingEvictionPolicy (line 81) | class PrefixCachingEvictionPolicy(str, Enum):
  class PrefixCachingCoordinatorPolicy (line 94) | class PrefixCachingCoordinatorPolicy(str, Enum):
  class KVCacheManagementMode (line 107) | class KVCacheManagementMode(str, Enum):
  class InferenceConfig (line 121) | class InferenceConfig:

FILE: megatron/core/inference/contexts/attention_context/mamba_metadata.py
  class MambaMetadata (line 10) | class MambaMetadata:
    method __init__ (line 13) | def __init__(self, max_requests: int, max_tokens: int, mamba_chunk_siz...
    method reset (line 87) | def reset(self) -> None:
    method reset_varlen_metadata (line 101) | def reset_varlen_metadata(self) -> None:
    method update (line 120) | def update(
    method allocate_slot (line 294) | def allocate_slot(self) -> Optional[int]:
    method batch_allocate_slots (line 311) | def batch_allocate_slots(self, num_slots: int) -> Optional[torch.Tensor]:
    method free_slots (line 330) | def free_slots(self, request_indices: torch.Tensor) -> None:

FILE: megatron/core/inference/contexts/attention_context/metadata_base.py
  class MetadataBase (line 4) | class MetadataBase:
    method __init__ (line 14) | def __init__(self):
    method update (line 20) | def update(self, *args, **kwargs):
    method reset (line 26) | def reset(self):
    method tensor_copy_and_pad (line 32) | def tensor_copy_and_pad(
    method __str__ (line 68) | def __str__(self):

FILE: megatron/core/inference/contexts/attention_context/mha_metadata.py
  class MHAMetadata (line 9) | class MHAMetadata(MetadataBase):
    method __init__ (line 14) | def __init__(
    method update (line 37) | def update(
    method reset (line 123) | def reset(self):
  class GraphedMHAMetadata (line 136) | class GraphedMHAMetadata(MHAMetadata):
    method __init__ (line 141) | def __init__(
    method update (line 148) | def update(
    method reset (line 175) | def reset(self):
  class NonGraphedMHAMetadata (line 179) | class NonGraphedMHAMetadata(MHAMetadata):
    method update (line 184) | def update(

FILE: megatron/core/inference/contexts/attention_context/triton/tensor_ops.py
  function _tensor_get_slice_after_kernel (line 24) | def _tensor_get_slice_after_kernel(
  function _tensor_merge_kernel (line 54) | def _tensor_merge_kernel(
  function _tensor_masked_update_kernel_2d (line 101) | def _tensor_masked_update_kernel_2d(
  function _tensor_masked_update_kernel_3d (line 141) | def _tensor_masked_update_kernel_3d(
  function _tensor_masked_update_kernel_4d (line 197) | def _tensor_masked_update_kernel_4d(
  function _compute_row_size (line 262) | def _compute_row_size(tensor):
  function tensor_get_slice_after (line 272) | def tensor_get_slice_after(input_tensor, output_tensor, pos_on_device, c...
  function tensor_merge (line 327) | def tensor_merge(
  function tensor_masked_update (line 395) | def tensor_masked_update(states: torch.Tensor, idx: torch.Tensor, new_st...

FILE: megatron/core/inference/contexts/base_context.py
  class BaseInferenceContext (line 8) | class BaseInferenceContext(abc.ABC):
    method __init__ (line 15) | def __init__(self, inference_config: InferenceConfig):
    method is_static_batching (line 22) | def is_static_batching(self) -> bool:
    method is_dynamic_batching (line 26) | def is_dynamic_batching(self) -> bool:
    method increment_sequence_len_offset (line 30) | def increment_sequence_len_offset(self, increment: int) -> None:
    method increment_batch_size_offset (line 35) | def increment_batch_size_offset(self, increment: int) -> None:
    method reset_batch_size_offset (line 40) | def reset_batch_size_offset(self) -> None:

FILE: megatron/core/inference/contexts/dynamic_context.py
  class ContextOverflowError (line 97) | class ContextOverflowError(Exception):
    method __init__ (line 106) | def __init__(
  class RequestOverflowError (line 117) | class RequestOverflowError(ContextOverflowError):
  class TokenOverflowError (line 123) | class TokenOverflowError(ContextOverflowError):
  class MaxSequenceLengthOverflowError (line 129) | class MaxSequenceLengthOverflowError(ContextOverflowError):
    method __init__ (line 132) | def __init__(self, request_id, message: Optional[str] = None):
  class BlockOverflowError (line 136) | class BlockOverflowError(ContextOverflowError):
  class ActiveRequestCountOverflowError (line 142) | class ActiveRequestCountOverflowError(ContextOverflowError):
    method __init__ (line 146) | def __init__(self, max_request_count, active_request_count):
  class TensorStateDeallocatedError (line 155) | class TensorStateDeallocatedError(ContextOverflowError):
  class ContextErrorFactory (line 162) | class ContextErrorFactory:
    method serialize (line 166) | def serialize(cls, error: ContextOverflowError) -> dict:
    method deserialize (line 184) | def deserialize(cls, obj: dict) -> ContextOverflowError:
  function get_mem_size_str (line 206) | def get_mem_size_str(n_bytes: int) -> str:
  class DynamicInferenceContext (line 217) | class DynamicInferenceContext(BaseInferenceContext):
    method __init__ (line 248) | def __init__(self, model_config: TransformerConfig, inference_config: ...
    method _allocate_memory_buffer (line 600) | def _allocate_memory_buffer(self):
    method _allocate_mamba_states (line 636) | def _allocate_mamba_states(self):
    method initialize_all_tensors (line 702) | def initialize_all_tensors(self) -> None:
    method reinitialize_inference_state_buffers (line 801) | def reinitialize_inference_state_buffers(self):
    method deallocate_inference_state_buffers (line 838) | def deallocate_inference_state_buffers(self):
    method round_up_tokens (line 879) | def round_up_tokens(cls, value, tp_size=None):
    method round_up_requests (line 893) | def round_up_requests(cls, value, tp_size=None):
    method is_static_batching (line 906) | def is_static_batching(self) -> bool:
    method is_decode_only (line 910) | def is_decode_only(self) -> bool:
    method using_cuda_graph_this_step (line 916) | def using_cuda_graph_this_step(self) -> bool:
    method has_unfinished_requests (line 920) | def has_unfinished_requests(self) -> bool:
    method cu_query_lengths (line 924) | def cu_query_lengths(self) -> Tuple[Tensor, int]:
    method cu_kv_lengths (line 932) | def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]:
    method get_active_sequence_lengths (line 941) | def get_active_sequence_lengths(self) -> Tensor:
    method get_max_sequence_lengths (line 947) | def get_max_sequence_lengths(self) -> Tensor:
    method get_active_request_count (line 951) | def get_active_request_count(self):
    method append_key_value_cache (line 955) | def append_key_value_cache(self, layer_number: int, key: Tensor, value...
    method key_value_cache (line 1004) | def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Optional...
    method mamba_states_cache (line 1031) | def mamba_states_cache(
    method _allocate_mamba_cache (line 1051) | def _allocate_mamba_cache(self, mamba_gb: float) -> None:
    method apply_fused_qk_rotary_emb (line 1093) | def apply_fused_qk_rotary_emb(
    method apply_rotary_emb_query (line 1125) | def apply_rotary_emb_query(
    method apply_rotary_emb_key (line 1159) | def apply_rotary_emb_key(
    method reset_attention_state (line 1198) | def reset_attention_state(self) -> None:
    method reset_mamba_state (line 1210) | def reset_mamba_state(self) -> None:
    method add_dummy_requests_parallel (line 1215) | def add_dummy_requests_parallel(
    method add_dummy_requests_for_cudagraph_capture (line 1354) | def add_dummy_requests_for_cudagraph_capture(
    method num_decode_requests (line 1414) | def num_decode_requests(self) -> int:
    method add_dummy_requests_for_expert_parallel_step (line 1420) | def add_dummy_requests_for_expert_parallel_step(self) -> None:
    method initialize_attention_state (line 1476) | def initialize_attention_state(
    method reset_tensors (line 1648) | def reset_tensors(self) -> None:
    method reset_metadata (line 1674) | def reset_metadata(self) -> None:
    method reset (line 1717) | def reset(self) -> None:
    method current_input_and_position_ids (line 1736) | def current_input_and_position_ids(
    method last_token_logits (line 1758) | def last_token_logits(self, logits: Tensor) -> Tensor:
    method _compute_prefix_match (line 1781) | def _compute_prefix_match(
    method check_availability (line 1860) | def check_availability(self, req: DynamicInferenceRequest) -> Tuple[bo...
    method _find_kv_match_count (line 1880) | def _find_kv_match_count(
    method add_request (line 1927) | def add_request(
    method _move_book_keeping_tensors (line 2135) | def _move_book_keeping_tensors(
    method _swap_book_keeping_tensors (line 2164) | def _swap_book_keeping_tensors(
    method get_index_of_chunked_prefill_request (line 2194) | def get_index_of_chunked_prefill_request(self, safe: bool = True) -> int:
    method is_chunked_prefill_enabled (line 2216) | def is_chunked_prefill_enabled(self) -> bool:
    method release_memory_blocks_from_request_indexes (line 2222) | def release_memory_blocks_from_request_indexes(self, request_indexes) ...
    method resume_paused_requests (line 2254) | def resume_paused_requests(
    method evict_overflow_paused_requests (line 2338) | def evict_overflow_paused_requests(
    method update_requests (line 2453) | def update_requests(
    method calculate_log_probs (line 2902) | def calculate_log_probs(
    method get_kvcache_utilization_stats (line 2974) | def get_kvcache_utilization_stats(self) -> dict:

FILE: megatron/core/inference/contexts/fused_kv_append_kernel.py
  function _append_kv_cache_kernel (line 22) | def _append_kv_cache_kernel(
  function triton_append_key_value_cache (line 93) | def triton_append_key_value_cache(

FILE: megatron/core/inference/contexts/kv_block_allocator.py
  class KVBlockAllocator (line 12) | class KVBlockAllocator:
    method __init__ (line 27) | def __init__(
    method __str__ (line 76) | def __str__(self):
    method get_total_used (line 83) | def get_total_used(self):
    method get_active_used (line 87) | def get_active_used(self):
    method get_paused_used (line 107) | def get_paused_used(self):
    method get_active_avail (line 123) | def get_active_avail(self):
    method get_paused_avail (line 127) | def get_paused_avail(self):
    method is_memory_available (line 131) | def is_memory_available(self, num_blocks: int) -> bool:
    method allocate_memory_blocks (line 153) | def allocate_memory_blocks(self, num_blocks: int) -> Optional[Tensor]:
    method release_memory_blocks (line 188) | def release_memory_blocks(self, blocks: Tensor) -> None:
    method reset (line 227) | def reset(self) -> None:
    method register_kv_block_hashes (line 262) | def register_kv_block_hashes(self, block_ids: list[int], block_hashes:...
    method _deregister_blocks (line 276) | def _deregister_blocks(self, block_ids: Tensor) -> None:
    method update_timestamps (line 313) | def update_timestamps(self, block_ids: Tensor) -> None:
    method get_evictable_block_count (line 326) | def get_evictable_block_count(self) -> Tensor:
    method evict_lru_blocks (line 335) | def evict_lru_blocks(self, num_blocks_needed: int) -> bool:

FILE: megatron/core/inference/contexts/mamba_slot_allocator.py
  class MambaSlotAllocator (line 14) | class MambaSlotAllocator:
    method __init__ (line 31) | def __init__(
    method allocate_slot (line 79) | def allocate_slot(self, block_id: int) -> int:
    method _evict_lru_slot (line 104) | def _evict_lru_slot(self) -> int:
    method get_slot (line 138) | def get_slot(self, block_id: int) -> int:
    method has_state (line 149) | def has_state(self, block_id: int) -> bool:
    method invalidate_block (line 153) | def invalidate_block(self, block_id: int) -> None:
    method store_from_tensors (line 174) | def store_from_tensors(
    method store_from_live (line 190) | def store_from_live(self, block_id: int, request_idx: int) -> None:
    method restore_to_live (line 206) | def restore_to_live(self, request_idx: int, block_id: int) -> bool:
    method register_block_hash (line 228) | def register_block_hash(self, block_id: int, block_hash: int) -> None:
    method on_kv_blocks_deregistered (line 241) | def on_kv_blocks_deregistered(self, block_ids_list: list, hashes_to_de...
    method compute_and_store_offsets (line 263) | def compute_and_store_offsets(
    method get_intermediate_offsets (line 332) | def get_intermediate_offsets(self) -> Optional[List[List[int]]]:
    method buffer_intermediate_states (line 361) | def buffer_intermediate_states(
    method commit_intermediate_states (line 373) | def commit_intermediate_states(self) -> None:
    method _clear_intermediate_state (line 426) | def _clear_intermediate_state(self) -> None:
    method reset (line 444) | def reset(self) -> None:

FILE: megatron/core/inference/contexts/routing_metadata.py
  class RoutingMetadata (line 13) | class RoutingMetadata:
    method __init__ (line 25) | def __init__(self, context: 'DynamicInferenceContext', moe_router_topk...
    method _ensure_buffer_allocated (line 36) | def _ensure_buffer_allocated(self) -> None:
    method get_routing_indices (line 57) | def get_routing_indices(self) -> Optional[torch.Tensor]:
    method enable_static_buffer_recording (line 83) | def enable_static_buffer_recording(self) -> None:
    method disable_static_buffer_recording (line 94) | def disable_static_buffer_recording(self) -> None:

FILE: megatron/core/inference/contexts/static_context.py
  class StaticInferenceContext (line 8) | class StaticInferenceContext(BaseInferenceContext):
    method __init__ (line 17) | def __init__(
    method swap_key_value_dict (line 29) | def swap_key_value_dict(self, batch_idx):
    method enable_prefill_mode (line 46) | def enable_prefill_mode(self):
    method enable_decode_mode (line 54) | def enable_decode_mode(self):
    method is_decode_only (line 62) | def is_decode_only(self):
    method reset (line 66) | def reset(self):
    method __str__ (line 72) | def __str__(self):
    method __eq__ (line 83) | def __eq__(self, other):
    method is_static_batching (line 121) | def is_static_batching(self):

FILE: megatron/core/inference/data_parallel_inference_coordinator.py
  class DataParallelInferenceCoordinator (line 43) | class DataParallelInferenceCoordinator:
    class CoordinatorState (line 77) | class CoordinatorState(Enum):
    method __init__ (line 85) | def __init__(
    method get_next_data_parallel_rank (line 203) | def get_next_data_parallel_rank(self):
    method _remove_engine (line 217) | def _remove_engine(self, identity):
    method _send_to_engine (line 226) | def _send_to_engine(self, identity, payload):
    method compute_request_hashes (line 241) | def compute_request_hashes(self, prompt):
    method get_best_data_parallel_rank (line 259) | def get_best_data_parallel_rank(self, request_hashes):
    method _update_rank_hashes (line 292) | def _update_rank_hashes(self, rank_identity, request_hashes):
    method start (line 304) | def start(self):
    method detokenize (line 497) | def detokenize(self, finished_request):
    method entrypoint (line 522) | def entrypoint(
    method stop (line 574) | def stop(self):

FILE: megatron/core/inference/engines/abstract_engine.py
  class AbstractEngine (line 6) | class AbstractEngine(ABC):
    method generate (line 9) | def generate(self) -> dict:

FILE: megatron/core/inference/engines/async_zmq_communicator.py
  class AsyncZMQCommunicator (line 20) | class AsyncZMQCommunicator:
    method __init__ (line 29) | def __init__(self, zmq_context: zmq.Context, process_group: dist.Proce...
    method all_reduce_max (line 68) | async def all_reduce_max(self, *local_vals: int, async_op=True) -> int...
    method close (line 127) | def close(self):

FILE: megatron/core/inference/engines/dynamic_engine.py
  class EngineState (line 114) | class EngineState(Enum):
  class EngineSuspendedError (line 129) | class EngineSuspendedError(Exception):
  function format_mem_bytes (line 135) | def format_mem_bytes(mem_bytes):
  class RequestEntry (line 145) | class RequestEntry:
  class DynamicInferenceEngine (line 154) | class DynamicInferenceEngine(AbstractEngine):
    method __init__ (line 185) | def __init__(self, controller: TextGenerationController, context: Dyna...
    method reset (line 266) | def reset(self) -> None:
    method wait_until (line 314) | async def wait_until(self, state: EngineState):
    method create_cuda_graphs (line 326) | def create_cuda_graphs(self, reset_context: bool = True):
    method start_listening_to_data_parallel_coordinator (line 424) | async def start_listening_to_data_parallel_coordinator(
    method suspend_resume_ctx (line 627) | def suspend_resume_ctx(key: str, *, unified_memory_level: int) -> None:
    method suspend (line 688) | def suspend(self):
    method resume (line 737) | def resume(self):
    method _notify_cond_for_new_request (line 800) | async def _notify_cond_for_new_request(self):
    method _handle_failed_request (line 805) | def _handle_failed_request(self, request_id: int):
    method has_unfinished_requests (line 846) | def has_unfinished_requests(self) -> bool:
    method get_request (line 850) | def get_request(self, request_id: int) -> DynamicInferenceRequest:
    method _add_request (line 861) | def _add_request(
    method add_request (line 950) | def add_request(
    method post_process_requests (line 1009) | def post_process_requests(
    method _get_and_clear_stop_word_finished_ids (line 1299) | def _get_and_clear_stop_word_finished_ids(self, active_request_ids: li...
    method _check_stop_words_for_request_post_append (line 1322) | def _check_stop_words_for_request_post_append(
    method get_prefix_coordination_metrics (line 1370) | def get_prefix_coordination_metrics(self) -> dict:
    method _find_mamba_match_count (line 1378) | def _find_mamba_match_count(self, req: DynamicInferenceRequest) -> int:
    method schedule_waiting_requests (line 1394) | def schedule_waiting_requests(self):
    method schedule_non_chunked_prefill (line 1411) | def schedule_non_chunked_prefill(self):
    method schedule_chunked_prefill (line 1466) | def schedule_chunked_prefill(self):
    method async_forward (line 1594) | async def async_forward(self) -> Tuple[Dict, Dict, float]:
    method async_bookkeep (line 1666) | async def async_bookkeep(
    method async_step (line 1898) | async def async_step(
    method _run_coroutine_sync (line 1917) | def _run_coroutine_sync(self, coro):
    method step_modern (line 1935) | def step_modern(
    method step_legacy (line 1941) | def step_legacy(
    method generate (line 1959) | def generate(
    method schedule_requests (line 1978) | def schedule_requests(self) -> int:
    method shutdown (line 2123) | async def shutdown(self):
    method run_engine (line 2157) | async def run_engine(self, *, loop: Optional[asyncio.AbstractEventLoop...
    method _ep_establish_consensus (line 2178) | async def _ep_establish_consensus(
    method _world_barrier (line 2227) | async def _world_barrier(self):
    method run_engine_with_coordinator (line 2244) | async def run_engine_with_coordinator(

FILE: megatron/core/inference/engines/static_engine.py
  class StaticInferenceEngine (line 35) | class StaticInferenceEngine(AbstractEngine):
    method __init__ (line 50) | def __init__(
    method get_new_request_id (line 132) | def get_new_request_id(self) -> str:
    method add_request (line 136) | def add_request(
    method get_stream_generator (line 192) | def get_stream_generator(
    method generate_using_dynamic_engine (line 202) | def generate_using_dynamic_engine(
    method generate_using_legacy_static_engine (line 250) | def generate_using_legacy_static_engine(
    method generate (line 305) | def generate(
    method run_engine (line 351) | def run_engine(self):
    method _wrapped_run_engine (line 389) | def _wrapped_run_engine(self, cuda_device):
    method run_engine_async (line 399) | async def run_engine_async(self, loop: Optional[asyncio.AbstractEventL...

FILE: megatron/core/inference/headers.py
  class Headers (line 6) | class Headers(Enum):
  class UnknownHeaderError (line 25) | class UnknownHeaderError(Exception):
    method __init__ (line 28) | def __init__(self, header):

FILE: megatron/core/inference/inference_client.py
  class InferenceClient (line 29) | class InferenceClient:
    method __init__ (line 54) | def __init__(self, inference_coordinator_address: str, deserialize: bo...
    method add_request (line 87) | def add_request(
    method _recv_task (line 119) | async def _recv_task(self):
    method _connect_with_inference_coordinator (line 154) | def _connect_with_inference_coordinator(self):
    method start (line 166) | def start(self, loop: Optional[asyncio.AbstractEventLoop] = None):
    method _send_signal_to_engines (line 179) | def _send_signal_to_engines(self, signal, *args):
    method pause_engines (line 191) | def pause_engines(self):
    method unpause_engines (line 200) | def unpause_engines(self) -> None:
    method set_generation_epoch (line 204) | def set_generation_epoch(self, generation_epoch: int):
    method suspend_engines (line 212) | def suspend_engines(self):
    method resume_engines (line 219) | def resume_engines(self):
    method stop_engines (line 226) | def stop_engines(self):
    method shutdown_coordinator (line 234) | def shutdown_coordinator(self):
    method stop (line 241) | def stop(self):

FILE: megatron/core/inference/inference_request.py
  function serialize_tensor (line 18) | def serialize_tensor(tensor: torch.Tensor) -> List:
  function deserialize_tensor (line 36) | def deserialize_tensor(tensor_as_list: List) -> torch.Tensor:
  function unwrap_serialized_tensors (line 49) | def unwrap_serialized_tensors(serialized_request: dict) -> dict:
  class Status (line 65) | class Status(Enum):
  function compute_block_hashes_batched (line 88) | def compute_block_hashes_batched(prompt_tokens: torch.Tensor, block_size...
  class InferenceRequest (line 131) | class InferenceRequest:
    method __post_init__ (line 158) | def __post_init__(self):
    method serialize (line 166) | def serialize(self) -> dict:
    method deserialize (line 191) | def deserialize(cls, obj: dict) -> "InferenceRequest":
    method _post_deserialize (line 206) | def _post_deserialize(self, obj: dict):
  class DynamicInferenceEventType (line 230) | class DynamicInferenceEventType(Enum):
  class DynamicInferenceEvent (line 245) | class DynamicInferenceEvent:
    method __post_init__ (line 263) | def __post_init__(self):
    method __str__ (line 287) | def __str__(self):
    method serialize (line 296) | def serialize(self) -> dict:
    method deserialize (line 322) | def deserialize(cls, obj: dict) -> "DynamicInferenceEvent":
  class DynamicInferenceRequest (line 349) | class DynamicInferenceRequest(InferenceRequest):
    method __post_init__ (line 377) | def __post_init__(self):
    method _compute_block_hashes (line 391) | def _compute_block_hashes(self) -> None:
    method remaining_prompt_length (line 403) | def remaining_prompt_length(self):
    method __str__ (line 414) | def __str__(self):
    method serialize (line 425) | def serialize(self):
    method _post_deserialize (line 450) | def _post_deserialize(self, obj):
    method tracked_metadata (line 455) | def tracked_metadata(self) -> List[Any]:
    method get_metadata_types (line 475) | def get_metadata_types() -> List[Tuple[str, torch.dtype, bool]]:
    method add_event (line 494) | def add_event(
    method add_event_add_engine (line 502) | def add_event_add_engine(self):
    method add_event_add_context (line 507) | def add_event_add_context(self):
    method add_event_generated_token (line 511) | def add_event_generated_token(
    method add_event_pause (line 547) | def add_event_pause(self):
    method add_event_evict (line 551) | def add_event_evict(self):
    method add_event_finish (line 555) | def add_event_finish(self):
    method add_event_fail (line 559) | def add_event_fail(self):
    method add_event_error_transient (line 563) | def add_event_error_transient(self, error: Exception):
    method add_event_error_nontransient (line 567) | def add_event_error_nontransient(self, error: Exception):
    method succeeded (line 571) | def succeeded(self) -> bool:
    method failed (line 575) | def failed(self) -> bool:
  class DynamicInferenceRequestRecord (line 581) | class DynamicInferenceRequestRecord:
    method from_request (line 589) | def from_request(cls, request: DynamicInferenceRequest) -> "DynamicInf...
    method __getitem__ (line 602) | def __getitem__(self, idx: int) -> DynamicInferenceRequest:
    method request_id (line 614) | def request_id(self) -> int:
    method checkpoint (line 622) | def checkpoint(self, tokenizer: MegatronTokenizer | None = None):
    method merge (line 679) | def merge(self, tokenizer: MegatronTokenizer | None = None) -> Dynamic...
    method serialize (line 737) | def serialize(self) -> dict:
    method deserialize (line 751) | def deserialize(cls, obj: dict) -> "DynamicInferenceRequestRecord":
  class VLMInferenceRequest (line 766) | class VLMInferenceRequest(InferenceRequest):

FILE: megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py
  class AbstractModelInferenceWrapper (line 23) | class AbstractModelInferenceWrapper(abc.ABC):
    method __init__ (line 39) | def __init__(
    method prep_model_for_inference (line 70) | def prep_model_for_inference(self):
    method prep_inference_input (line 87) | def prep_inference_input(self, prompt_tokens) -> Dict[str, Any]:
    method get_batch_for_context_window (line 99) | def get_batch_for_context_window(self, *args, **kwargs) -> Dict[str, A...
    method _forward (line 109) | def _forward(self, inference_input):
    method dummy_forward (line 130) | def dummy_forward(self):
    method _get_batch_size_and_seq_len (line 157) | def _get_batch_size_and_seq_len(
    method _allocate_recv_buffer (line 176) | def _allocate_recv_buffer(self, batch_size, seq_len):
    method forward_pass_without_pipeline_parallel (line 188) | def forward_pass_without_pipeline_parallel(
    method forward_pass_with_pipeline_parallel (line 209) | def forward_pass_with_pipeline_parallel(
    method run_one_forward_step (line 256) | def run_one_forward_step(

FILE: megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py
  class GPTInferenceWrapper (line 19) | class GPTInferenceWrapper(AbstractModelInferenceWrapper):
    method __init__ (line 31) | def __init__(self, model: GPTModel, inference_context: Optional[BaseIn...
    method prep_inference_input (line 34) | def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[s...
    method _build_attention_mask_and_position_ids (line 54) | def _build_attention_mask_and_position_ids(
    method get_batch_for_context_window (line 91) | def get_batch_for_context_window(

FILE: megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py
  class VLMInferenceWrapper (line 18) | class VLMInferenceWrapper(GPTInferenceWrapper):
    method prep_model_for_inference (line 21) | def prep_model_for_inference(self, prompts_tokens: Optional[torch.Tens...
    method prep_inference_input (line 55) | def prep_inference_input(
    method get_batch_for_context_window (line 89) | def get_batch_for_context_window(
    method _forward (line 126) | def _forward(self, inference_input: Dict[str, Any]):
    method run_one_forward_step (line 155) | def run_one_forward_step(self, inference_input: Dict[str, Any]) -> tor...

FILE: megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py
  class T5InferenceWrapper (line 19) | class T5InferenceWrapper(AbstractModelInferenceWrapper):
    method __init__ (line 33) | def __init__(
    method prep_inference_input (line 42) | def prep_inference_input(
    method tokenize_encoder_prompt (line 93) | def tokenize_encoder_prompt(self, encoder_prompt: str, tokenizer) -> t...
    method pad_encoder_prompts_tokens (line 121) | def pad_encoder_prompts_tokens(
    method get_batch_for_context_window (line 144) | def get_batch_for_context_window(
    method forward_pass_without_pipeline_parallel (line 192) | def forward_pass_without_pipeline_parallel(

FILE: megatron/core/inference/moe/__init__.py
  class InferenceGroupedGemmBackend (line 10) | class InferenceGroupedGemmBackend(enum.Enum):
  function resolve_inference_grouped_gemm_backend (line 18) | def resolve_inference_grouped_gemm_backend(

FILE: megatron/core/inference/moe/activations.py
  function _ceil_div (line 28) | def _ceil_div(a, b):
  function _squared_relu_kernel (line 33) | def _squared_relu_kernel(input_ptr, output_ptr, src_idx_ptr, M, N, BLOCK...
  function padded_squared_relu (line 46) | def padded_squared_relu(x: torch.Tensor, permutation_map: torch.Tensor) ...
  function _squared_relu_quantize_kernel (line 56) | def _squared_relu_quantize_kernel(
  function squared_relu_and_quantize_mxfp8 (line 121) | def squared_relu_and_quantize_mxfp8(

FILE: megatron/core/inference/moe/fused_moe.py
  class ActivationType (line 40) | class ActivationType(Enum):
  function _bf16_grouped_mm (line 46) | def _bf16_grouped_mm(
  function _mxfp8_grouped_mm (line 54) | def _mxfp8_grouped_mm(act: MXFP8Tensor, weight: MXFP8Tensor, offs: torch...
  function _get_activation_func (line 70) | def _get_activation_func(activation_type: ActivationType, fused_quant: b...
  function mcore_fused_moe (line 81) | def mcore_fused_moe(

FILE: megatron/core/inference/moe/pad.py
  function _pad_tokens_kernel (line 37) | def _pad_tokens_kernel(
  function pad_to_alignment (line 92) | def pad_to_alignment(
  function _unpad_tokens_kernel (line 140) | def _unpad_tokens_kernel(
  function unpad_from_alignment (line 168) | def unpad_from_alignment(

FILE: megatron/core/inference/moe/permute.py
  function _ceil_div (line 31) | def _ceil_div(a, b):
  function _count_local_tokens_kernel (line 36) | def _count_local_tokens_kernel(
  function compute_local_tokens_per_expert (line 60) | def compute_local_tokens_per_expert(
  function _prefix_sum_kernel (line 79) | def _prefix_sum_kernel(
  function compute_expert_offsets (line 104) | def compute_expert_offsets(tokens_per_expert: torch.Tensor, alignment: i...
  function _permute_tokens_kernel (line 121) | def _permute_tokens_kernel(
  function permute_tokens (line 170) | def permute_tokens(
  function _unpermute_tokens_kernel (line 243) | def _unpermute_tokens_kernel(
  function unpermute_tokens (line 271) | def unpermute_tokens(
  function _permute_quantize_mxfp8_kernel (line 295) | def _permute_quantize_mxfp8_kernel(
  function permute_and_quantize_mxfp8 (line 375) | def permute_and_quantize_mxfp8(

FILE: megatron/core/inference/quantization/mxfp8_quantize.py
  function _ceil_div (line 33) | def _ceil_div(a, b):
  function _mxfp8_quant_swizzle_kernel (line 38) | def _mxfp8_quant_swizzle_kernel(
  function mxfp8_quantize (line 160) | def mxfp8_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:

FILE: megatron/core/inference/quantization/mxfp8_tensor.py
  function _ceil_div (line 20) | def _ceil_div(a, b):
  class MXFP8Tensor (line 25) | class MXFP8Tensor:
    method size (line 32) | def size(self, idx: Optional[int] = None):
    method scale_2d (line 36) | def scale_2d(self, K: Optional[int] = None) -> torch.Tensor:
    method from_bf16 (line 51) | def from_bf16(cls, x: torch.Tensor, group_size: int = 32, backend: str...

FILE: megatron/core/inference/quantization/utils.py
  function _verify_te_to_mcore_mxfp8_conversion (line 32) | def _verify_te_to_mcore_mxfp8_conversion(te_dequantized, fi_quantized: M...
  function quantize_model_to_mxfp8 (line 53) | def quantize_model_to_mxfp8(model: torch.nn.Module, backend: str = "flas...
  function _should_quantize_param (line 97) | def _should_quantize_param(val: torch.Tensor) -> bool:
  function _to_bf16 (line 114) | def _to_bf16(val: torch.Tensor) -> torch.Tensor:
  function collect_mxfp8_param_metadata (line 123) | def collect_mxfp8_param_metadata(
  function quantize_params_to_mxfp8 (line 142) | def quantize_params_to_mxfp8(
  function _mm_mxfp8_flashinfer (line 215) | def _mm_mxfp8_flashinfer(x_mxfp8: MXFP8Tensor, weight: MXFP8Tensor, out=...
  function _mm_mxfp8_torch (line 222) | def _mm_mxfp8_torch(x_mxfp8: MXFP8Tensor, weight: MXFP8Tensor, out=None):
  function mm_mxfp8 (line 241) | def mm_mxfp8(x: torch.Tensor, weight: MXFP8Tensor, out: torch.Tensor = N...

FILE: megatron/core/inference/sampling_params.py
  class SamplingParams (line 9) | class SamplingParams:
    method __post_init__ (line 38) | def __post_init__(self):
    method _sync_prompt_logprobs_fields (line 46) | def _sync_prompt_logprobs_fields(self):
    method add_attributes (line 62) | def add_attributes(self, attribute_value_pair: dict):
    method serialize (line 79) | def serialize(self) -> dict:
    method deserialize (line 84) | def deserialize(cls, data: dict) -> "SamplingParams":

FILE: megatron/core/inference/scheduler.py
  class Scheduler (line 17) | class Scheduler:
    method __init__ (line 28) | def __init__(self, max_batch_size):
    method get_new_request_id (line 37) | def get_new_request_id(self) -> int:
    method add_request (line 42) | def add_request(
    method num_requests_pending (line 124) | def num_requests_pending(self) -> int:
    method have_requests_pending (line 131) | def have_requests_pending(self) -> bool:
    method add_earliest_waiting_request_to_active_pool (line 138) | def add_earliest_waiting_request_to_active_pool(self):
    method update_requests_pools (line 154) | def update_requests_pools(
    method abort_request (line 184) | def abort_request(

FILE: megatron/core/inference/symmetric_memory.py
  class SymmetricMemoryBuffer (line 34) | class SymmetricMemoryBuffer:
    method __init__ (line 41) | def __init__(self, size_in_mb, process_group):
    method _can_allocate (line 60) | def _can_allocate(self, numel, dtype) -> bool:
    method _allocate (line 71) | def _allocate(self, numel, dtype) -> torch.Tensor:
    method maybe_get_tensors (line 77) | def maybe_get_tensors(self, tensor_specs, alignment=16):
    method maybe_get_tensor (line 115) | def maybe_get_tensor(self, tensor_shape, dtype):
  class SymmetricMemoryManager (line 131) | class SymmetricMemoryManager:
    method get_buffer (line 144) | def get_buffer(
    method destroy (line 168) | def destroy(cls, key: Optional[str] = None) -> None:
    method is_initialized (line 180) | def is_initialized(cls, key: str) -> bool:

FILE: megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py
  class EncoderDecoderTextGenerationController (line 13) | class EncoderDecoderTextGenerationController(TextGenerationController):
    method prep_inference_input (line 21) | def prep_inference_input(

FILE: megatron/core/inference/text_generation_controllers/text_generation_controller.py
  class TextGenerationController (line 49) | class TextGenerationController:
    method __init__ (line 60) | def __init__(self, inference_wrapped_model: AbstractModelInferenceWrap...
    method _get_mtp_num_heads (line 90) | def _get_mtp_num_heads(self) -> int:
    method set_stop_word_finished_ids_callback (line 97) | def set_stop_word_finished_ids_callback(self, callback):
    method _init_dynamic_sampling_tensors (line 108) | def _init_dynamic_sampling_tensors(self):
    method _init_mtp_sampling_tensor (line 144) | def _init_mtp_sampling_tensor(self):
    method tokenize_prompt (line 161) | def tokenize_prompt(tokenizer, prompt: str, add_BOS: bool = False) -> ...
    method detokenize (line 187) | def detokenize(
    method detokenize_generations (line 219) | def detokenize_generations(
    method _torch_sampling_func (line 272) | def _torch_sampling_func(
    method sample_from_logits (line 361) | def sample_from_logits(
    method update_generation_status (line 449) | def update_generation_status(
    method pad_input_prompt_tokens (line 495) | def pad_input_prompt_tokens(
    method unpad_input_prompt_tokens (line 531) | def unpad_input_prompt_tokens(
    method _dynamic_step_context_init (line 542) | def _dynamic_step_context_init(
    method _dynamic_step_forward_logits (line 618) | def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids...
    method _dynamic_step_sample_bookkeeping (line 660) | def _dynamic_step_sample_bookkeeping(self):
    method _rewind_kv_cache (line 684) | def _rewind_kv_cache(self):
    method _sample_from_logits_2d (line 793) | def _sample_from_logits_2d(self, logits_2d: Tensor) -> Tensor:
    method _compute_serial_mtp_and_sample (line 818) | def _compute_serial_mtp_and_sample(self):
    method _get_required_logit_indices (line 893) | def _get_required_logit_indices(
    method _sample_speculative_logits (line 934) | def _sample_speculative_logits(
    method _verify_speculative_tokens (line 991) | def _verify_speculative_tokens(
    method _dynamic_step_sample_logits_and_verify_tokens (line 1075) | def _dynamic_step_sample_logits_and_verify_tokens(self, logits: Tensor...
    method _dynamic_step_sample_logits (line 1156) | def _dynamic_step_sample_logits(self, logits: Tensor):
    method _dynamic_step_log_probs_bookkeeping (line 1197) | def _dynamic_step_log_probs_bookkeeping(self) -> Tuple[bool, bool]:
    method _router_record_bookkeeping (line 1211) | def _router_record_bookkeeping(self) -> Optional[Dict[int, Tensor]]:
    method _dynamic_step_calculate_log_probs (line 1271) | def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optiona...
    method _dynamic_step_calculate_log_probs_speculative (line 1282) | def _dynamic_step_calculate_log_probs_speculative(
    method _dynamic_step_calculate_top_n_logprobs_speculative (line 1380) | def _dynamic_step_calculate_top_n_logprobs_speculative(
    method _dynamic_step_calculate_top_n_logprobs (line 1477) | def _dynamic_step_calculate_top_n_logprobs(
    method dummy_forward (line 1561) | def dummy_forward(self):
    method _dummy_serial_mtp_forward (line 1609) | def _dummy_serial_mtp_forward(self):
    method _dynamic_step_context_bookkeeping (line 1666) | def _dynamic_step_context_bookkeeping(self) -> Dict[str, Tensor]:
    method async_generate_output_tokens_dynamic_batch (line 1739) | async def async_generate_output_tokens_dynamic_batch(
    method generate_output_tokens_dynamic_batch (line 1860) | def generate_output_tokens_dynamic_batch(
    method _update_top_n_logprobs_dict (line 1867) | def _update_top_n_logprobs_dict(
    method generate_all_output_tokens_static_batch (line 1899) | def generate_all_output_tokens_static_batch(
    method prep_inference_input (line 2367) | def prep_inference_input(
    method stream_tokens (line 2393) | def stream_tokens(

FILE: megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py
  class VLMTextGenerationController (line 13) | class VLMTextGenerationController(TextGenerationController):
    method prep_inference_input (line 16) | def prep_inference_input(

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/chat_completions.py
  function _get_field (line 20) | def _get_field(obj, key, default=None):
  function _normalize_tool_calls (line 27) | def _normalize_tool_calls(tool_calls):
  function _coerce_arguments_mapping (line 51) | def _coerce_arguments_mapping(arguments):
  function _sanitize_messages_for_template (line 72) | def _sanitize_messages_for_template(messages):
  function _sanitize_tools_for_template (line 116) | def _sanitize_tools_for_template(tools):
  function _reconstruct_reasoning_content (line 143) | def _reconstruct_reasoning_content(messages: list[dict]) -> list[dict]:
  function _replace_prefix_tokens (line 159) | def _replace_prefix_tokens(
  function apply_parsers (line 199) | def apply_parsers(message_text, tools, parsers_list, tools_requested):
  function chat_completions (line 225) | async def chat_completions():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/common.py
  function send_do_generate (line 11) | def send_do_generate():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/completions.py
  function completions (line 20) | async def completions():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/health.py
  function health (line 14) | async def health():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/text_generation_server.py
  function temp_log_level (line 31) | def temp_log_level(level, logger=None):
  function _run_text_gen_server (line 43) | async def _run_text_gen_server(
  function _server_process_worker (line 112) | def _server_process_worker(
  function start_text_gen_server (line 141) | def start_text_gen_server(
  function stop_text_gen_server (line 184) | def stop_text_gen_server():

FILE: megatron/core/inference/text_generation_server/dynamic_text_gen_server/tokenization.py
  function tokenize_prompts (line 12) | def tokenize_prompts(
  function _tokenize_prompts_and_batch (line 70) | def _tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, ...

FILE: megatron/core/inference/text_generation_server/endpoints/common.py
  function send_do_generate (line 11) | def send_do_generate():

FILE: megatron/core/inference/text_generation_server/endpoints/completions.py
  function detokenize (line 24) | def detokenize(prompt, tok) -> list[str]:
  class MegatronCompletions (line 46) | class MegatronCompletions(Resource):
    method __init__ (line 49) | def __init__(self, engine, args):
    method post (line 53) | def post(self):

FILE: megatron/core/inference/text_generation_server/run_mcore_engine.py
  function run_mcore_engine (line 12) | def run_mcore_engine(

FILE: megatron/core/inference/text_generation_server/text_generation_server.py
  class MegatronGenerate (line 27) | class

Copy disabled (too large) Download .json

Condensed preview — 2310 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (16,954K chars).

[
  {
    "path": ".coderabbit.yaml",
    "chars": 1147,
    "preview": "# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json\nlanguage: \"en-US\"\n\n# Only comment on C"
  },
  {
    "path": ".flake8",
    "chars": 107,
    "preview": "[flake8]\nmax-line-length = 100\nextend-ignore = E203,E501,F401,E402,E714\nper-file-ignores = __init__.py:F401"
  },
  {
    "path": ".github/CODEOWNERS",
    "chars": 2701,
    "preview": "megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo\n\nmegatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDI"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.md",
    "chars": 727,
    "preview": "---\nname: Bug report\nabout: Create a report to help us improve the repository or project\ntitle: \"\"\nlabels: bug\nassignees"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "chars": 29,
    "preview": "blank_issues_enabled: false\n\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.md",
    "chars": 722,
    "preview": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: \"\"\nlabels: enhancement\nassignees: ''\n\n---\n\n**Is"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/question.md",
    "chars": 351,
    "preview": "---\nname: QUESTION\nabout: Ask a question about Megatron-LM that is not a bug, regression or enhancement\n  request\ntitle:"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/regression.md",
    "chars": 1151,
    "preview": "---\nname: REGRESSION\nabout: Report a regression in speed or accuracy due to a Megatron-LM update\ntitle: \"[REGRESSION]\"\nl"
  },
  {
    "path": ".github/actions/action.yml",
    "chars": 9171,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": ".github/actions/check-nvidia-sso-membership/action.yml",
    "chars": 5333,
    "preview": "name: 'Check NVIDIA SSO Membership'\ndescription: 'Check if a GitHub username exists in the NVIDIA SSO users list from gi"
  },
  {
    "path": ".github/copy-pr-bot.yaml",
    "chars": 1489,
    "preview": "enabled: true\nauto_sync_draft: false\nauto_sync_ready: true\ntrustees_override: [\"AAnoosheh\", \"ArEsKay3\", \"Autumn1998\", \"B"
  },
  {
    "path": ".github/oncall_schedule.json",
    "chars": 852,
    "preview": "[\n    {\n        \"user\": \"dimapihtar\",\n        \"date\": \"2026-03-18\"\n    },\n    {\n        \"user\": \"janEbert\",\n        \"dat"
  },
  {
    "path": ".github/pull_request_template.md",
    "chars": 2205,
    "preview": "# What does this PR do ?\n<!-- Add a one line overview of what this PR aims to accomplish. -->\n\n:warning: For major chang"
  },
  {
    "path": ".github/scripts/oncall_manager.py",
    "chars": 16728,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/scripts/readme.sh",
    "chars": 3630,
    "preview": "#!/bin/bash\n\ncat << 'EOF'\n╔══════════════════════════════════════════════════════════════════════╗\n║                    "
  },
  {
    "path": ".github/scripts/sync_team_usergroups.py",
    "chars": 19163,
    "preview": "# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/_build_test_publish_wheel.yml",
    "chars": 6337,
    "preview": "on:\n  workflow_call:\n    inputs:\n      ref:\n        required: false\n        description: Ref (SHA or branch) to release\n"
  },
  {
    "path": ".github/workflows/_release_library.yml",
    "chars": 18528,
    "preview": "# Copyright (c) 2020-2021, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you"
  },
  {
    "path": ".github/workflows/_update_dependencies.yml",
    "chars": 7901,
    "preview": "name: ~Update dependencies template\non:\n  workflow_call:\n    inputs:\n      target-branch:\n        required: true\n       "
  },
  {
    "path": ".github/workflows/auto-assign-milestone.yml",
    "chars": 2781,
    "preview": "name: Auto-assign Milestone to PR\n\non:\n  push:\n    branches:\n      - \"pull-request/[0-9]+\"\n\npermissions:\n  contents: rea"
  },
  {
    "path": ".github/workflows/auto-reminder-bot.yml",
    "chars": 917,
    "preview": "# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n\nname: Auto Reminder Bot\n\non:\n  workflow_disp"
  },
  {
    "path": ".github/workflows/auto-swap-labels.yml",
    "chars": 2252,
    "preview": "# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n\nname: Auto Swap Labels\non:\n  pull_request_ta"
  },
  {
    "path": ".github/workflows/auto-update-copy-pr-bot.yml",
    "chars": 2240,
    "preview": "name: Auto Update Copy PR Bot\n\non:\n  workflow_dispatch:\n  schedule:\n    - cron: \"0 0 * * *\"\n\njobs:\n  auto-update-copy-pr"
  },
  {
    "path": ".github/workflows/build-docs.yml",
    "chars": 2385,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may "
  },
  {
    "path": ".github/workflows/build-test-publish-wheel.yml",
    "chars": 2812,
    "preview": "# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 ("
  },
  {
    "path": ".github/workflows/cherry-pick-release-commit.yml",
    "chars": 1103,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/cicd-approve-test-queue.yml",
    "chars": 11698,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/cicd-main.yml",
    "chars": 40724,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/claude-complexity-label.yml",
    "chars": 2985,
    "preview": "name: Claude Complexity Label\n\non:\n  pull_request_target:\n    types: [ready_for_review]\n\njobs:\n  label-complexity:\n    n"
  },
  {
    "path": ".github/workflows/claude_review.yml",
    "chars": 2368,
    "preview": "name: Claude Code Review\n\non:\n  issue_comment:\n    types: [created]\n\njobs:\n  review-on-comment:\n    name: Claude Review "
  },
  {
    "path": ".github/workflows/close-inactive-issue-pr.yml",
    "chars": 855,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/community-bot.yml",
    "chars": 1032,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/config/changelog-config.json",
    "chars": 731,
    "preview": "{\n    \"categories\": [],\n    \"ignore_labels\": [\n      \"ignore\"\n    ],\n    \"sort\": \"ASC\",\n    \"template\": \"\\n${{CHANGELOG}"
  },
  {
    "path": ".github/workflows/copyright-check.yml",
    "chars": 2686,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may "
  },
  {
    "path": ".github/workflows/dependabot.yml",
    "chars": 2028,
    "preview": "name: Dependabot\non:\n  schedule:\n    - cron: \"0 8 * * 1\"\n  workflow_dispatch: # Allow manual triggering\n\npermissions:\n  "
  },
  {
    "path": ".github/workflows/force-draft-pr.yml",
    "chars": 1253,
    "preview": "# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n\nname: Force Draft PR\n\non:\n  pull_request_tar"
  },
  {
    "path": ".github/workflows/install-test.yml",
    "chars": 6275,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": ".github/workflows/multi-approval-bot.yml",
    "chars": 2706,
    "preview": "name: \"Codeowners Approval Workflow\"\n\non:\n  push:\n    branches:\n      - \"pull-request/[0-9]+\"\n  merge_group:\n    types: "
  },
  {
    "path": ".github/workflows/oncall-assign.yml",
    "chars": 1339,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/oncall-rotation.yml",
    "chars": 2008,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/release-docs.yml",
    "chars": 4370,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may "
  },
  {
    "path": ".github/workflows/release-freeze.yml",
    "chars": 1522,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/release-nightly-docs.yml",
    "chars": 890,
    "preview": "# Copyright (c) 2026, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may "
  },
  {
    "path": ".github/workflows/release.yaml",
    "chars": 3077,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/review-trigger.yml",
    "chars": 811,
    "preview": "# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Lightweight workflow that triggers on rev"
  },
  {
    "path": ".github/workflows/sync-team-usergroups.yml",
    "chars": 1393,
    "preview": "# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": ".github/workflows/trigger-mbridge-tests.yml",
    "chars": 1237,
    "preview": "# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n# SPDX-License-Identifier: Apache-2.0\n\nname: "
  },
  {
    "path": ".gitignore",
    "chars": 205,
    "preview": "__pycache__\n*.so\nbuild\n.coverage_*\n*.egg-info\n*~\nslurm*\nlogs\n.vscode\nlocal/\n.gitmodules\nwandb/\nonelogger.log\nonelogger.e"
  },
  {
    "path": ".gitlab/labeler-config.yml",
    "chars": 475,
    "preview": "CI:\n  - .gitlab-ci.yml\n  - Dockerfile.ci.lts\n  - Dockerfile.ci.dev\n  - .github/**\n  - .gitlab/**\n\nDatasets:\n  - megatron"
  },
  {
    "path": ".gitlab/scripts/build.sh",
    "chars": 2763,
    "preview": "#! /bin/bash\n\nset -x\nenv\neval \"IMAGE=\\$$IMAGE\"\n\n# Start a named container in detached mode\ndocker run -d --name download"
  },
  {
    "path": ".gitlab/scripts/check_imports.py",
    "chars": 7180,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may "
  },
  {
    "path": ".gitlab/scripts/fetch-legacy-suite.sh",
    "chars": 1551,
    "preview": "#!/bin/bash\nset -euxo pipefail\n\n# Default values\nMCORE_REPO=\"https://github.com/nvidia/megatron-lm.git\"\nMCORE_MR_COMMIT="
  },
  {
    "path": ".gitlab/stages/00.pre.yml",
    "chars": 12151,
    "preview": "include:\n  - template: Security/Secret-Detection.gitlab-ci.yml\n\n.pre_rules:\n  rules:\n    - if: $CI_MERGE_REQUEST_EVENT_T"
  },
  {
    "path": ".gitlab/stages/01.build.yml",
    "chars": 4103,
    "preview": ".build_rules:\n  rules:\n    - if: $BUILD == \"no\"\n      when: never\n    - when: on_success\n  stage: test\n\n.build_image:\n  "
  },
  {
    "path": ".gitlab/stages/02.test.yml",
    "chars": 9609,
    "preview": ".test_rules:\n  rules:\n    - if: $PUBLISH == \"yes\"\n      when: never\n    - if: $BUILD == \"no\"\n      when: never\n    - whe"
  },
  {
    "path": ".gitlab/stages/03.integration-tests.yml",
    "chars": 5301,
    "preview": ".integration_tests_rules:\n  stage: integration_tests\n  rules:\n    - if: $BUILD == \"no\"\n      when: never\n    - if: $INTE"
  },
  {
    "path": ".gitlab/stages/04.functional-tests.yml",
    "chars": 8749,
    "preview": ".functional_tests_rules:\n  stage: functional_tests\n  rules:\n    - if: $BUILD == \"no\"\n      when: never\n    - if: $FUNCTI"
  },
  {
    "path": ".gitlab/stages/05.publish.yml",
    "chars": 8536,
    "preview": ".publish_common_release:\n  stage: publish\n  rules:\n    - if: $CI_PIPELINE_SOURCE == \"web\" && $PUBLISH == \"yes\" && $PUBLI"
  },
  {
    "path": ".gitlab-ci.yml",
    "chars": 8183,
    "preview": ".merge_train_rule: &merge_train_rule\n  UNIT_TEST: \"yes\"\n  UNIT_TEST_REPEAT: 1\n  UNIT_TEST_TIMEOUT: 30\n  INTEGRATION_TEST"
  },
  {
    "path": ".pre-commit-config.yaml",
    "chars": 444,
    "preview": "repos:\n- repo: https://github.com/psf/black\n  rev: 'refs/tags/24.4.2:refs/tags/24.4.2'\n  hooks:\n  - id: black\n    files:"
  },
  {
    "path": ".pylintrc",
    "chars": 475,
    "preview": "[MAIN]\nignore-paths=tests\nmax-line-length=100\nload-plugins=pylint.extensions.bad_builtin\n[MESSAGES CONTROL]\ndisable=all\n"
  },
  {
    "path": ".python-version",
    "chars": 4,
    "preview": "3.12"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 146,
    "preview": "# Contributing to Megatron\n\nVisit our [contributing page](https://docs.nvidia.com/megatron-core/developer-guide/latest/d"
  },
  {
    "path": "LICENSE",
    "chars": 17640,
    "preview": "The following applies to all files unless otherwise noted:\n\n# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights re"
  },
  {
    "path": "MANIFEST.in",
    "chars": 165,
    "preview": "include megatron/core/requirements.txt\ninclude megatron/core/README.md\ninclude megatron/core/package_info.py\nglobal-excl"
  },
  {
    "path": "README.md",
    "chars": 9752,
    "preview": "<div align=\"center\">\n\nMegatron-LM and Megatron Core\n=============================\n\n<h4>GPU-optimized library for trainin"
  },
  {
    "path": "codecov.yml",
    "chars": 263,
    "preview": "comment: false\ncoverage:\n  status:\n    project: false\n    patch:\n      default:\n        target: 80%\n        threshold: 5"
  },
  {
    "path": "docker/.ngc_version.dev",
    "chars": 32,
    "preview": "nvcr.io/nvidia/pytorch:26.02-py3"
  },
  {
    "path": "docker/.ngc_version.lts",
    "chars": 32,
    "preview": "nvcr.io/nvidia/pytorch:25.09-py3"
  },
  {
    "path": "docker/Dockerfile.ci.dev",
    "chars": 3854,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.\n# syntax=docker/dockerfile:1.3-labs\n\nARG FROM_IMAGE_NAME\n"
  },
  {
    "path": "docker/Dockerfile.ci.nemo",
    "chars": 707,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.\n# syntax=docker/dockerfile:1.3-labs\n\nARG FROM_IMAGE_NAME\n"
  },
  {
    "path": "docker/Dockerfile.linting",
    "chars": 867,
    "preview": "# syntax=docker/dockerfile:experimental\n\nARG FROM_IMAGE_NAME\nFROM $FROM_IMAGE_NAME as main\nENV DEBIAN_FRONTEND=nonintera"
  },
  {
    "path": "docker/common/install.sh",
    "chars": 4558,
    "preview": "#!/bin/bash\nset -xeuo pipefail # Exit immediately if a command exits with a non-zero status\n\n# Parse command line argume"
  },
  {
    "path": "docker/common/install_source_wheels.sh",
    "chars": 1668,
    "preview": "#!/bin/bash\nset -xeuo pipefail # Exit immediately if a command exits with a non-zero status\n\nINPUT_WHEEL_DIR=$(pwd)/whee"
  },
  {
    "path": "docker/patches/deepep.patch",
    "chars": 540,
    "preview": "diff --git a/setup.py b/setup.py\nindex 63ce332..4e13462 100644\n--- a/setup.py\n+++ b/setup.py\n@@ -37,7 +37,7 @@ if __name"
  },
  {
    "path": "docs/add_copyright_header.py",
    "chars": 1125,
    "preview": "#!/usr/bin/env python3\n\"\"\"One-off script to add NVIDIA copyright header to all .md files under docs/.\"\"\"\n\nfrom pathlib i"
  },
  {
    "path": "docs/advanced/index.md",
    "chars": 774,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-backwards-compatibility-check.md",
    "chars": 8882,
    "preview": "---\norphan: true\n---\n\n<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION a"
  },
  {
    "path": "docs/api-guide/core/datasets.md",
    "chars": 523,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/core/dist_checkpointing.md",
    "chars": 5196,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/core/dist_checkpointing.strategies.md",
    "chars": 891,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/core/distributed.md",
    "chars": 941,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/core/fusions.md",
    "chars": 945,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/core/index.md",
    "chars": 667,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/core/pipeline_parallel.md",
    "chars": 888,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/core/tensor_parallel.md",
    "chars": 800,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/core/transformer.md",
    "chars": 916,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/index.md",
    "chars": 586,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/internal/index.md",
    "chars": 575,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/internal/num_microbatches_calculator.md",
    "chars": 582,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/internal/optimizer_param_scheduler.md",
    "chars": 563,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/models/index.md",
    "chars": 585,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/models/models.bert.md",
    "chars": 619,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/models/models.gpt.md",
    "chars": 798,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/models/models.md",
    "chars": 692,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/models/models.t5.md",
    "chars": 465,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/api-guide/router_replay.md",
    "chars": 10757,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/autodoc2_docstrings_parser.py",
    "chars": 1225,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": "docs/broken_links_false_positives.json",
    "chars": 39,
    "preview": "{\n    \"uri\": \"http://localhost:8080/\"\n}"
  },
  {
    "path": "docs/conf.py",
    "chars": 4844,
    "preview": "# Copyright (c) 2025-2026, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 "
  },
  {
    "path": "docs/developer/contribute.md",
    "chars": 3844,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/developer/generate_docs.md",
    "chars": 805,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/developer/oncall.md",
    "chars": 2949,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/developer/submit.md",
    "chars": 1527,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/discussions/README.md",
    "chars": 1410,
    "preview": "---\norphan: true\n---\n\n<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION a"
  },
  {
    "path": "docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_checkpoint_convert.sh",
    "chars": 1677,
    "preview": "#!/bin/bash\n\n# Configuration: Set these paths before running the script\nMEGATRON_PATH=${MEGATRON_PATH:-\"your_own_megatro"
  },
  {
    "path": "docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_mfsdp_deepseek_v3.sh",
    "chars": 6615,
    "preview": "#!/bin/bash\n\nexport NCCL_IB_SL=1\nexport NCCL_IB_TIMEOUT=19\nexport NVTE_FWD_LAYERNORM_SM_MARGIN=16\nexport NVTE_BWD_LAYERN"
  },
  {
    "path": "docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md",
    "chars": 6537,
    "preview": "---\norphan: true\n---\n\n<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION a"
  },
  {
    "path": "docs/documentation.md",
    "chars": 2307,
    "preview": "---\norphan: true\n---\n\n<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION a"
  },
  {
    "path": "docs/get-started/install.md",
    "chars": 4289,
    "preview": "<!---\r\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\r\n   NVIDIA CORPORATION and its licensors ret"
  },
  {
    "path": "docs/get-started/overview.md",
    "chars": 5838,
    "preview": "<!---\r\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\r\n   NVIDIA CORPORATION and its licensors ret"
  },
  {
    "path": "docs/get-started/quickstart.md",
    "chars": 2159,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/get-started/releasenotes.md",
    "chars": 972,
    "preview": "<!---\r\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\r\n   NVIDIA CORPORATION and its licensors ret"
  },
  {
    "path": "docs/index.md",
    "chars": 2621,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/llama_mistral.md",
    "chars": 20750,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/models/index.md",
    "chars": 1170,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/models/llms.md",
    "chars": 2773,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/models/multimodal.md",
    "chars": 3457,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/project.json",
    "chars": 47,
    "preview": "{\"name\": \"megatron-lm\", \"version\": \"nightly\"}\n\n"
  },
  {
    "path": "docs/user-guide/data-preparation.md",
    "chars": 3615,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/context_parallel.md",
    "chars": 4547,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/custom_fsdp.md",
    "chars": 15305,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/dist_optimizer.md",
    "chars": 3143,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/fine_grained_activation_offloading.md",
    "chars": 2244,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/index.md",
    "chars": 727,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/megatron_energon.md",
    "chars": 3813,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/megatron_rl.md",
    "chars": 2204,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/moe.md",
    "chars": 663,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/multi_latent_attention.md",
    "chars": 1210,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/multi_token_prediction.md",
    "chars": 4008,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/optimizer_cpu_offload.md",
    "chars": 544,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/pipeline_parallel_layout.md",
    "chars": 1661,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/features/tokenizers.md",
    "chars": 6547,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/index.md",
    "chars": 658,
    "preview": "---\norphan: true\n---\n\n<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION a"
  },
  {
    "path": "docs/user-guide/msc_integration.md",
    "chars": 501,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/parallelism-guide.md",
    "chars": 7017,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/user-guide/training-examples.md",
    "chars": 4465,
    "preview": "<!---\n   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.\n   NVIDIA CORPORATION and its licensors retai"
  },
  {
    "path": "docs/versions1.json",
    "chars": 456,
    "preview": "[\n    {\n        \"name\": \"nightly\",\n        \"version\": \"nightly\",\n        \"url\": \"https://docs.nvidia.com/megatron-core/d"
  },
  {
    "path": "examples/__init__.py",
    "chars": 1,
    "preview": " "
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/README.md",
    "chars": 5366,
    "preview": "# SGEAT: Detoxify Larger-scale Language Models\n\nThis is the official code base for our NeurIPS 2022 paper:\n\n[Exploring t"
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py",
    "chars": 2572,
    "preview": "import json\nimport time\nfrom typing import Dict, Optional, List\n\nimport joblib\nfrom googleapiclient import discovery\nfro"
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py",
    "chars": 6198,
    "preview": "import json\nimport time\nfrom typing import Dict, Optional, List\n\nimport joblib\nfrom googleapiclient import discovery\nfro"
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh",
    "chars": 284,
    "preview": "VOCAB_FILE=pt2-vocab.json\nMERGE_FILE=gpt2-merges.txt\n\npython3 tools/preprocess_data.py \\\n    --input $1 \\\n    --output-p"
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py",
    "chars": 5332,
    "preview": "# coding=utf-8\n# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n\n\n\"\"\"Fine-tune GPT\"\"\"\n\nimport torch\nfrom "
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh",
    "chars": 1693,
    "preview": "#! /bin/bash\n\n# Change for multinode config\nGPUS_PER_NODE=16\nMASTER_ADDR=localhost\nMASTER_PORT=$(($RANDOM + 1024))\nNNODE"
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh",
    "chars": 1256,
    "preview": "#!/bin/bash\nCHECKPOINT_PATH=$2          # Your model ckpt\nVOCAB_FILE=gpt2-vocab.json\nMERGE_FILE=gpt2-merges.txt\n\nGPUS_PE"
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py",
    "chars": 10472,
    "preview": "# coding=utf-8\n# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n\n\n\"\"\"Sample Generate GPT\"\"\"\nimport json\ni"
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/perspective_api.py",
    "chars": 5882,
    "preview": "import json\nimport time\nfrom typing import Dict, Optional, List\n\nimport joblib\nfrom googleapiclient import discovery\nfro"
  },
  {
    "path": "examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh",
    "chars": 1356,
    "preview": "#!/bin/bash\nCHECKPOINT_PATH=$2          # Your model ckpt\nSHARE_DATA=$PWD             # current work dir\nVOCAB_FILE=gpt2"
  },
  {
    "path": "examples/academic_paper_scripts/msdp/README.md",
    "chars": 322,
    "preview": "\n# Multi-Stage Prompting for Knowledgeable Dialogue Generation\n\nThis directory contains all the scripts of multi-stage p"
  },
  {
    "path": "examples/academic_paper_scripts/msdp/data_processing.sh",
    "chars": 3686,
    "preview": "#!/bin/bash\n\n# Data preparation for our framework: preprocessing the WoW and WoI datasets\n# The datasets can be download"
  },
  {
    "path": "examples/academic_paper_scripts/msdp/eval_knwl_generation.sh",
    "chars": 1432,
    "preview": "#!/bin/bash\n\n#########################\n# Evaluate the F1 scores.\n#########################\n\nWORLD_SIZE=1\nDISTRIBUTED_ARG"
  },
  {
    "path": "examples/academic_paper_scripts/msdp/eval_resp_generation.sh",
    "chars": 2120,
    "preview": "#!/bin/bash\n\n#########################\n# Evaluate the F1 scores.\n#########################\n\nWORLD_SIZE=1\nDISTRIBUTED_ARG"
  },
  {
    "path": "examples/academic_paper_scripts/msdp/prep_resp_gen.sh",
    "chars": 616,
    "preview": "#!/bin/bash\n\n# Preparing the input file for the response generation (second-stage prompting)\n\nDIR=`pwd`\n\nTEST_FILE=<PATH"
  },
  {
    "path": "examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh",
    "chars": 1731,
    "preview": "#!/bin/bash\n\n# Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge\n# The input contai"
  },
  {
    "path": "examples/academic_paper_scripts/msdp/prompt_resp_gen.sh",
    "chars": 1744,
    "preview": "#!/bin/bash\n\n# Stage-2: Prompt a pretrained language model to generate the corresponding response\n# The input contains p"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/CONFIG.sh",
    "chars": 1721,
    "preview": "#!/bin/bash\n\n\n# SLURM options.\nexport SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>\nexport SLURM_AC"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/README.md",
    "chars": 2073,
    "preview": "# Reproducing Figures in SC21 Paper\n\n\nThis directory contains some of the scripts that were used to produce the\nresults "
  },
  {
    "path": "examples/academic_paper_scripts/sc21/SBATCH.sh",
    "chars": 219,
    "preview": "#!/bin/bash\n\n\nsbatch -p ${SLURM_PARTITION} \\\n       -A ${SLURM_ACCOUNT} \\\n       --job-name=${JOB_NAME} \\\n       --nodes"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/SRUN.sh",
    "chars": 522,
    "preview": "#!/bin/bash\n\n#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8\n\n\nTHIS_DIR=`pwd`\nDATETIME=`date +'d"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_figure_11.sh",
    "chars": 609,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n\n# Pipeline"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_figure_12.sh",
    "chars": 862,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n\n# Interlea"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_figure_13.sh",
    "chars": 661,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n\n# Pipeline"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_figure_14.sh",
    "chars": 660,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n\n# Pipeline"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_figure_15.sh",
    "chars": 659,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n\n# Tensor-p"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_figure_16.sh",
    "chars": 549,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n\n# Microbat"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_figure_17.sh",
    "chars": 842,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n\n# Activati"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_figure_18.sh",
    "chars": 1001,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n\n# Scatter-"
  },
  {
    "path": "examples/academic_paper_scripts/sc21/run_table_1.sh",
    "chars": 2717,
    "preview": "#!/bin/bash\n\n# ================================\n# Choose the case to run.\n# ================================\n# model siz"
  },
  {
    "path": "examples/bert/README.md",
    "chars": 1505,
    "preview": "# BERT MODEL\n\n## Table of contents\n- [1. Training Setup](#1-training-setup)\n- [2. Configurations](#2-configurations)\n\n##"
  },
  {
    "path": "examples/bert/train_bert_340m_distributed.sh",
    "chars": 1748,
    "preview": "#!/bin/bash\n\n# Runs the \"340M\" parameter model (Bert - Large)\n\nexport CUDA_DEVICE_MAX_CONNECTIONS=1\n\nGPUS_PER_NODE=8\n# C"
  },
  {
    "path": "examples/export/README.md",
    "chars": 546,
    "preview": "# Megatron Core Export\n\nThis module is used to export megatron core models to different inference frameworks.\nCurrently "
  },
  {
    "path": "examples/export/trtllm_export/README.md",
    "chars": 6815,
    "preview": "# Megatron Core To TRTLLM Export Documentation\nThis guide will walk you through how you can use the megatron core export"
  },
  {
    "path": "examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py",
    "chars": 4519,
    "preview": "import os\nimport torch\nfrom megatron.core import parallel_state\nfrom megatron.core import dist_checkpointing\nfrom megatr"
  },
  {
    "path": "examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py",
    "chars": 4816,
    "preview": "import os\nimport torch\nfrom megatron.core import parallel_state\nfrom megatron.core import dist_checkpointing\nfrom megatr"
  },
  {
    "path": "examples/gpt3/README.md",
    "chars": 1666,
    "preview": "# GPT3 MODEL\n\n## Table of contents\n- [1. Training Setup](#1-training-setup)\n- [2. Configurations](#2-configurations)\n- ["
  },
  {
    "path": "examples/gpt3/gpt_config.yaml",
    "chars": 6689,
    "preview": "# WARNING: Yaml configs is currently an experimental feature\nlanguage_model:\n  # model architecture\n  num_layers: 24\n  h"
  },
  {
    "path": "examples/gpt3/train_gpt3_175b_distributed.sh",
    "chars": 1881,
    "preview": "#!/bin/bash\n\n# Runs the \"175B\" parameter model\n\nexport CUDA_DEVICE_MAX_CONNECTIONS=1\n\nGPUS_PER_NODE=8\n# Change for multi"
  },
  {
    "path": "examples/gptoss/01_convert_from_hf.py",
    "chars": 1666,
    "preview": "# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n\n\"\"\"Convert HuggingFace checkpoints to Megatr"
  },
  {
    "path": "examples/gptoss/02_train.sh",
    "chars": 7589,
    "preview": "#!/bin/bash\n\nexport CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1}\n\n\n# Setup arguments with defaults\nCHEC"
  },
  {
    "path": "examples/gptoss/03_convert_to_hf.py",
    "chars": 1559,
    "preview": "# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n\n\"\"\"Convert HuggingFace checkpoints to Megatr"
  },
  {
    "path": "examples/gptoss/README.md",
    "chars": 6953,
    "preview": "# GPT-OSS Training Tutorial\n\n## Step 0: Install Dependencies\n\n### Using Megatron Bridge\n\n[Megatron-Bridge](https://githu"
  },
  {
    "path": "examples/inference/README.md",
    "chars": 13634,
    "preview": "### Megatron Core Inference Documentation\nThis guide provides an example for Megatron Core for running model inference. "
  },
  {
    "path": "examples/inference/gpt/gpt_dynamic_inference.py",
    "chars": 21066,
    "preview": "# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n\n# pylint: disable=bad-builtin\n\nimport hashli"
  },
  {
    "path": "examples/inference/gpt/gpt_dynamic_inference_12b.sh",
    "chars": 3336,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.\n\n# Run dynamic batching inference on the 12B GPT model.\n\n"
  },
  {
    "path": "examples/inference/gpt/gpt_dynamic_inference_357m.sh",
    "chars": 3014,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.\n\n# Run dynamic batching inference on the 357M GPT model.\n"
  },
  {
    "path": "examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py",
    "chars": 10110,
    "preview": "# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n\nimport asyncio\nimport json\nimport logging\nim"
  },
  {
    "path": "examples/inference/gpt/gpt_static_inference.py",
    "chars": 9086,
    "preview": "# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.\n\nimport os\nimport sys\nimport time\nfrom argparse import Na"
  },
  {
    "path": "examples/inference/gpt/utils.py",
    "chars": 10900,
    "preview": "# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n\nimport copy\nimport itertools\nimport json\nimp"
  },
  {
    "path": "examples/inference/llama_mistral/huggingface_reference.py",
    "chars": 1086,
    "preview": "import argparse\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer\n\n# Set up argument parsing\npars"
  },
  {
    "path": "examples/inference/llama_mistral/run_static_inference_llama4_scout.sh",
    "chars": 2099,
    "preview": "#!/bin/bash\nexport CUDA_DEVICE_MAX_CONNECTIONS=1\nexport NVTE_APPLY_QK_LAYER_SCALING=0\n\nDISTRIBUTED_ARGS=\"--nproc_per_nod"
  },
  {
    "path": "examples/inference/llama_mistral/run_text_generation_llama3.1.sh",
    "chars": 1771,
    "preview": "#!/bin/bash\n# This example will start serving the Llama3.1-8B model\nexport NCCL_IB_SL=1\nexport CUDA_DEVICE_MAX_CONNECTIO"
  },
  {
    "path": "examples/inference/llama_mistral/run_text_generation_llama3.sh",
    "chars": 1740,
    "preview": "#!/bin/bash\n# This example will start serving the Llama3-8B model\nexport NCCL_IB_SL=1\nexport CUDA_DEVICE_MAX_CONNECTIONS"
  },
  {
    "path": "examples/inference/llama_mistral/run_text_generation_mistral.sh",
    "chars": 1674,
    "preview": "#!/bin/bash\n# This example will start serving the Mistral-7B-v0.3 model\nexport NCCL_IB_SL=1\nexport CUDA_DEVICE_MAX_CONNE"
  },
  {
    "path": "examples/inference/run_text_generation_server_345M.sh",
    "chars": 991,
    "preview": "#!/bin/bash\n# This example will start serving the 345M model.\nDISTRIBUTED_ARGS=\"--nproc_per_node 1 \\\n                  -"
  },
  {
    "path": "examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh",
    "chars": 1020,
    "preview": "#!/bin/bash\n# This example will start serving the 345M model that is partitioned 8 way tensor parallel\nDISTRIBUTED_ARGS="
  },
  {
    "path": "examples/inference/t5/simple_t5_batch_inference.py",
    "chars": 5688,
    "preview": "# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.\n\nimport os\nimport sys\nfrom argparse import Namespace\n\nimp"
  },
  {
    "path": "examples/llama/README.md",
    "chars": 5008,
    "preview": "# Llama Models\n\n## Table of contents\n- [1. Overview](#1-overview)\n- [2. Prerequisites](#2-prerequisites)\n- [3. Training "
  },
  {
    "path": "examples/llama/train_llama3_8b_h100_fp8.sh",
    "chars": 5779,
    "preview": "#!/bin/bash\n\n# Environment variables for performance tuning\nexport CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECT"
  },
  {
    "path": "examples/mamba/.gitignore",
    "chars": 52,
    "preview": "checkpoints/\ndata-cache/\ntensorboard/\ntriton-cache/\n"
  }
]

// ... and 2110 more files (download for full content)

About this extraction

This page contains the full source code of the NVIDIA/Megatron-LM GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 2310 files (32.9 MB), approximately 4.1M tokens, and a symbol index with 5871 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo