gitextract_32wjwf3g/

├── .coderabbit.yaml
├── .flake8
├── .github/
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   ├── feature_request.md
│   │   ├── question.md
│   │   └── regression.md
│   ├── actions/
│   │   ├── action.yml
│   │   └── check-nvidia-sso-membership/
│   │       └── action.yml
│   ├── copy-pr-bot.yaml
│   ├── oncall_schedule.json
│   ├── pull_request_template.md
│   ├── scripts/
│   │   ├── oncall_manager.py
│   │   ├── readme.sh
│   │   └── sync_team_usergroups.py
│   └── workflows/
│       ├── _build_test_publish_wheel.yml
│       ├── _release_library.yml
│       ├── _update_dependencies.yml
│       ├── auto-assign-milestone.yml
│       ├── auto-reminder-bot.yml
│       ├── auto-swap-labels.yml
│       ├── auto-update-copy-pr-bot.yml
│       ├── build-docs.yml
│       ├── build-test-publish-wheel.yml
│       ├── cherry-pick-release-commit.yml
│       ├── cicd-approve-test-queue.yml
│       ├── cicd-main.yml
│       ├── claude-complexity-label.yml
│       ├── claude_review.yml
│       ├── close-inactive-issue-pr.yml
│       ├── community-bot.yml
│       ├── config/
│       │   └── changelog-config.json
│       ├── copyright-check.yml
│       ├── dependabot.yml
│       ├── force-draft-pr.yml
│       ├── install-test.yml
│       ├── multi-approval-bot.yml
│       ├── oncall-assign.yml
│       ├── oncall-rotation.yml
│       ├── release-docs.yml
│       ├── release-freeze.yml
│       ├── release-nightly-docs.yml
│       ├── release.yaml
│       ├── review-trigger.yml
│       ├── sync-team-usergroups.yml
│       └── trigger-mbridge-tests.yml
├── .gitignore
├── .gitlab/
│   ├── labeler-config.yml
│   ├── scripts/
│   │   ├── build.sh
│   │   ├── check_imports.py
│   │   └── fetch-legacy-suite.sh
│   └── stages/
│       ├── 00.pre.yml
│       ├── 01.build.yml
│       ├── 02.test.yml
│       ├── 03.integration-tests.yml
│       ├── 04.functional-tests.yml
│       └── 05.publish.yml
├── .gitlab-ci.yml
├── .pre-commit-config.yaml
├── .pylintrc
├── .python-version
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── codecov.yml
├── docker/
│   ├── .ngc_version.dev
│   ├── .ngc_version.lts
│   ├── Dockerfile.ci.dev
│   ├── Dockerfile.ci.nemo
│   ├── Dockerfile.linting
│   ├── common/
│   │   ├── install.sh
│   │   └── install_source_wheels.sh
│   └── patches/
│       └── deepep.patch
├── docs/
│   ├── add_copyright_header.py
│   ├── advanced/
│   │   └── index.md
│   ├── api-backwards-compatibility-check.md
│   ├── api-guide/
│   │   ├── core/
│   │   │   ├── datasets.md
│   │   │   ├── dist_checkpointing.md
│   │   │   ├── dist_checkpointing.strategies.md
│   │   │   ├── distributed.md
│   │   │   ├── fusions.md
│   │   │   ├── index.md
│   │   │   ├── pipeline_parallel.md
│   │   │   ├── tensor_parallel.md
│   │   │   └── transformer.md
│   │   ├── index.md
│   │   ├── internal/
│   │   │   ├── index.md
│   │   │   ├── num_microbatches_calculator.md
│   │   │   └── optimizer_param_scheduler.md
│   │   ├── models/
│   │   │   ├── index.md
│   │   │   ├── models.bert.md
│   │   │   ├── models.gpt.md
│   │   │   ├── models.md
│   │   │   └── models.t5.md
│   │   └── router_replay.md
│   ├── autodoc2_docstrings_parser.py
│   ├── broken_links_false_positives.json
│   ├── conf.py
│   ├── developer/
│   │   ├── contribute.md
│   │   ├── generate_docs.md
│   │   ├── oncall.md
│   │   └── submit.md
│   ├── discussions/
│   │   ├── README.md
│   │   └── megatron-fsdp-user-guide/
│   │       ├── example-scripts/
│   │       │   ├── sbatch_checkpoint_convert.sh
│   │       │   └── sbatch_mfsdp_deepseek_v3.sh
│   │       └── megatron-fsdp-user-guide.md
│   ├── documentation.md
│   ├── get-started/
│   │   ├── install.md
│   │   ├── overview.md
│   │   ├── quickstart.md
│   │   └── releasenotes.md
│   ├── index.md
│   ├── llama_mistral.md
│   ├── models/
│   │   ├── index.md
│   │   ├── llms.md
│   │   └── multimodal.md
│   ├── project.json
│   ├── user-guide/
│   │   ├── data-preparation.md
│   │   ├── features/
│   │   │   ├── context_parallel.md
│   │   │   ├── custom_fsdp.md
│   │   │   ├── dist_optimizer.md
│   │   │   ├── fine_grained_activation_offloading.md
│   │   │   ├── index.md
│   │   │   ├── megatron_energon.md
│   │   │   ├── megatron_rl.md
│   │   │   ├── moe.md
│   │   │   ├── multi_latent_attention.md
│   │   │   ├── multi_token_prediction.md
│   │   │   ├── optimizer_cpu_offload.md
│   │   │   ├── pipeline_parallel_layout.md
│   │   │   └── tokenizers.md
│   │   ├── index.md
│   │   ├── msc_integration.md
│   │   ├── parallelism-guide.md
│   │   └── training-examples.md
│   └── versions1.json
├── examples/
│   ├── __init__.py
│   ├── academic_paper_scripts/
│   │   ├── detxoify_lm/
│   │   │   ├── README.md
│   │   │   ├── annotations/
│   │   │   │   ├── filter-selfgeneration.py
│   │   │   │   ├── perspective_api_annotate.py
│   │   │   │   └── preprocess.sh
│   │   │   ├── finetune_gpt.py
│   │   │   ├── finetune_gpt_distributed-1.3b.sh
│   │   │   ├── generate-1.3b.sh
│   │   │   ├── generate_samples_gpt.py
│   │   │   ├── perspective_api.py
│   │   │   └── self_generation/
│   │   │       └── selfgenerate-1.3b-unconditional.sh
│   │   ├── msdp/
│   │   │   ├── README.md
│   │   │   ├── data_processing.sh
│   │   │   ├── eval_knwl_generation.sh
│   │   │   ├── eval_resp_generation.sh
│   │   │   ├── prep_resp_gen.sh
│   │   │   ├── prompt_knwl_gen.sh
│   │   │   └── prompt_resp_gen.sh
│   │   └── sc21/
│   │       ├── CONFIG.sh
│   │       ├── README.md
│   │       ├── SBATCH.sh
│   │       ├── SRUN.sh
│   │       ├── run_figure_11.sh
│   │       ├── run_figure_12.sh
│   │       ├── run_figure_13.sh
│   │       ├── run_figure_14.sh
│   │       ├── run_figure_15.sh
│   │       ├── run_figure_16.sh
│   │       ├── run_figure_17.sh
│   │       ├── run_figure_18.sh
│   │       └── run_table_1.sh
│   ├── bert/
│   │   ├── README.md
│   │   └── train_bert_340m_distributed.sh
│   ├── export/
│   │   ├── README.md
│   │   └── trtllm_export/
│   │       ├── README.md
│   │       ├── distributed_export/
│   │       │   └── gpt_distributed_gpu_export.py
│   │       └── single_device_export/
│   │           └── gpt_single_device_cpu_export.py
│   ├── gpt3/
│   │   ├── README.md
│   │   ├── gpt_config.yaml
│   │   └── train_gpt3_175b_distributed.sh
│   ├── gptoss/
│   │   ├── 01_convert_from_hf.py
│   │   ├── 02_train.sh
│   │   ├── 03_convert_to_hf.py
│   │   └── README.md
│   ├── inference/
│   │   ├── README.md
│   │   ├── gpt/
│   │   │   ├── gpt_dynamic_inference.py
│   │   │   ├── gpt_dynamic_inference_12b.sh
│   │   │   ├── gpt_dynamic_inference_357m.sh
│   │   │   ├── gpt_dynamic_inference_with_coordinator.py
│   │   │   ├── gpt_static_inference.py
│   │   │   └── utils.py
│   │   ├── llama_mistral/
│   │   │   ├── huggingface_reference.py
│   │   │   ├── run_static_inference_llama4_scout.sh
│   │   │   ├── run_text_generation_llama3.1.sh
│   │   │   ├── run_text_generation_llama3.sh
│   │   │   └── run_text_generation_mistral.sh
│   │   ├── run_text_generation_server_345M.sh
│   │   ├── run_text_generation_server_345M_8_tensor_parallel.sh
│   │   └── t5/
│   │       └── simple_t5_batch_inference.py
│   ├── llama/
│   │   ├── README.md
│   │   └── train_llama3_8b_h100_fp8.sh
│   ├── mamba/
│   │   ├── .gitignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── run_text_gen_server_8b.sh
│   │   ├── run_text_gen_server_8b_gpt3.sh
│   │   └── train.sh
│   ├── mimo/
│   │   ├── __init__.py
│   │   ├── avlm_inference.py
│   │   ├── configs/
│   │   │   ├── llava_avlm.py
│   │   │   ├── llava_vlm.py
│   │   │   └── mock.py
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── avlm_sample_loader.py
│   │   │   ├── energon_avlm_task_encoder.py
│   │   │   ├── energon_vlm_task_encoder.py
│   │   │   ├── mock.py
│   │   │   ├── prepare_video_llava_data.py
│   │   │   └── utils/
│   │   │       └── calculate_audio_tokens.py
│   │   ├── model_providers/
│   │   │   ├── __init__.py
│   │   │   ├── hf_clip_encoder.py
│   │   │   ├── hf_whisper_encoder.py
│   │   │   ├── llava_avlm.py
│   │   │   ├── llava_vlm.py
│   │   │   └── mock.py
│   │   ├── scripts/
│   │   │   ├── run_avlm_train.sh
│   │   │   ├── run_mock_train.sh
│   │   │   ├── run_video_vlm_train.sh
│   │   │   └── run_vlm_train.sh
│   │   ├── train.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── data_helpers.py
│   │       ├── logging.py
│   │       └── model_helpers.py
│   ├── mixtral/
│   │   ├── README.md
│   │   └── train_mixtral_8x7b_distributed.sh
│   ├── multimodal/
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── combine_lm_vision_checkpoints.sh
│   │   ├── combine_state_dicts.py
│   │   ├── config.py
│   │   ├── convert_llava_pretrain_to_wds.py
│   │   ├── dataloader_provider.py
│   │   ├── dataset_helpers.py
│   │   ├── energon_util.py
│   │   ├── evaluation/
│   │   │   ├── evaluate_ai2d.py
│   │   │   ├── evaluate_chartqa.py
│   │   │   ├── evaluate_coco.py
│   │   │   ├── evaluate_infovqa.py
│   │   │   ├── evaluate_mathvista.py
│   │   │   ├── evaluate_mmmu.py
│   │   │   ├── evaluate_ocrbench.py
│   │   │   ├── evaluate_ocrbench_v2.py
│   │   │   ├── evaluate_rd_tablebench.py
│   │   │   ├── evaluate_realworldqa.py
│   │   │   ├── evaluate_spdocvqa.py
│   │   │   ├── evaluate_textvqa.py
│   │   │   ├── evaluate_video_motionbench.py
│   │   │   ├── evaluate_video_mvbench.py
│   │   │   ├── evaluate_video_phys_game_bench.py
│   │   │   ├── evaluate_vqav2.py
│   │   │   ├── evaluation_datasets.py
│   │   │   └── mmmu_utils.py
│   │   ├── image_processing.py
│   │   ├── layer_scaling.py
│   │   ├── layer_specs.py
│   │   ├── llama_3p1_nemotron_nano_vl_8b_v1/
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   ├── pretraining_llama_3p1_nemotron_nano_vl_8b_v1.sh
│   │   │   ├── sft_llama_3p1_nemotron_nano_vl_8b_v1.sh
│   │   │   └── text_generation.sh
│   │   ├── manual_prompts.json
│   │   ├── model.py
│   │   ├── model_converter/
│   │   │   ├── clip_converter.py
│   │   │   ├── internvit_converter.py
│   │   │   ├── radio_converter.py
│   │   │   ├── siglip_converter.py
│   │   │   └── vision_model_tester.py
│   │   ├── multimodal_args.py
│   │   ├── nvlm/
│   │   │   ├── README.md
│   │   │   ├── internvit.py
│   │   │   ├── nvlm_prompts.json
│   │   │   ├── pp_checkpoint_converter.py
│   │   │   ├── pretrain_blend.yaml
│   │   │   ├── pretrain_qwen20_72b_internvit_6b.sh
│   │   │   ├── pretrain_yi_34b_internvit_6b.sh
│   │   │   ├── run_text_generation_qwen20_72b_internvit_6b.sh
│   │   │   ├── run_text_generation_qwen25_7b_internvit_video.sh
│   │   │   ├── run_text_generation_qwen25_7b_siglip.sh
│   │   │   ├── run_text_generation_yi_34b_internvit_6b.sh
│   │   │   ├── sft_34b_internvit.sh
│   │   │   ├── sft_blend.yaml
│   │   │   ├── sft_qwen20_72b_internvit_6b.sh
│   │   │   └── sft_qwen2p5_7b_internvit_6b_video.sh
│   │   ├── pretrain_dataset.yaml
│   │   ├── pretrain_mistral_clip.sh
│   │   ├── radio/
│   │   │   └── radio_g.py
│   │   ├── run_text_generation.py
│   │   ├── sft_dataset.yaml
│   │   ├── sft_mistral_clip.sh
│   │   ├── text_generation_mistral_clip.sh
│   │   └── train.py
│   ├── post_training/
│   │   └── modelopt/
│   │       ├── .gitignore
│   │       ├── ADVANCED.md
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       ├── conf/
│   │       │   ├── Qwen/
│   │       │   │   ├── Qwen2.5-0.5B-Instruct.sh
│   │       │   │   ├── Qwen2.5-7B-Instruct.sh
│   │       │   │   ├── Qwen3-0.6B.sh
│   │       │   │   ├── Qwen3-235B-A22B.sh
│   │       │   │   ├── Qwen3-30B-A3B.sh
│   │       │   │   └── Qwen3-8B.sh
│   │       │   ├── arguments.sh
│   │       │   ├── deepseek-ai/
│   │       │   │   ├── DeepSeek-R1.sh
│   │       │   │   └── DeepSeek-V2-Lite.sh
│   │       │   ├── meta-llama/
│   │       │   │   ├── Llama-3.1-8B-Instruct.sh
│   │       │   │   ├── Llama-3.2-1B-Instruct.sh
│   │       │   │   ├── Llama-4-Maverick-17B-128E-Instruct.sh
│   │       │   │   └── Llama-4-Scout-17B-16E-Instruct.sh
│   │       │   ├── moonshotai/
│   │       │   │   ├── Kimi-K2-Instruct.sh
│   │       │   │   ├── kimi_k2_instruct.sh
│   │       │   │   └── kimi_k2_instruct_export.sh
│   │       │   ├── nvidia/
│   │       │   │   ├── NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.sh
│   │       │   │   ├── NVIDIA-Nemotron-3-Super-120B-A12B-BF16.sh
│   │       │   │   ├── NVIDIA-Nemotron-Nano-9B-v2.sh
│   │       │   │   ├── Nemotron-H-47B-Reasoning-128K.sh
│   │       │   │   ├── Nemotron-H-4B-Instruct.sh
│   │       │   │   ├── Nemotron-H-56B-Base-8K.sh
│   │       │   │   ├── Nemotron-H-8B-Base-8K.sh
│   │       │   │   └── Nemotron-Mini-4B-Instruct.sh
│   │       │   └── openai/
│   │       │       ├── gpt-oss-120b.sh
│   │       │       └── gpt-oss-20b.sh
│   │       ├── convert.sh
│   │       ├── convert_model.py
│   │       ├── distillation.md
│   │       ├── eagle3.sh
│   │       ├── export.py
│   │       ├── export.sh
│   │       ├── finetune.py
│   │       ├── finetune.sh
│   │       ├── generate.py
│   │       ├── generate.sh
│   │       ├── generation_server.sh
│   │       ├── mmlu.py
│   │       ├── mmlu.sh
│   │       ├── offline_feature_extract.py
│   │       ├── offline_feature_extract.sh
│   │       ├── prune.py
│   │       ├── prune.sh
│   │       ├── quantize.py
│   │       ├── quantize.sh
│   │       ├── requirements.txt
│   │       ├── requirements_ssm.txt
│   │       ├── slurm/
│   │       │   ├── env_setup_template.sh
│   │       │   └── sbatch.sh
│   │       ├── speculative.md
│   │       ├── train.sh
│   │       ├── validate.py
│   │       └── validate.sh
│   ├── rl/
│   │   ├── README.md
│   │   ├── benchmark_refit.py
│   │   ├── environment_configs/
│   │   │   ├── countdown.yaml
│   │   │   ├── dapo.yaml
│   │   │   ├── default.yaml
│   │   │   ├── gsm8k.yaml
│   │   │   ├── gsm8k_nanov3.yaml
│   │   │   ├── math.yaml
│   │   │   └── openmathinstructv2.yaml
│   │   ├── environments/
│   │   │   ├── __init__.py
│   │   │   ├── countdown/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── countdown.py
│   │   │   │   └── countdown_agent.py
│   │   │   └── math/
│   │   │       ├── __init__.py
│   │   │       ├── aime_agent.py
│   │   │       ├── bigmath_agent.py
│   │   │       ├── dapo_agent.py
│   │   │       ├── gsm8k_agent.py
│   │   │       ├── math_agent.py
│   │   │       └── openmath_agent.py
│   │   └── model_configs/
│   │       ├── common.sh
│   │       ├── llama3p1_8b_instruct.sh
│   │       ├── nemotron5_56b.sh
│   │       ├── nemotron5_8b.sh
│   │       ├── nemotron5p5_12b_H.sh
│   │       ├── nemotron6_3b_moe.sh
│   │       ├── qwen3_30b_a3b_moe.sh
│   │       ├── qwen3_32b.sh
│   │       ├── qwen3_4b.sh
│   │       ├── qwen3_8b.sh
│   │       ├── qwen_2p5_32b.sh
│   │       ├── qwen_2p5_3b.sh
│   │       ├── qwen_2p5_distill_7b.sh
│   │       └── qwen_2p5_math_7b.sh
│   ├── run_simple_mcore_train_loop.py
│   └── t5/
│       ├── README.md
│       └── train_t5_220m_distributed.sh
├── gpt_builders.py
├── greptile.json
├── mamba_builders.py
├── megatron/
│   ├── core/
│   │   ├── MSC_Integration.md
│   │   ├── QuickStart.md
│   │   ├── README.md
│   │   ├── README_STRAGGLER.md
│   │   ├── __init__.py
│   │   ├── _rank_utils.py
│   │   ├── activations.py
│   │   ├── config.py
│   │   ├── config_logger.py
│   │   ├── datasets/
│   │   │   ├── Makefile
│   │   │   ├── __init__.py
│   │   │   ├── bert_dataset.py
│   │   │   ├── blended_dataset.py
│   │   │   ├── blended_megatron_dataset_builder.py
│   │   │   ├── blended_megatron_dataset_config.py
│   │   │   ├── data_schedule.py
│   │   │   ├── gpt_dataset.py
│   │   │   ├── helpers.cpp
│   │   │   ├── helpers.py
│   │   │   ├── indexed_dataset.py
│   │   │   ├── masked_dataset.py
│   │   │   ├── megatron_dataset.py
│   │   │   ├── multimodal_dataset.py
│   │   │   ├── object_storage_utils.py
│   │   │   ├── readme.md
│   │   │   ├── t5_dataset.py
│   │   │   ├── utils.py
│   │   │   └── utils_s3.py
│   │   ├── dist_checkpointing/
│   │   │   ├── __init__.py
│   │   │   ├── core.py
│   │   │   ├── dict_utils.py
│   │   │   ├── exchange_utils.py
│   │   │   ├── mapping.py
│   │   │   ├── optimizer.py
│   │   │   ├── serialization.py
│   │   │   ├── state_dict_utils.py
│   │   │   ├── strategies/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── async_utils.py
│   │   │   │   ├── base.py
│   │   │   │   ├── cached_metadata_filesystem_reader.py
│   │   │   │   ├── checkpointable.py
│   │   │   │   ├── common.py
│   │   │   │   ├── filesystem_async.py
│   │   │   │   ├── fully_parallel.py
│   │   │   │   ├── state_dict_saver.py
│   │   │   │   └── torch.py
│   │   │   ├── tensor_aware_state_dict.py
│   │   │   ├── utils.py
│   │   │   └── validation.py
│   │   ├── distributed/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── data_parallel_base.py
│   │   │   ├── distributed_data_parallel.py
│   │   │   ├── distributed_data_parallel_config.py
│   │   │   ├── finalize_model_grads.py
│   │   │   ├── fsdp/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mcore_fsdp_adapter.py
│   │   │   │   └── src/
│   │   │   │       ├── README.md
│   │   │   │       ├── __init__.py
│   │   │   │       ├── megatron_fsdp/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── distributed_data_parallel_config.py
│   │   │   │       │   ├── fully_shard.py
│   │   │   │       │   ├── megatron_fsdp.py
│   │   │   │       │   ├── mixed_precision.py
│   │   │   │       │   ├── package_info.py
│   │   │   │       │   ├── param_and_grad_buffer.py
│   │   │   │       │   ├── uneven_dtensor.py
│   │   │   │       │   └── utils.py
│   │   │   │       └── pyproject.toml
│   │   │   ├── param_and_grad_buffer.py
│   │   │   ├── reduce_scatter_with_fp32_accumulation.py
│   │   │   ├── torch_fully_sharded_data_parallel.py
│   │   │   └── torch_fully_sharded_data_parallel_config.py
│   │   ├── energy_monitor.py
│   │   ├── enums.py
│   │   ├── export/
│   │   │   ├── __init__.py
│   │   │   ├── data_type.py
│   │   │   ├── export_config.py
│   │   │   ├── model_type.py
│   │   │   └── trtllm/
│   │   │       ├── __init__.py
│   │   │       ├── engine_builder/
│   │   │       │   ├── __init__.py
│   │   │       │   └── trtllm_engine_builder.py
│   │   │       ├── model_to_trllm_mapping/
│   │   │       │   ├── __init__.py
│   │   │       │   └── default_conversion_dict.py
│   │   │       ├── trt_model_config.py
│   │   │       ├── trt_model_type.py
│   │   │       ├── trtllm_helper.py
│   │   │       ├── trtllm_layers.py
│   │   │       └── trtllm_weights_converter/
│   │   │           ├── __init__.py
│   │   │           ├── distributed_trtllm_model_weights_converter.py
│   │   │           ├── single_device_trtllm_model_weights_converter.py
│   │   │           └── utils.py
│   │   ├── extensions/
│   │   │   ├── TransformerEngineMixedPrecision.md
│   │   │   ├── __init__.py
│   │   │   ├── kitchen.py
│   │   │   ├── transformer_engine.py
│   │   │   └── transformer_engine_spec_provider.py
│   │   ├── fp4_utils.py
│   │   ├── fp8_utils.py
│   │   ├── full_cuda_graph.py
│   │   ├── fusions/
│   │   │   ├── __init__.py
│   │   │   ├── fused_bias_dropout.py
│   │   │   ├── fused_bias_geglu.py
│   │   │   ├── fused_bias_gelu.py
│   │   │   ├── fused_bias_swiglu.py
│   │   │   ├── fused_cross_entropy.py
│   │   │   ├── fused_indices_converter.py
│   │   │   ├── fused_layer_norm.py
│   │   │   ├── fused_mla_yarn_rope_apply.py
│   │   │   ├── fused_pad_routing_map.py
│   │   │   ├── fused_softmax.py
│   │   │   └── fused_weighted_squared_relu.py
│   │   ├── hyper_comm_grid.py
│   │   ├── inference/
│   │   │   ├── __init__.py
│   │   │   ├── async_stream.py
│   │   │   ├── batch_dimensions_utils.py
│   │   │   ├── common_inference_params.py
│   │   │   ├── communication/
│   │   │   │   └── torch_symm_triton/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── barrier.py
│   │   │   │       ├── collectives.py
│   │   │   │       ├── fused_collectives.py
│   │   │   │       ├── multimem_asm.py
│   │   │   │       └── utils.py
│   │   │   ├── communication_utils.py
│   │   │   ├── config.py
│   │   │   ├── contexts/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention_context/
│   │   │   │   │   ├── mamba_metadata.py
│   │   │   │   │   ├── metadata_base.py
│   │   │   │   │   ├── mha_metadata.py
│   │   │   │   │   └── triton/
│   │   │   │   │       └── tensor_ops.py
│   │   │   │   ├── base_context.py
│   │   │   │   ├── dynamic_context.py
│   │   │   │   ├── fused_kv_append_kernel.py
│   │   │   │   ├── kv_block_allocator.py
│   │   │   │   ├── mamba_slot_allocator.py
│   │   │   │   ├── routing_metadata.py
│   │   │   │   └── static_context.py
│   │   │   ├── data_parallel_inference_coordinator.py
│   │   │   ├── engines/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── abstract_engine.py
│   │   │   │   ├── async_zmq_communicator.py
│   │   │   │   ├── dynamic_engine.py
│   │   │   │   ├── mcore_engine.py
│   │   │   │   └── static_engine.py
│   │   │   ├── headers.py
│   │   │   ├── inference_client.py
│   │   │   ├── inference_request.py
│   │   │   ├── model_inference_wrappers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── abstract_model_inference_wrapper.py
│   │   │   │   ├── gpt/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── gpt_inference_wrapper.py
│   │   │   │   ├── multimodal/
│   │   │   │   │   └── vlm_inference_wrapper.py
│   │   │   │   └── t5/
│   │   │   │       ├── __init__.py
│   │   │   │       └── t5_inference_wrapper.py
│   │   │   ├── moe/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activations.py
│   │   │   │   ├── fused_moe.py
│   │   │   │   ├── pad.py
│   │   │   │   └── permute.py
│   │   │   ├── quantization/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mxfp8_quantize.py
│   │   │   │   ├── mxfp8_tensor.py
│   │   │   │   └── utils.py
│   │   │   ├── sampling_params.py
│   │   │   ├── scheduler.py
│   │   │   ├── symmetric_memory.py
│   │   │   ├── text_generation_controllers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── encoder_decoder_text_generation_controller.py
│   │   │   │   ├── text_generation_controller.py
│   │   │   │   └── vlm_text_generation_controller.py
│   │   │   ├── text_generation_server/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dynamic_text_gen_server/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── endpoints/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── chat_completions.py
│   │   │   │   │   │   ├── common.py
│   │   │   │   │   │   ├── completions.py
│   │   │   │   │   │   └── health.py
│   │   │   │   │   ├── text_generation_server.py
│   │   │   │   │   └── tokenization.py
│   │   │   │   ├── endpoints/
│   │   │   │   │   ├── common.py
│   │   │   │   │   └── completions.py
│   │   │   │   ├── run_mcore_engine.py
│   │   │   │   ├── text_generation_server.py
│   │   │   │   └── tokenization.py
│   │   │   ├── unified_memory.py
│   │   │   └── utils.py
│   │   ├── inference_params.py
│   │   ├── jit.py
│   │   ├── model_parallel_config.py
│   │   ├── models/
│   │   │   ├── T5/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── t5_model.py
│   │   │   │   └── t5_spec.py
│   │   │   ├── __init__.py
│   │   │   ├── backends.py
│   │   │   ├── bert/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bert_layer_specs.py
│   │   │   │   ├── bert_lm_head.py
│   │   │   │   ├── bert_model.py
│   │   │   │   └── pooler.py
│   │   │   ├── common/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── embeddings/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── language_model_embedding.py
│   │   │   │   │   ├── relative_pos_embedding.py
│   │   │   │   │   ├── rope_utils.py
│   │   │   │   │   ├── rotary_pos_embedding.py
│   │   │   │   │   └── yarn_rotary_pos_embedding.py
│   │   │   │   ├── language_module/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── language_module.py
│   │   │   │   ├── model_chunk_schedule_plan.py
│   │   │   │   └── vision_module/
│   │   │   │       ├── __init__.py
│   │   │   │       └── vision_module.py
│   │   │   ├── gpt/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── experimental_attention_variant_module_specs.py
│   │   │   │   ├── fine_grained_callables.py
│   │   │   │   ├── gpt_layer_specs.py
│   │   │   │   ├── gpt_model.py
│   │   │   │   ├── heterogeneous/
│   │   │   │   │   └── heterogeneous_layer_specs.py
│   │   │   │   └── moe_module_specs.py
│   │   │   ├── huggingface/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── clip_model.py
│   │   │   │   ├── module.py
│   │   │   │   └── qwen_model.py
│   │   │   ├── mamba/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── mamba_layer_specs.py
│   │   │   │   └── mamba_model.py
│   │   │   ├── mimo/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── config/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── base_configs.py
│   │   │   │   ├── model/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── base.py
│   │   │   │   ├── partition/
│   │   │   │   │   └── utils.py
│   │   │   │   └── submodules/
│   │   │   │       ├── audio.py
│   │   │   │       ├── base.py
│   │   │   │       └── vision.py
│   │   │   ├── multimodal/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── context_parallel.py
│   │   │   │   ├── llava_model.py
│   │   │   │   └── llava_spec.py
│   │   │   └── vision/
│   │   │       ├── __init__.py
│   │   │       ├── clip_vit_model.py
│   │   │       ├── multimodal_projector.py
│   │   │       ├── radio.py
│   │   │       └── vit_layer_specs.py
│   │   ├── msc_utils.py
│   │   ├── nccl_allocator.py
│   │   ├── num_microbatches_calculator.py
│   │   ├── optimizer/
│   │   │   ├── __init__.py
│   │   │   ├── clip_grads.py
│   │   │   ├── cpu_offloading/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   └── hybrid_optimizer.py
│   │   │   ├── distrib_optimizer.py
│   │   │   ├── grad_scaler.py
│   │   │   ├── layer_wise_optimizer.py
│   │   │   ├── muon.py
│   │   │   ├── optimizer.py
│   │   │   ├── optimizer_config.py
│   │   │   └── qk_clip.py
│   │   ├── optimizer_param_scheduler.py
│   │   ├── package_info.py
│   │   ├── packed_seq_params.py
│   │   ├── parallel_state.py
│   │   ├── pipeline_parallel/
│   │   │   ├── __init__.py
│   │   │   ├── bridge_communicator.py
│   │   │   ├── combined_1f1b.py
│   │   │   ├── fine_grained_activation_offload.py
│   │   │   ├── hybrid_cp_schedule.py
│   │   │   ├── multimodule_communicator.py
│   │   │   ├── p2p_communication.py
│   │   │   ├── schedules.py
│   │   │   └── utils.py
│   │   ├── post_training/
│   │   │   ├── __init__.py
│   │   │   └── modelopt/
│   │   │       ├── __init__.py
│   │   │       ├── gpt/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── model_specs.py
│   │   │       │   └── state_dict_hooks.py
│   │   │       ├── layers.py
│   │   │       └── mamba/
│   │   │           ├── __init__.py
│   │   │           └── model_specs.py
│   │   ├── process_groups_config.py
│   │   ├── quantization/
│   │   │   ├── __init__.py
│   │   │   ├── quant_config.py
│   │   │   └── utils.py
│   │   ├── requirements.txt
│   │   ├── rerun_state_machine.py
│   │   ├── resharding/
│   │   │   ├── __init__.py
│   │   │   ├── copy_services/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── gloo_copy_service.py
│   │   │   │   ├── nccl_copy_service.py
│   │   │   │   └── nvshmem_copy_service.py
│   │   │   ├── execution.py
│   │   │   ├── nvshmem_copy_service/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── compat.py
│   │   │   │   ├── core/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── gpu_resource_manager.py
│   │   │   │   │   ├── kernel_launcher.py
│   │   │   │   │   └── pipeline_executor.py
│   │   │   │   ├── kernels/
│   │   │   │   │   └── chunked_kernel.cu
│   │   │   │   ├── logger.py
│   │   │   │   ├── memory/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── double_buffer_manager.py
│   │   │   │   │   └── tensor_pointer_utils.py
│   │   │   │   ├── nvshmem_types.py
│   │   │   │   ├── planning/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── communication_scheduler.py
│   │   │   │   │   ├── gpu_execution_planner.py
│   │   │   │   │   ├── task_segmenter.py
│   │   │   │   │   └── workload_packer.py
│   │   │   │   ├── service.py
│   │   │   │   └── validation.py
│   │   │   ├── planner.py
│   │   │   ├── refit.py
│   │   │   ├── transforms.py
│   │   │   └── utils.py
│   │   ├── safe_globals.py
│   │   ├── ssm/
│   │   │   ├── __init__.py
│   │   │   ├── gated_delta_net.py
│   │   │   ├── mamba_block.py
│   │   │   ├── mamba_context_parallel.py
│   │   │   ├── mamba_hybrid_layer_allocation.py
│   │   │   ├── mamba_layer.py
│   │   │   ├── mamba_mixer.py
│   │   │   ├── mlp_layer.py
│   │   │   ├── ops/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── causal_conv1d_triton.py
│   │   │   │   ├── causal_conv1d_varlen.py
│   │   │   │   ├── determinism.py
│   │   │   │   ├── mamba_ssm.py
│   │   │   │   ├── ssd_bmm.py
│   │   │   │   ├── ssd_chunk_scan.py
│   │   │   │   ├── ssd_chunk_state.py
│   │   │   │   ├── ssd_combined.py
│   │   │   │   └── ssd_state_passing.py
│   │   │   └── triton_cache_manager.py
│   │   ├── tensor_parallel/
│   │   │   ├── __init__.py
│   │   │   ├── cross_entropy.py
│   │   │   ├── data.py
│   │   │   ├── inference_layers.py
│   │   │   ├── layers.py
│   │   │   ├── mappings.py
│   │   │   ├── random.py
│   │   │   └── utils.py
│   │   ├── timers.py
│   │   ├── tokenizers/
│   │   │   ├── __init__.py
│   │   │   ├── base_tokenizer.py
│   │   │   ├── megatron_tokenizer.py
│   │   │   ├── text/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── libraries/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── abstract_tokenizer.py
│   │   │   │   │   ├── bytelevel_tokenizer.py
│   │   │   │   │   ├── chat_template.py
│   │   │   │   │   ├── huggingface_tokenizer.py
│   │   │   │   │   ├── megatron_hf_tokenizer.py
│   │   │   │   │   ├── null_tokenizer.py
│   │   │   │   │   ├── sentencepiece_tokenizer.py
│   │   │   │   │   ├── sft_tokenizer.py
│   │   │   │   │   └── tiktoken_tokenizer.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── bert_tokenizer.py
│   │   │   │   │   ├── default_tokenizer.py
│   │   │   │   │   ├── gpt_tokenizer.py
│   │   │   │   │   ├── mamba_tokenizer.py
│   │   │   │   │   └── t5_tokenizer.py
│   │   │   │   ├── parsers/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base_parser.py
│   │   │   │   │   ├── deepseek_r1_reasoning_parser.py
│   │   │   │   │   └── qwen3_coder_tool_parser.py
│   │   │   │   └── text_tokenizer.py
│   │   │   ├── utils/
│   │   │   │   └── build_tokenizer.py
│   │   │   └── vision/
│   │   │       ├── __init__.py
│   │   │       ├── libraries/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── multimodal_tokenizer.py
│   │   │       │   └── null_multimodal_tokenizer.py
│   │   │       ├── models/
│   │   │       │   ├── __init__.py
│   │   │       │   └── default_tokenizer.py
│   │   │       └── vision_tokenizer.py
│   │   ├── transformer/
│   │   │   ├── __init__.py
│   │   │   ├── attention.py
│   │   │   ├── cuda_graphs.py
│   │   │   ├── custom_layers/
│   │   │   │   ├── __init__.py
│   │   │   │   └── batch_invariant_kernels.py
│   │   │   ├── dot_product_attention.py
│   │   │   ├── enums.py
│   │   │   ├── experimental_attention_variant/
│   │   │   │   ├── absorbed_mla.py
│   │   │   │   └── dsa.py
│   │   │   ├── fsdp_dtensor_checkpoint.py
│   │   │   ├── heterogeneous/
│   │   │   │   ├── heterogeneous_config.py
│   │   │   │   └── linear_replacements.py
│   │   │   ├── identity_op.py
│   │   │   ├── mlp.py
│   │   │   ├── module.py
│   │   │   ├── moe/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── experts.py
│   │   │   │   ├── fused_a2a.py
│   │   │   │   ├── moe_layer.py
│   │   │   │   ├── moe_utils.py
│   │   │   │   ├── router.py
│   │   │   │   ├── router_replay.py
│   │   │   │   ├── shared_experts.py
│   │   │   │   ├── token_dispatcher.py
│   │   │   │   ├── token_dispatcher_inference.py
│   │   │   │   └── upcycling_utils.py
│   │   │   ├── multi_latent_attention.py
│   │   │   ├── multi_token_prediction.py
│   │   │   ├── pipeline_parallel_layer_layout.py
│   │   │   ├── spec_utils.py
│   │   │   ├── torch_layer_norm.py
│   │   │   ├── torch_norm.py
│   │   │   ├── transformer_block.py
│   │   │   ├── transformer_config.py
│   │   │   ├── transformer_layer.py
│   │   │   └── utils.py
│   │   ├── typed_torch.py
│   │   └── utils.py
│   ├── inference/
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── legacy/
│   │   ├── fp16_deprecated/
│   │   │   └── loss_scaler.py
│   │   ├── fused_kernels/
│   │   │   ├── __init__.py
│   │   │   ├── compat.h
│   │   │   ├── tests/
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_fused_kernels.py
│   │   │   └── type_shim.h
│   │   └── model/
│   │       ├── __init__.py
│   │       ├── bert_model.py
│   │       ├── biencoder_model.py
│   │       ├── classification.py
│   │       ├── enums.py
│   │       ├── fused_bias_gelu.py
│   │       ├── fused_layer_norm.py
│   │       ├── fused_softmax.py
│   │       ├── gpt_model.py
│   │       ├── language_model.py
│   │       ├── module.py
│   │       ├── multiple_choice.py
│   │       ├── realm_model.py
│   │       ├── rms_norm.py
│   │       ├── t5_model.py
│   │       ├── transformer.py
│   │       ├── utils.py
│   │       └── vision/
│   │           ├── classification.py
│   │           ├── dino.py
│   │           ├── esvit_swin_backbone.py
│   │           ├── inpainting.py
│   │           ├── knn_monitor.py
│   │           ├── mit_backbone.py
│   │           ├── swin_backbone.py
│   │           ├── utils.py
│   │           └── vit_backbone.py
│   ├── post_training/
│   │   ├── __init__.py
│   │   ├── arguments.py
│   │   ├── checkpointing.py
│   │   ├── generate.py
│   │   ├── loss_func.py
│   │   ├── model_builder.py
│   │   ├── non_loss_data_func.py
│   │   └── utils.py
│   ├── rl/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── agent/
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── huggingface_dataset_agent.py
│   │   │   ├── pass_at_evaluation_agent.py
│   │   │   ├── remote_agent.py
│   │   │   ├── reward_only_agent.py
│   │   │   └── weighted_multi_task.py
│   │   ├── inference/
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── inference_interface.py
│   │   │   └── megatron.py
│   │   ├── logging.py
│   │   ├── parallel_utils.py
│   │   ├── rl_utils.py
│   │   ├── sequence_packing_utils.py
│   │   └── server/
│   │       ├── __init__.py
│   │       ├── agent/
│   │       │   ├── __init__.py
│   │       │   └── fastapi_env_server.py
│   │       ├── api.py
│   │       └── inference/
│   │           ├── __init__.py
│   │           └── inference_interface_server.py
│   └── training/
│       ├── __init__.py
│       ├── argument_utils.py
│       ├── arguments.py
│       ├── async_utils.py
│       ├── checkpointing.py
│       ├── config/
│       │   ├── __init__.py
│       │   ├── common_config.py
│       │   ├── resilience_config.py
│       │   └── training_config.py
│       ├── datasets/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── data_samplers.py
│       │   ├── fim_dataset.py
│       │   └── sft_dataset.py
│       ├── dgrad_logging.py
│       ├── dist_signal_handler.py
│       ├── ft_integration.py
│       ├── global_vars.py
│       ├── initialize.py
│       ├── inprocess_restart.py
│       ├── log_handler.py
│       ├── one_logger_utils.py
│       ├── theoretical_memory_usage.py
│       ├── training.py
│       ├── utils.py
│       ├── wandb_utils.py
│       └── yaml_arguments.py
├── model_provider.py
├── pretrain_bert.py
├── pretrain_gpt.py
├── pretrain_mamba.py
├── pretrain_t5.py
├── pretrain_vlm.py
├── pyproject.toml
├── scripts/
│   └── check_api_backwards_compatibility.py
├── setup.py
├── tasks/
│   ├── data_utils.py
│   ├── eval_utils.py
│   └── finetune_utils.py
├── tests/
│   ├── README.md
│   ├── __init__.py
│   ├── functional_tests/
│   │   ├── __init__.py
│   │   ├── python_test_utils/
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── compute_golden_statistics.py
│   │   │   ├── conftest.py
│   │   │   ├── get_test_results_from_tensorboard_logs.py
│   │   │   ├── test_grpo_training_loop.py
│   │   │   ├── test_inference_regular_pipeline.py
│   │   │   ├── test_optimizer_grads_match.py
│   │   │   ├── test_pretraining_regular_pipeline.py
│   │   │   └── test_pretraining_resume_checkpoint_pipeline.py
│   │   ├── shell_test_utils/
│   │   │   ├── _run_training.sh
│   │   │   ├── run_batch_ci_tests.sh
│   │   │   ├── run_ci_test.sh
│   │   │   └── start_interactive_job.sh
│   │   └── test_cases/
│   │       ├── bert/
│   │       │   ├── bert_mcore_tp1_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp1_pp4_vp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_frozen_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_local_spec/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp2_pp2_resume_torch_dist_local_spec/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_mcore_tp4_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── bert_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── bert_release_sm/
│   │       │       ├── golden_values_dev_dgx_gb200.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── common/
│   │       │   ├── ckpt_converter/
│   │       │   │   ├── __main__.py
│   │       │   │   └── model_config.yaml
│   │       │   └── moe_perf/
│   │       │       ├── __main__.py
│   │       │       ├── baseline.json
│   │       │       └── test_cases.py
│   │       ├── gpt/
│   │       │   ├── gpt3_15b_8t_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_gb200/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_15b_8t_release_sm_gb200/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_7b_tp1_pp4_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_7b_tp4_pp1_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_disable/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_enable/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_persistent_1/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_persistent_2/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_reshard/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_resume/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_resume_check_grads/
│   │       │   │   ├── README.md
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_reruns_transient/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_mup/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp1_uniform_full_recompute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_rope_embeddings/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_sequence_parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_swiglu/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_gdn/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/
│   │       │   │   └── golden_values_dev_dgxh100_dgxc.json
│   │       │   ├── gpt3_mcore_te_tp2_pp2_mla/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2_fp16/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp2_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp4/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp1_pp4_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_uninstall_te/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1_resume_torch/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp4_pp1_resume_torch_dist/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_gb200_2nd.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100_2nd.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_weekly_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── tp_comm_overlap_cfg.yaml
│   │       │   ├── gpt3_weekly_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/
│   │       │   │   ├── cuda_graphs.py
│   │       │   │   ├── cuda_graphs.sh
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_basic_function/
│   │       │   │   ├── env_config.yaml
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_throughput/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp4_pp1_dp2_8b_throughput_github/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/
│   │       │   │   ├── README.md
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── test_prompts.jsonl
│   │       │   ├── gpt_static_inference_tp1_pp1_583m_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── gpt_static_inference_tp1_pp1_583m_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_a100.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── gpt-nemo/
│   │       │   ├── bert-nemo_340m_mr_mbs2_gbs32_mcore_te_tp2_pp2_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── gemma2-nemo_2b_mr_mbs1_gbs8_mcore_te_tp4_pp1_cp1_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── llama3-nemo_8b_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp2_dgx_a100_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── llama3-nemo_8b_mr_mbs4_gbs64_mcore_te_tp1_pp1_cp2_dgx_a100_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral-nemo_8x7b_mr_mbs1_gbs8_mcore_te_tp2_pp1_ep2_1N8G/
│   │       │   │   └── model_config.yaml
│   │       │   └── t5-nemo_220m_mr_mbs4_gbs64_te_tp1_pp1_1N8G/
│   │       │       └── model_config.yaml
│   │       ├── hybrid/
│   │       │   ├── hybrid_dynamic_inference_tp1_pp1_dp8_583m/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp2_vpp2_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── hybrid_static_inference_tp1_pp1_2B_cudagraphs/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── hybrid_static_inference_tp1_pp1_2B_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── mimo/
│   │       │   ├── mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8_seq_packing/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── mimo_vlm_pretrain_convergence_tp1_pp1_cp2_dp8/
│   │       │       ├── golden_values_dev.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── mixtral/
│   │       │   ├── deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release/
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseekv3_proxy_flex_tp2pp2emp16etp1cp1_gb_200_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x22b_tp2pp8ep8vpp1_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x7b_alltoall_tp2pp4ep4_release/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── mixtral_8x7b_tp1pp4ep8vpp8_release/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       ├── golden_values_lts_dgx_a100.json
│   │       │       └── model_config.yaml
│   │       ├── moe/
│   │       │   ├── deepseek_proxy_fsdp_ep2_fsdp2/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── deepseek_proxy_fsdp_ep2_fsdp2_1node/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_dev_dgxa100_dracooci.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci-ord.json
│   │       │   │   ├── golden_values_lts_dgxa100_dracooci.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/
│   │       │   │   ├── golden_values_dev.json
│   │       │   │   ├── golden_values_lts.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   ├── golden_values_dev_dgxh100_dgxc.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/
│   │       │   │   ├── golden_values_dev_dgx_gb200.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_dev_dgx_h100_2nd.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq_suspend_resume/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── model_config.yaml
│   │       │   │   └── prompts.json
│   │       │   ├── gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/
│   │       │   │   ├── env_config.yaml
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   ├── gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       └── model_config.yaml
│   │       ├── multimodal-llava/
│   │       │   ├── multimodal_llava_mcore_te_tp1_pp1/
│   │       │   │   ├── golden_values_dev_dgx_a100.json
│   │       │   │   ├── golden_values_dev_dgx_h100.json
│   │       │   │   ├── golden_values_lts_dgx_a100.json
│   │       │   │   └── model_config.yaml
│   │       │   └── multimodal_llava_mcore_te_tp4_sp_cp2/
│   │       │       ├── golden_values_dev_dgx_a100.json
│   │       │       ├── golden_values_dev_dgx_h100.json
│   │       │       ├── golden_values_lts_dgx_a100.json
│   │       │       └── model_config.yaml
│   │       └── t5/
│   │           ├── t5_11b_mcore_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp1_pp1_vp1_resume_torch/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_a100_2nd.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp2_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp2_pp1_vp1_sequence_parallel/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_te_tp4_pp1_resume_torch_dist/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp1_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp1_pp1_vp1_resume_torch/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_a100_2nd.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp2_pp1_vp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci-ord.json
│   │           │   ├── golden_values_dev_dgxa100_dracooci.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp4_pp1/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_mcore_tp4_pp1_resume_torch_dist/
│   │           │   ├── golden_values_dev_dgx_a100.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_dev_dgx_h100_2nd.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_release/
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   ├── golden_values_lts_dgx_a100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_release_sm/
│   │           │   ├── golden_values_dev_dgx_gb200.json
│   │           │   ├── golden_values_dev_dgx_h100.json
│   │           │   └── model_config.yaml
│   │           ├── t5_weekly_mcore_te_tp2_pp1_vp1/
│   │           │   └── golden_values_lts_dgx_a100.json
│   │           └── t5_weekly_mcore_te_tp2_pp1_vp1_sequence_parallel/
│   │               └── golden_values_lts_dgx_a100.json
│   ├── test_utils/
│   │   ├── python_scripts/
│   │   │   ├── approve_merge_gate.py
│   │   │   ├── auto_reminder.py
│   │   │   ├── auto_reminder_github.py
│   │   │   ├── check_status_of_main.py
│   │   │   ├── dashboard.py
│   │   │   ├── download_coverage_results.py
│   │   │   ├── download_golden_values.py
│   │   │   ├── download_unit_tests_dataset.py
│   │   │   ├── generate_jet_trigger_job.py
│   │   │   ├── generate_local_jobs.py
│   │   │   ├── launch_jet_workload.py
│   │   │   ├── launch_nemo_run_workload.py
│   │   │   ├── notify.py
│   │   │   ├── recipe_parser.py
│   │   │   ├── swap_pr_labels.py
│   │   │   └── wait_for_resources.py
│   │   └── recipes/
│   │       ├── _build-mcore-dev.yaml
│   │       ├── _build-mcore-lts.yaml
│   │       ├── _build-nemo.yaml
│   │       ├── gb200/
│   │       │   ├── gpt.yaml
│   │       │   ├── moe-1node.yaml
│   │       │   ├── moe.yaml
│   │       │   └── unit-tests.yaml
│   │       └── h100/
│   │           ├── bert.yaml
│   │           ├── ckpt_converter.yaml
│   │           ├── gpt-dynamic-inference-cuda-graphs.yaml
│   │           ├── gpt-dynamic-inference-with-coordinator.yaml
│   │           ├── gpt-dynamic-inference.yaml
│   │           ├── gpt-grads.yaml
│   │           ├── gpt-grpo.yaml
│   │           ├── gpt-nemo.yaml
│   │           ├── gpt-static-inference.yaml
│   │           ├── gpt.yaml
│   │           ├── mamba-dynamic-inference.yaml
│   │           ├── mamba-static-inference.yaml
│   │           ├── mamba.yaml
│   │           ├── mimo.yaml
│   │           ├── module_performance.yaml
│   │           ├── moe-dynamic-inference-with-coordinator.yaml
│   │           ├── moe-dynamic-inference.yaml
│   │           ├── moe-grpo.yaml
│   │           ├── moe-static-inference.yaml
│   │           ├── moe.yaml
│   │           ├── multimodal-llava.yaml
│   │           ├── t5.yaml
│   │           └── unit-tests.yaml
│   └── unit_tests/
│       ├── __init__.py
│       ├── a2a_overlap/
│       │   ├── test_cuda_graphed_schedule_chunk_1f1b.py
│       │   ├── test_schedule_chunk_1f1b.py
│       │   ├── test_schedule_layer_1f1b.py
│       │   └── utils.py
│       ├── conftest.py
│       ├── data/
│       │   ├── __init__.py
│       │   ├── test_bin_reader.py
│       │   ├── test_builder.py
│       │   ├── test_fim_dataset.py
│       │   ├── test_gpt_dataset.py
│       │   ├── test_multimodal_dataset.py
│       │   ├── test_preprocess_data.py
│       │   └── test_preprocess_mmdata.py
│       ├── dist_checkpointing/
│       │   ├── __init__.py
│       │   ├── conftest.py
│       │   ├── models/
│       │   │   ├── __init__.py
│       │   │   ├── common.py
│       │   │   ├── test_bert_model.py
│       │   │   ├── test_gpt_model.py
│       │   │   ├── test_mamba.py
│       │   │   ├── test_mlp_glu.py
│       │   │   ├── test_moe_experts.py
│       │   │   └── test_t5_model.py
│       │   ├── test_async_save.py
│       │   ├── test_checkpointable.py
│       │   ├── test_fp8.py
│       │   ├── test_fully_parallel.py
│       │   ├── test_global_metadata_reuse.py
│       │   ├── test_layer_wise_optimizer.py
│       │   ├── test_local.py
│       │   ├── test_mapping.py
│       │   ├── test_msc.py
│       │   ├── test_nonpersistent.py
│       │   ├── test_optimizer.py
│       │   ├── test_pipeline_parallel_layout.py
│       │   ├── test_replication.py
│       │   ├── test_safe_globals.py
│       │   ├── test_serialization.py
│       │   ├── test_strict.py
│       │   ├── test_torch_dist.py
│       │   └── utils.py
│       ├── distributed/
│       │   ├── megatron_fsdp/
│       │   │   ├── test_mcore_fully_sharded_data_parallel.py
│       │   │   ├── test_mfsdp_fully_shard.py
│       │   │   └── utils.py
│       │   ├── test_distributed_data_parallel.py
│       │   ├── test_finalize_model_grads.py
│       │   ├── test_grad_reduce_for_replicated_embedder.py
│       │   ├── test_grad_sync_with_expert_parallel.py
│       │   ├── test_param_and_grad_buffer.py
│       │   ├── test_reduce_scatter_with_fp32_accumulation.py
│       │   └── test_torch_fully_sharded_parallel.py
│       ├── export/
│       │   └── trtllm/
│       │       ├── __init__.py
│       │       ├── test_distributed_fp8.py
│       │       ├── test_single_device_fp8.py
│       │       ├── test_trtllm_distributed_gpu_converter.py
│       │       ├── test_trtllm_helper.py
│       │       ├── test_trtllm_layers.py
│       │       └── test_trtllm_single_device_converter.py
│       ├── extension/
│       │   └── test_kitchen_sdpa.py
│       ├── find_test_cases.py
│       ├── fusions/
│       │   ├── test_bias_dropout_fusion.py
│       │   ├── test_mla_yarn_rope_apply.py
│       │   ├── test_rmsnorm_residual_fusion.py
│       │   ├── test_swiglu_fusion.py
│       │   ├── test_torch_softmax.py
│       │   └── test_weighted_squared_relu_fusion.py
│       ├── inference/
│       │   ├── __init__.py
│       │   ├── contexts/
│       │   │   ├── attention_metadata/
│       │   │   │   ├── test_mamba_metadata.py
│       │   │   │   └── test_tensor_ops.py
│       │   │   ├── test_dynamic_context.py
│       │   │   └── test_dynamic_prefix_caching.py
│       │   ├── engines/
│       │   │   ├── __init__.py
│       │   │   ├── test_dynamic_engine.py
│       │   │   ├── test_dynamic_events.py
│       │   │   ├── test_mamba_prefix_caching_e2e.py
│       │   │   └── test_static_engine.py
│       │   ├── model_inference_wrappers/
│       │   │   ├── __init__.py
│       │   │   ├── gpt/
│       │   │   │   └── test_gpt_inference_wrapper.py
│       │   │   └── t5/
│       │   │       └── test_t5_inference_wrapper.py
│       │   ├── test_batch_dimension_utils.py
│       │   ├── test_common_inference_params.py
│       │   ├── test_communication_utils.py
│       │   ├── test_data_parallel_inference_coordinator.py
│       │   ├── test_dynamic_prefix_caching_coordinator.py
│       │   ├── test_flash_decode.py
│       │   ├── test_inference_config.py
│       │   ├── test_inference_utils.py
│       │   ├── test_moe_inference.py
│       │   ├── test_moe_permute.py
│       │   ├── test_mxfp8_utils.py
│       │   ├── test_scheduler.py
│       │   ├── test_stop_words.py
│       │   ├── test_wandb_logging.py
│       │   └── text_generation_controllers/
│       │       ├── __init__.py
│       │       ├── test_encoder_decoder_text_generation_controller.py
│       │       ├── test_text_generation_controller.py
│       │       └── test_vlm_text_generation_controller.py
│       ├── models/
│       │   ├── __init__.py
│       │   ├── test_base_embedding.py
│       │   ├── test_bert_model.py
│       │   ├── test_clip_vit_model.py
│       │   ├── test_gpt_model.py
│       │   ├── test_gpt_model_batch_invariant.py
│       │   ├── test_gpt_model_quantization.py
│       │   ├── test_heterogeneous_gpt_model.py
│       │   ├── test_llava_model.py
│       │   ├── test_mamba_model.py
│       │   ├── test_mamba_moe_model.py
│       │   ├── test_mimo_audio_submodules.py
│       │   ├── test_mimo_embedding_alignment.py
│       │   ├── test_mimo_model.py
│       │   ├── test_mimo_partition.py
│       │   ├── test_mimo_submodules.py
│       │   ├── test_multimodal_projector.py
│       │   ├── test_radio_model.py
│       │   └── test_t5_model.py
│       ├── optimizer/
│       │   ├── __init__.py
│       │   └── test_optimizer_config.py
│       ├── pipeline_parallel/
│       │   ├── __init__.py
│       │   ├── test_bridge_communicator.py
│       │   ├── test_fine_grained_activation_offloading.py
│       │   ├── test_helpers.py
│       │   ├── test_multimodule_communicator.py
│       │   ├── test_multimodule_schedules.py
│       │   ├── test_pipeline_layout.py
│       │   └── test_schedules.py
│       ├── post_training/
│       │   ├── __init__.py
│       │   ├── test_modelopt_model_builder.py
│       │   └── test_modelopt_module_spec.py
│       ├── resharding/
│       │   ├── test_communication_scheduler.py
│       │   ├── test_dp_balancing.py
│       │   ├── test_model_swap.py
│       │   ├── test_mxfp8_refit.py
│       │   ├── test_task_segmenter.py
│       │   └── test_workload_packer.py
│       ├── rl/
│       │   ├── test_grouped_rollouts.py
│       │   ├── test_rl_batch_invariant.py
│       │   ├── test_rl_utils.py
│       │   └── test_sequence_packing_utils.py
│       ├── run_ci_test.sh
│       ├── ssm/
│       │   ├── ops/
│       │   │   ├── test_causal_conv1d_varlen.py
│       │   │   ├── test_ops_init.py
│       │   │   ├── test_ssd_bmm.py
│       │   │   ├── test_ssd_chunk_scan.py
│       │   │   ├── test_ssd_chunk_state.py
│       │   │   ├── test_ssd_combined.py
│       │   │   ├── test_ssd_state_passing.py
│       │   │   └── test_ssm_kernel.py
│       │   ├── test_causal_conv1d_triton.py
│       │   ├── test_gated_delta_net.py
│       │   ├── test_mamba_block.py
│       │   ├── test_mamba_context_parallel.py
│       │   ├── test_mamba_hybrid_layer_allocation.py
│       │   ├── test_mamba_layer.py
│       │   └── test_mamba_mixer.py
│       ├── tensor_parallel/
│       │   ├── __init__.py
│       │   ├── test_cross_entropy.py
│       │   ├── test_data.py
│       │   ├── test_initialization.py
│       │   ├── test_layers.py
│       │   ├── test_mappings.py
│       │   ├── test_random.py
│       │   └── test_tensor_parallel_utils.py
│       ├── test_api_backwards_compat_setup.py
│       ├── test_argument_utils.py
│       ├── test_basic.py
│       ├── test_checkpointing.py
│       ├── test_fp8_param.py
│       ├── test_fp8_utils.py
│       ├── test_hyper_comm_grid.py
│       ├── test_imports.py
│       ├── test_inference.py
│       ├── test_layer_wise_optimizer.py
│       ├── test_lion_optimizer.py
│       ├── test_local_multi_tensor_fns.py
│       ├── test_model_configs.py
│       ├── test_muon_optimizer.py
│       ├── test_nccl_allocator.py
│       ├── test_num_microbatches_calculator.py
│       ├── test_optimizer.py
│       ├── test_optimizer_cpu_offloading.py
│       ├── test_optimizer_param_scheduler.py
│       ├── test_parallel_state.py
│       ├── test_process_groups_config.py
│       ├── test_training.py
│       ├── test_typed_torch.py
│       ├── test_utilities.py
│       ├── test_utils.py
│       ├── tokenizers/
│       │   └── test_tokenizer.py
│       ├── transformer/
│       │   ├── __init__.py
│       │   ├── experimental_attention_variant/
│       │   │   ├── test_absorbed_mla.py
│       │   │   └── test_attention_variant_dsa.py
│       │   ├── moe/
│       │   │   ├── __init__.py
│       │   │   ├── conftest.py
│       │   │   ├── test_a2a_token_dispatcher.py
│       │   │   ├── test_aux_loss.py
│       │   │   ├── test_grouped_mlp.py
│       │   │   ├── test_latent_moe_layer.py
│       │   │   ├── test_moe_layer.py
│       │   │   ├── test_moe_layer_discrepancy.py
│       │   │   ├── test_multihot_indices_converter.py
│       │   │   ├── test_router_replay.py
│       │   │   ├── test_routers.py
│       │   │   ├── test_sequential_mlp.py
│       │   │   ├── test_shared_experts.py
│       │   │   ├── test_token_dispatcher.py
│       │   │   └── test_upcycling.py
│       │   ├── test_attention.py
│       │   ├── test_attention_no_rope.py
│       │   ├── test_attention_packed_seq.py
│       │   ├── test_core_attention.py
│       │   ├── test_cuda_graphs.py
│       │   ├── test_full_cuda_graph.py
│       │   ├── test_mlp.py
│       │   ├── test_module.py
│       │   ├── test_multi_latent_attention.py
│       │   ├── test_multi_token_prediction.py
│       │   ├── test_mup.py
│       │   ├── test_quantization_config.py
│       │   ├── test_relative_attention.py
│       │   ├── test_rope.py
│       │   ├── test_spec_customization.py
│       │   ├── test_submodule_callables.py
│       │   ├── test_te_layers_batch_invariant.py
│       │   ├── test_thd_correctness.py
│       │   ├── test_transformer_block.py
│       │   ├── test_transformer_block_custom_pgs.py
│       │   ├── test_transformer_layer.py
│       │   ├── test_utils.py
│       │   └── test_vision_cuda_graphs.py
│       └── utils/
│           └── test_experimental_log_once.py
├── tools/
│   ├── __init__.py
│   ├── autoformat.sh
│   ├── bert_embedding/
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── embed.py
│   │   ├── external_libs.py
│   │   └── huggingface.py
│   ├── build_sequences_per_dataset.py
│   ├── check_copyright.py
│   ├── checkpoint/
│   │   ├── checkpoint_inspector.py
│   │   ├── convert.py
│   │   ├── hybrid_conversion.py
│   │   ├── loader_base.py
│   │   ├── loader_core.py
│   │   ├── loader_legacy.py
│   │   ├── loader_llama_mistral.py
│   │   ├── loader_llava.py
│   │   ├── loader_mixtral_hf.py
│   │   ├── saver_base.py
│   │   ├── saver_core.py
│   │   ├── saver_hf_llava.py
│   │   ├── saver_legacy.py
│   │   ├── saver_llava.py
│   │   ├── schema_base.py
│   │   ├── schema_core.py
│   │   ├── schema_hf.py
│   │   └── utils.py
│   ├── copyright.sh
│   ├── linter.py
│   ├── merge_datasets.py
│   ├── preprocess_data.py
│   ├── preprocess_data_nmt.py
│   ├── preprocess_mmdata.py
│   ├── report_theoretical_memory.py
│   ├── run_dynamic_text_generation_server.py
│   ├── run_inference_performance_test.py
│   ├── run_mamba_text_generation_server.py
│   ├── run_mamba_text_generation_server_completions.py
│   ├── run_text_generation_server.py
│   ├── run_vlm_text_generation.py
│   ├── text_generation_cli.py
│   ├── trigger_internal_ci.md
│   ├── trigger_internal_ci.py
│   ├── upgrade_dependencies.sh
│   └── wait_daemon.sh
└── train_rl.py