gitextract_32xrv9zn/

├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── 100-installation.yml
│   │   ├── 200-usage.yml
│   │   ├── 300-bug-report.yml
│   │   ├── 400-feature-request.yml
│   │   ├── 500-new-model.yml
│   │   ├── 600-performance-discussion.yml
│   │   ├── 700-rfc.yml
│   │   └── config.yml
│   ├── labeler.yml
│   ├── prompts/
│   │   ├── issue-triage-system.txt
│   │   └── pr-summary-system.txt
│   ├── pull_request_template.md
│   └── workflows/
│       ├── ai-issue-triage.yml
│       ├── ai-pr-summary.yml
│       ├── pr-labeler.yml
│       └── pypi_publish.yml
├── .gitignore
├── .pylintrc
├── .readthedocs.yaml
├── CODE_OF_CONDUCT.md
├── COMMITTERS.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── csrc/
│   └── dp_core.cpp
├── docs/
│   ├── en/
│   │   ├── Makefile
│   │   ├── make.bat
│   │   └── source/
│   │       ├── 1_overview/
│   │       │   └── overview.md
│   │       ├── 2_installation/
│   │       │   └── installation.md
│   │       ├── 3_quick_start/
│   │       │   └── quick_start.md
│   │       ├── 4_galvatron_model_usage/
│   │       │   └── galvatron_model_usage.md
│   │       ├── 5_search_engine_usage/
│   │       │   └── search_engine_usage.md
│   │       ├── 6_developer_guide/
│   │       │   ├── adding_a_new_model_in_galvatron.md
│   │       │   ├── contributing_guide.md
│   │       │   └── developer_guide.rst
│   │       ├── 7_visualization/
│   │       │   └── visualization.md
│   │       ├── conf.py
│   │       └── index.rst
│   ├── requirements.txt
│   └── zh_CN/
│       ├── .readthedocs.yaml
│       ├── Makefile
│       ├── make.bat
│       └── source/
│           ├── 1_overview/
│           │   └── overview_zh.md
│           ├── 2_installation/
│           │   └── installation_zh.md
│           ├── 3_quick_start/
│           │   └── quick_start_zh.md
│           ├── 4_galvatron_model_usage/
│           │   └── galvatron_model_usage_zh.md
│           ├── 5_search_engine_usage/
│           │   └── search_engine_usage_zh.md
│           ├── 6_developer_guide/
│           │   ├── adding_a_new_model_in_galvatron_zh.md
│           │   ├── contributing_guide_zh.md
│           │   └── developer_guide_zh.rst
│           ├── 7_visualization/
│           │   └── visualization_zh.md
│           ├── conf.py
│           └── index.rst
├── galvatron/
│   ├── MANIFEST.in
│   ├── __init__.py
│   ├── core/
│   │   ├── __init__.py
│   │   ├── args_schema.py
│   │   ├── arguments.py
│   │   ├── cost_model/
│   │   │   ├── __init__.py
│   │   │   ├── components/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── embedding_lmhead_cost.py
│   │   │   │   └── layer_cost.py
│   │   │   ├── cost_model_args.py
│   │   │   └── cost_model_handler.py
│   │   ├── profiler/
│   │   │   ├── __init__.py
│   │   │   ├── args_schema.py
│   │   │   ├── arguments.py
│   │   │   ├── base_profiler.py
│   │   │   ├── hardware_profiler.py
│   │   │   ├── model_profiler.py
│   │   │   ├── runtime_profiler.py
│   │   │   └── utils.py
│   │   ├── runtime/
│   │   │   ├── __init__.py
│   │   │   ├── args_schema.py
│   │   │   ├── checkpoint/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── gpt_adapter.py
│   │   │   │   ├── llama_adapter.py
│   │   │   │   └── moe_adapter.py
│   │   │   ├── comm_groups.py
│   │   │   ├── dataloader.py
│   │   │   ├── datasets/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── megatron/
│   │   │   │   │   ├── Makefile
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── blended_dataset.py
│   │   │   │   │   ├── blended_megatron_dataset_builder.py
│   │   │   │   │   ├── blended_megatron_dataset_config.py
│   │   │   │   │   ├── gpt_dataset.py
│   │   │   │   │   ├── helpers.cpp
│   │   │   │   │   ├── helpers.py
│   │   │   │   │   ├── indexed_dataset.py
│   │   │   │   │   ├── megatron_dataset.py
│   │   │   │   │   ├── megatron_tokenizer.py
│   │   │   │   │   ├── readme.md
│   │   │   │   │   ├── tokenizer.py
│   │   │   │   │   ├── utils.py
│   │   │   │   │   └── utils_s3.py
│   │   │   │   └── random_dataset.py
│   │   │   ├── hybrid_parallel_config.py
│   │   │   ├── hybrid_parallel_model.py
│   │   │   ├── initialize.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── arch.py
│   │   │   │   ├── builder.py
│   │   │   │   ├── modules.py
│   │   │   │   └── moe_modules.py
│   │   │   ├── moe/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── fused_a2a.py
│   │   │   │   ├── fused_kernels.py
│   │   │   │   ├── grouped_gemm_util.py
│   │   │   │   ├── mlp.py
│   │   │   │   ├── moe_utils.py
│   │   │   │   ├── router.py
│   │   │   │   └── token_dispatcher.py
│   │   │   ├── optimizer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── clip_grads.py
│   │   │   │   ├── num_microbatches_calculator.py
│   │   │   │   ├── param_scheduler.py
│   │   │   │   └── utils.py
│   │   │   ├── parallel.py
│   │   │   ├── parallel_state.py
│   │   │   ├── pipeline/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── grad_reduce.py
│   │   │   │   ├── pipeline.py
│   │   │   │   ├── sp_grad_reduce.py
│   │   │   │   └── utils.py
│   │   │   ├── redistribute.py
│   │   │   ├── tensor_parallel/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── layers.py
│   │   │   │   ├── mappings.py
│   │   │   │   ├── random.py
│   │   │   │   ├── reset.py
│   │   │   │   ├── triton_cross_entropy.py
│   │   │   │   └── utils.py
│   │   │   ├── transformer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention.py
│   │   │   │   ├── attention_impl.py
│   │   │   │   ├── fused_kernels.py
│   │   │   │   ├── inference.py
│   │   │   │   ├── mlp.py
│   │   │   │   ├── norm.py
│   │   │   │   ├── rope_utils.py
│   │   │   │   ├── rotary_pos_embedding.py
│   │   │   │   ├── spec_utils.py
│   │   │   │   └── utils.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── rerun_state_machine.py
│   │   │       └── utils.py
│   │   └── search_engine/
│   │       ├── __init__.py
│   │       ├── args_schema.py
│   │       ├── dynamic_programming.py
│   │       ├── search_engine.py
│   │       └── utils.py
│   ├── models/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── gpt/
│   │   │   ├── __init__.py
│   │   │   ├── configs/
│   │   │   │   ├── computation_profiling_bf16_llama2-7b_all.json
│   │   │   │   ├── computation_profiling_bf16_llama2-7b_seqlen2048_all.json
│   │   │   │   ├── galvatron_config_llama2-7b_1nodes_8gpus_per_node_36GB_bf16.json
│   │   │   │   ├── memory_profiling_bf16_llama2-7b_all.json
│   │   │   │   └── memory_profiling_bf16_llama2-7b_seqlen2048_all.json
│   │   │   ├── profiler.py
│   │   │   ├── run_train_and_log.sh
│   │   │   ├── scripts/
│   │   │   │   ├── computation_profile_scripts_all.sh
│   │   │   │   ├── memory_profile_scripts_all.sh
│   │   │   │   ├── profile_computation.sh
│   │   │   │   ├── profile_computation.yaml
│   │   │   │   ├── profile_memory.sh
│   │   │   │   ├── profile_memory.yaml
│   │   │   │   ├── profile_runtime.yaml
│   │   │   │   ├── search_dist.sh
│   │   │   │   ├── search_dist.yaml
│   │   │   │   ├── train_dist.yaml
│   │   │   │   └── train_yaml.sh
│   │   │   ├── search_dist.py
│   │   │   └── train_dist.py
│   │   ├── model_configs/
│   │   │   ├── gpt2-small.yaml
│   │   │   ├── gpt2-xl.yaml
│   │   │   ├── llama2-70b.yaml
│   │   │   ├── llama2-7b.yaml
│   │   │   ├── mistral-7b.yaml
│   │   │   ├── qwen2.5-7b.yaml
│   │   │   └── template.yaml
│   │   └── moe/
│   │       ├── scripts/
│   │       │   ├── train_dist.yaml
│   │       │   └── train_yaml.sh
│   │       └── train_dist.py
│   ├── profile_hardware/
│   │   ├── hardware_configs/
│   │   │   ├── allreduce_bandwidth_1nodes_4gpus_per_node.json
│   │   │   ├── allreduce_bandwidth_1nodes_8gpus_per_node.json
│   │   │   ├── allreduce_bandwidth_2nodes_8gpus_per_node.json
│   │   │   ├── overlap_coefficient.json
│   │   │   ├── p2p_bandwidth_1nodes_4gpus_per_node.json
│   │   │   ├── p2p_bandwidth_1nodes_8gpus_per_node.json
│   │   │   ├── p2p_bandwidth_2nodes_8gpus_per_node.json
│   │   │   └── sp_time_1nodes_8gpus_per_node.json
│   │   ├── hostfile
│   │   ├── profile_all2all.py
│   │   ├── profile_allreduce.py
│   │   ├── profile_hardware.py
│   │   ├── profile_overlap.py
│   │   ├── profile_p2p.py
│   │   └── scripts/
│   │       ├── profile_all2all_sp.sh
│   │       ├── profile_allreduce.sh
│   │       ├── profile_allreduce_sp.sh
│   │       ├── profile_hardware.sh
│   │       ├── profile_hardware.yaml
│   │       ├── profile_hardware_run_all.sh
│   │       ├── profile_overlap.sh
│   │       └── profile_p2p.sh
│   ├── scripts/
│   │   ├── flash_attn_ops_install.sh
│   │   └── prepare_env.sh
│   ├── tools/
│   │   ├── __init__.py
│   │   ├── args_schema.py
│   │   ├── checkpoint_convert_g2h.py
│   │   ├── checkpoint_convert_h2g.py
│   │   ├── convert_bert_g2h.sh
│   │   ├── convert_bert_h2g.sh
│   │   ├── convert_gpt.sh
│   │   ├── convert_llama_g2h.sh
│   │   ├── convert_llama_h2g.sh
│   │   └── convert_mixtral_h2g.sh
│   └── utils/
│       ├── __init__.py
│       ├── config_utils.py
│       ├── hf_config_adapter.py
│       ├── memory_utils.py
│       ├── print_utils.py
│       ├── strategy_utils.py
│       └── training_utils.py
├── galvatron.exp
├── pytest.ini
├── requirements.txt
├── setup.py
└── tests/
    ├── __init__.py
    ├── conftest.py
    ├── core/
    │   ├── __init__.py
    │   ├── test_ep.py
    │   ├── test_fsdp.py
    │   ├── test_hybrid.py
    │   ├── test_mixed_precision.py
    │   ├── test_pp.py
    │   ├── test_redistributed.py
    │   ├── test_tp.py
    │   └── test_utils.py
    ├── kernels/
    │   ├── __init__.py
    │   ├── test_triton_cross_entropy.py
    │   ├── test_triton_cross_entropy_debug.py
    │   ├── test_triton_cross_entropy_kernels.py
    │   └── test_triton_cross_entropy_kernels_debug.py
    ├── models/
    │   ├── __init__.py
    │   ├── configs/
    │   │   └── __init__.py
    │   ├── test_checkpoint_convert.py
    │   ├── test_dataloader.py
    │   ├── test_model_correctness.py
    │   └── test_moe_correctness.py
    ├── profiler/
    │   ├── test_hardware_profile.py
    │   ├── test_model_profile.py
    │   └── test_runtime_profile.py
    ├── search_engine/
    │   ├── test_bsz_utils.py
    │   ├── test_cost_model.py
    │   ├── test_generate_strategies.py
    │   ├── test_get_configs.py
    │   ├── test_initialize.py
    │   ├── test_parallelsim_optimization.py
    │   ├── test_pp_utils.py
    │   └── test_strategy_utils.py
    ├── test_arguments.py
    ├── utils/
    │   ├── __init__.py
    │   ├── cost_args.py
    │   ├── init_dist.py
    │   ├── model_configs/
    │   │   ├── gpt-test-256.yaml
    │   │   ├── gpt-test.yaml
    │   │   ├── gpt2-small.yaml
    │   │   ├── gpt2-xl.yaml
    │   │   ├── llama-test.yaml
    │   │   ├── llama2-70b.yaml
    │   │   ├── llama2-7b.yaml
    │   │   ├── llama2-test.yaml
    │   │   ├── mistral-7b.yaml
    │   │   ├── mixtral-test.yaml
    │   │   ├── qwen2.5-7b.yaml
    │   │   └── template.yaml
    │   ├── model_utils.py
    │   ├── parallel_config.py
    │   ├── profiler_configs.py
    │   ├── profiler_utils.py
    │   ├── runtime_args.py
    │   ├── search_args.py
    │   └── search_configs.py
    └── utils.py