gitextract_vek8qtxm/

├── .devcontainer/
│   └── devcontainer.json
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   └── bug-report.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── build-docker-images-release.yml
│       ├── build_and_run_tests.yml
│       ├── build_docker_images.yml
│       ├── build_documentation.yml
│       ├── build_pr_documentation.yml
│       ├── fp8_runner.yml
│       ├── gaudi3_scheduled.yml
│       ├── integration_tests.yml
│       ├── nightly.yml
│       ├── pr_style_bot.yml
│       ├── quality.yml
│       ├── run_merge_tests.yml
│       ├── self_hosted_integration_tests.yml
│       ├── stale.yml
│       ├── test.yml
│       ├── test_imports.yml
│       ├── trufflehog.yml
│       └── upload_pr_documentation.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── benchmarks/
│   ├── README.md
│   ├── big_model_inference/
│   │   ├── README.md
│   │   ├── big_model_inference.py
│   │   └── measures_util.py
│   ├── fp8/
│   │   ├── ms_amp/
│   │   │   ├── Dockerfile
│   │   │   ├── ddp.py
│   │   │   ├── distrib_deepspeed.py
│   │   │   ├── fp8_utils.py
│   │   │   └── non_distributed.py
│   │   ├── torchao/
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   ├── ddp.py
│   │   │   ├── distrib_deepspeed.py
│   │   │   ├── fp8_utils.py
│   │   │   ├── fsdp.py
│   │   │   └── non_distributed.py
│   │   └── transformer_engine/
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       ├── ddp.py
│   │       ├── distrib_deepspeed.py
│   │       ├── fp8_utils.py
│   │       ├── fsdp.py
│   │       └── non_distributed.py
│   ├── fsdp2/
│   │   ├── README.md
│   │   ├── main.py
│   │   ├── measure_utils.py
│   │   ├── utils.py
│   │   └── visualize.py
│   └── torch.compile/
│       ├── README.md
│       └── regional_compilation.py
├── docker/
│   ├── README.md
│   ├── accelerate-cpu/
│   │   └── Dockerfile
│   ├── accelerate-gpu/
│   │   └── Dockerfile
│   └── accelerate-gpu-deepspeed/
│       └── Dockerfile
├── docs/
│   ├── Makefile
│   ├── README.md
│   └── source/
│       ├── _toctree.yml
│       ├── basic_tutorials/
│       │   ├── execution.md
│       │   ├── install.md
│       │   ├── launch.md
│       │   ├── migration.md
│       │   ├── notebook.md
│       │   ├── overview.md
│       │   ├── tpu.md
│       │   └── troubleshooting.md
│       ├── concept_guides/
│       │   ├── big_model_inference.md
│       │   ├── context_parallelism.md
│       │   ├── deferring_execution.md
│       │   ├── fsdp1_vs_fsdp2.md
│       │   ├── fsdp_and_deepspeed.md
│       │   ├── gradient_synchronization.md
│       │   ├── internal_mechanism.md
│       │   ├── low_precision_training.md
│       │   ├── performance.md
│       │   ├── sequence_parallelism.md
│       │   └── training_tpu.md
│       ├── index.md
│       ├── package_reference/
│       │   ├── accelerator.md
│       │   ├── big_modeling.md
│       │   ├── cli.md
│       │   ├── deepspeed.md
│       │   ├── fp8.md
│       │   ├── fsdp.md
│       │   ├── inference.md
│       │   ├── kwargs.md
│       │   ├── launchers.md
│       │   ├── logging.md
│       │   ├── megatron_lm.md
│       │   ├── state.md
│       │   ├── torch_wrappers.md
│       │   ├── tracking.md
│       │   └── utilities.md
│       ├── quicktour.md
│       └── usage_guides/
│           ├── big_modeling.md
│           ├── checkpoint.md
│           ├── compilation.md
│           ├── ddp_comm_hook.md
│           ├── deepspeed.md
│           ├── deepspeed_multiple_model.md
│           ├── distributed_inference.md
│           ├── explore.md
│           ├── fsdp.md
│           ├── gaudi.md
│           ├── gradient_accumulation.md
│           ├── intel_cpu.md
│           ├── local_sgd.md
│           ├── low_precision_training.md
│           ├── megatron_lm.md
│           ├── model_size_estimator.md
│           ├── mps.md
│           ├── profiler.md
│           ├── quantization.md
│           ├── sagemaker.md
│           ├── tracking.md
│           └── training_zoo.md
├── examples/
│   ├── README.md
│   ├── alst_ulysses_sequence_parallelism/
│   │   ├── README.md
│   │   ├── sp-alst.accelerate-config.yml
│   │   ├── sp-alst.ds-config.json
│   │   ├── sp-alst.py
│   │   └── sp-alst.sh
│   ├── by_feature/
│   │   ├── README.md
│   │   ├── automatic_gradient_accumulation.py
│   │   ├── checkpointing.py
│   │   ├── cross_validation.py
│   │   ├── ddp_comm_hook.py
│   │   ├── deepspeed_with_config_support.py
│   │   ├── early_stopping.py
│   │   ├── fsdp_with_peak_mem_tracking.py
│   │   ├── gradient_accumulation.py
│   │   ├── gradient_accumulation_for_autoregressive_models.py
│   │   ├── local_sgd.py
│   │   ├── megatron_lm_gpt_pretraining.py
│   │   ├── memory.py
│   │   ├── multi_process_metrics.py
│   │   ├── profiler.py
│   │   ├── schedule_free.py
│   │   └── tracking.py
│   ├── complete_cv_example.py
│   ├── complete_nlp_example.py
│   ├── config_yaml_templates/
│   │   ├── README.md
│   │   ├── deepspeed.yaml
│   │   ├── fp8.yaml
│   │   ├── fsdp.yaml
│   │   ├── multi_gpu.yaml
│   │   ├── multi_node.yaml
│   │   ├── multi_xpu.yaml
│   │   ├── run_me.py
│   │   └── single_accelerator.yaml
│   ├── cv_example.py
│   ├── deepspeed_config_templates/
│   │   ├── zero_stage1_config.json
│   │   ├── zero_stage2_config.json
│   │   ├── zero_stage2_offload_config.json
│   │   ├── zero_stage3_config.json
│   │   └── zero_stage3_offload_config.json
│   ├── finetune_lm_tpu.py
│   ├── inference/
│   │   ├── distributed/
│   │   │   ├── README.md
│   │   │   ├── distributed_image_generation.py
│   │   │   ├── distributed_speech_generation.py
│   │   │   ├── florence2.py
│   │   │   ├── llava_next_video.py
│   │   │   ├── phi2.py
│   │   │   └── stable_diffusion.py
│   │   └── pippy/
│   │       ├── README.md
│   │       ├── bert.py
│   │       ├── gpt2.py
│   │       ├── llama.py
│   │       ├── requirements.txt
│   │       └── t5.py
│   ├── multigpu_remote_launcher.py
│   ├── nlp_example.py
│   ├── requirements.txt
│   ├── slurm/
│   │   ├── fsdp_config.yaml
│   │   ├── submit_multicpu.sh
│   │   ├── submit_multigpu.sh
│   │   ├── submit_multinode.sh
│   │   └── submit_multinode_fsdp.sh
│   └── torch_native_parallelism/
│       ├── README.md
│       ├── configs/
│       │   ├── cp.yaml
│       │   └── tp_hsdp.yaml
│       ├── fsdp2_fp8.py
│       ├── nd_parallel.py
│       ├── nd_parallel_trainer.py
│       └── utils.py
├── manim_animations/
│   ├── big_model_inference/
│   │   ├── stage_1.py
│   │   ├── stage_2.py
│   │   ├── stage_3.py
│   │   ├── stage_4.py
│   │   └── stage_5.py
│   └── dataloaders/
│       ├── stage_0.py
│       ├── stage_1.py
│       ├── stage_2.py
│       ├── stage_3.py
│       ├── stage_4.py
│       ├── stage_5.py
│       ├── stage_6.py
│       └── stage_7.py
├── pyproject.toml
├── setup.py
├── src/
│   └── accelerate/
│       ├── __init__.py
│       ├── accelerator.py
│       ├── big_modeling.py
│       ├── checkpointing.py
│       ├── commands/
│       │   ├── __init__.py
│       │   ├── accelerate_cli.py
│       │   ├── config/
│       │   │   ├── __init__.py
│       │   │   ├── cluster.py
│       │   │   ├── config.py
│       │   │   ├── config_args.py
│       │   │   ├── config_utils.py
│       │   │   ├── default.py
│       │   │   ├── sagemaker.py
│       │   │   └── update.py
│       │   ├── env.py
│       │   ├── estimate.py
│       │   ├── launch.py
│       │   ├── menu/
│       │   │   ├── __init__.py
│       │   │   ├── cursor.py
│       │   │   ├── helpers.py
│       │   │   ├── input.py
│       │   │   ├── keymap.py
│       │   │   └── selection_menu.py
│       │   ├── merge.py
│       │   ├── test.py
│       │   ├── to_fsdp2.py
│       │   ├── tpu.py
│       │   └── utils.py
│       ├── data_loader.py
│       ├── hooks.py
│       ├── inference.py
│       ├── launchers.py
│       ├── local_sgd.py
│       ├── logging.py
│       ├── memory_utils.py
│       ├── optimizer.py
│       ├── parallelism_config.py
│       ├── scheduler.py
│       ├── state.py
│       ├── test_utils/
│       │   ├── __init__.py
│       │   ├── examples.py
│       │   ├── scripts/
│       │   │   ├── __init__.py
│       │   │   ├── external_deps/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── test_checkpointing.py
│       │   │   │   ├── test_ds_alst_ulysses_sp.py
│       │   │   │   ├── test_ds_multiple_model.py
│       │   │   │   ├── test_metrics.py
│       │   │   │   ├── test_peak_memory_usage.py
│       │   │   │   ├── test_performance.py
│       │   │   │   ├── test_pippy.py
│       │   │   │   └── test_zero3_integration.py
│       │   │   ├── test_cli.py
│       │   │   ├── test_ddp_comm_hook.py
│       │   │   ├── test_distributed_data_loop.py
│       │   │   ├── test_merge_weights.py
│       │   │   ├── test_notebook.py
│       │   │   ├── test_ops.py
│       │   │   ├── test_script.py
│       │   │   └── test_sync.py
│       │   ├── testing.py
│       │   └── training.py
│       ├── tracking.py
│       └── utils/
│           ├── __init__.py
│           ├── ao.py
│           ├── bnb.py
│           ├── constants.py
│           ├── dataclasses.py
│           ├── deepspeed.py
│           ├── environment.py
│           ├── fsdp_utils.py
│           ├── imports.py
│           ├── launch.py
│           ├── megatron_lm.py
│           ├── memory.py
│           ├── modeling.py
│           ├── offload.py
│           ├── operations.py
│           ├── other.py
│           ├── random.py
│           ├── rich.py
│           ├── torch_xla.py
│           ├── tqdm.py
│           ├── transformer_engine.py
│           └── versions.py
├── tests/
│   ├── __init__.py
│   ├── deepspeed/
│   │   ├── ds_config_zero2.json
│   │   ├── ds_config_zero2_model_only.json
│   │   ├── ds_config_zero3.json
│   │   ├── ds_config_zero3_model_only.json
│   │   ├── test_alst_ulysses_sp.py
│   │   ├── test_deepspeed.py
│   │   ├── test_deepspeed_gradient_accumulation.py
│   │   └── test_deepspeed_multiple_model.py
│   ├── fsdp/
│   │   └── test_fsdp.py
│   ├── test_accelerator.py
│   ├── test_big_modeling.py
│   ├── test_cli.py
│   ├── test_compile.py
│   ├── test_configs/
│   │   ├── 0_11_0.yaml
│   │   ├── 0_12_0.yaml
│   │   ├── 0_28_0_mpi.yaml
│   │   ├── 0_30_0_sagemaker.yaml
│   │   ├── 0_34_0_fp8.yaml
│   │   ├── README.md
│   │   ├── invalid_keys.yaml
│   │   ├── latest.yaml
│   │   ├── latest_fsdp.yaml
│   │   └── validate_launch_cmd.yaml
│   ├── test_cpu.py
│   ├── test_data_loader.py
│   ├── test_dataclasses.py
│   ├── test_examples.py
│   ├── test_fp8.py
│   ├── test_grad_sync.py
│   ├── test_hooks.py
│   ├── test_imports.py
│   ├── test_kwargs_handlers.py
│   ├── test_launch.py
│   ├── test_load_checkpoint_and_dispatch_with_broadcast.py
│   ├── test_logging.py
│   ├── test_memory_utils.py
│   ├── test_metrics.py
│   ├── test_modeling_utils.py
│   ├── test_multidevice.py
│   ├── test_offload.py
│   ├── test_optimizer.py
│   ├── test_quantization.py
│   ├── test_sagemaker.py
│   ├── test_samples/
│   │   ├── MRPC/
│   │   │   ├── dev.csv
│   │   │   └── train.csv
│   │   └── test_command_file.sh
│   ├── test_scheduler.py
│   ├── test_state_checkpointing.py
│   ├── test_tpu.py
│   ├── test_tracking.py
│   ├── test_utils.py
│   ├── tp/
│   │   ├── fsdp2_tp_preparation.py
│   │   ├── fsdp2_tp_preparation_config.yaml
│   │   └── test_tp.py
│   └── xla_spawn.py
└── utils/
    ├── log_reports.py
    └── stale.py